Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Merged
merged 2 commits into from
Jul 16, 2024

Conversation

mcourteaux
Copy link
Contributor

When, for some reason, an allocation group holding the fused storage of multiple Funcs that were originally intended to go in GPUShared gets lifted out of the GPU-block loops and sits in Heap memory instead, the profiling injection logic assumed that this buffer came from a Func with the same name. As a result, the buffer was incorrectly determined to be on the stack, because the profiling logic ignored the custom_new and custom_delete attributes of the Allocate node.

Consider this example (also included as a new test):

#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"};

    Func f1{"f1"}, f2{"f2"};
    f1(x, y) = cast<float>(x + y);
    f2(x, y) = f1(x, y) * 2;

    Func result{"result"};
    result(x, y) = f2(x, y);

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    result
        .compute_root()
        .gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
        .reorder(xi, yi, xo, yo)
        ;

    f2.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    f1.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    result.print_loop_nest();


    t.set_feature(Target::Profile); // Make sure profiling is enabled!
    result.compile_jit(t);

    printf("Success!\n");
    return 0;
}

Produces the following Stmt right before the Profiling pass:

assert(reinterpret<uint64>((struct halide_buffer_t *)result.buffer) != (uint64)0, halide_error_buffer_argument_is_null("result"))
let result = (void *)_halide_buffer_get_host((struct halide_buffer_t *)result.buffer)
let result.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)result.buffer)
let result.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)result.buffer)
let result.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 0)
let result.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 0)
let result.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 0)
let result.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 1)
let result.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 1)
let result.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 1)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)result.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)result.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 2, (struct halide_dimension_t *)make_struct((min(result.extent.0, 16) + result.min.0) + -16, max(result.extent.0, 16), 1, 0, (min(result.extent.1, 16) + result.min.1) + -16, max(result.extent.1, 16), max(result.extent.0, 16), 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 assert(result.type == (uint32)73730, halide_error_bad_type("Output buffer result", result.type, (uint32)73730))
 assert(result.dimensions == 2, halide_error_bad_dimensions("Output buffer result", result.dimensions, 2))
 assert((16 <= result.extent.0) && (((max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -16) <= (result.extent.0 + result.min.0)), halide_error_access_out_of_bounds("Output buffer result", 0, (min(result.extent.0, 16) + result.min.0) + -16, (max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -17, result.min.0, (result.extent.0 + result.min.0) + -1))
 assert((16 <= result.extent.1) && (((max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -16) <= (result.extent.1 + result.min.1)), halide_error_access_out_of_bounds("Output buffer result", 1, (min(result.extent.1, 16) + result.min.1) + -16, (max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -17, result.min.1, (result.extent.1 + result.min.1) + -1))
 assert(result.stride.0 == 1, halide_error_constraint_violated("result.stride.0", result.stride.0, "1", 1))
 let result.total_extent.1 = int64(result.extent.1)*int64(result.extent.0)
 assert((uint64)abs(int64(result.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(result.extent.1)*int64(result.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.1)*int64(result.stride.1)), (uint64)2147483647))
 assert(result.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("result", result.total_extent.1, (int64)2147483647))
 profiling_enable_instance_marker()
 produce result {
  let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)result.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
  allocate allocgroup__f1$0.0__f2$0.1.buffer[float32 * 1]
   custom_new { let t13 = (struct halide_dimension_t *)make_struct(0, 512, 1, 0, 0, (result.extent.0 + 15)/16, 512, 0, 0, (result.extent.1 + 15)/16, ((result.extent.0 + 15)/16)*512, 0) in (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)alloca(size_of_halide_buffer_t()), t13, reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 3, t13, (uint64)0) }
   custom_delete { halide_device_free_as_destructor(allocgroup__f1$0.0__f2$0.1.buffer); }
  let t14 = halide_device_malloc(allocgroup__f1$0.0__f2$0.1.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(t14 == 0, t14)
  let allocgroup__f1$0.0__f2$0.1 = (void *)_halide_buffer_get_device(allocgroup__f1$0.0__f2$0.1.buffer)
  gpu_block<CUDA> (result.s0.y.yo.block_id_y, 0, (result.extent.1 + 15)/16) {
   gpu_block<CUDA> (result.s0.x.xo.block_id_x, 0, (result.extent.0 + 15)/16) {
    gpu_thread<CUDA> (.thread_id_y, 0, 16) {
     gpu_thread<CUDA> (.thread_id_x, 0, 16) {
      let result.s0.y.yi.base.s = min(result.s0.y.yo.block_id_y*16, result.extent.1 + -16)
      let result.s0.x.xi.base.s = min(result.s0.x.xo.block_id_x*16, result.extent.0 + -16)
      produce f1$0 {
       allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)] = float32((((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x) + ((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)))
      }
      gpu_thread_barrier(1)
      produce f2$0 {
       consume f1$0 {
        allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256] = allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)]*2.000000f
       }
      }
      gpu_thread_barrier(1)
      consume f2$0 {
       result[((((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)*result.stride.1) + ((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x)) - ((result.min.1*result.stride.1) + result.min.0)] = allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256]
      }
     }
    }
   }
  }
  free allocgroup__f1$0.0__f2$0.1.buffer
  _halide_buffer_set_device_dirty((struct halide_buffer_t *)result.buffer, (uint1)1)
 }
}

Notice how the allocgroup__f1$0.0__f2$0.1.buffer is outside of the outermost GPU-block loop. When this buffer didn't get lifted out of the kernel, Profiling wasn't an issue, as the profiler doesn't traverse the IR into GPU loops.

The offending line was:

Function func = lookup_function(op->name);

This lookup runs when instrumenting the Allocate node. As a result, the node is incorrectly determined to have on_stack=true.

This PR checks whether the Allocate node has a custom_new and, if so, overrides on_stack to false.

@abadams I wonder if we can't simply rely on Allocate::MemoryType to determine on_stack, or is that still Auto at that moment?

@mcourteaux mcourteaux force-pushed the fix-gpu-alloc-group-profiling branch from baed7a9 to 8095af6 Compare June 28, 2024 11:11
src/Profiling.cpp Outdated Show resolved Hide resolved
@steven-johnson
Copy link
Contributor

Windows failure looks real?

@mcourteaux
Copy link
Contributor Author

I don't know... Doesn't look like it's related at all?

[92/4223] cmd.exe /C "cd /D C:\build_bot\worker\halide-testbranch-main-llvm19-x86-64-windows-cmake\halide-build\src\runtime && C:\build_bot\worker\llvm-19-x86-64-windows\llvm-install\bin\clang.exe -O3 -std=c++17 -ffreestanding -fno-blocks -fno-exceptions -fno-unwind-tables -fno-vectorize -fno-threadsafe-statics -fno-rtti -Wall -Wc++20-designator -Wcast-qual -Werror -Wignored-qualifiers -Wno-comment -Wno-psabi -Wno-unknown-warning-option -Wno-unused-function -Wvla -Wsign-compare -Wimplicit-fallthrough -fpic -g -DDEBUG_RUNTIME -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -DHALIDE_VERSION=18.0.0 -DHALIDE_VERSION_MAJOR=18 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -m32 -target le32-unknown-nacl-unknown -emit-llvm -S -MD -MF initmod.alignment_32_32_debug.d -o initmod.alignment_32_32_debug.ll C:\build_bot\worker\halide-testbranch-main-llvm19-x86-64-windows-cmake\halide-source\src\runtime\alignment_32.cpp && C:\Python311\Lib\site-packages\cmake\data\bin\cmake.exe -E cmake_transform_depfile Ninja gccdepfile C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-source C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-source/src/runtime C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/src/runtime C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/src/runtime/initmod.alignment_32_32_debug.d C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/CMakeFiles/d/179c2da0b3b79af46befa8fa914970247bbecd012831c182921e39862d0fbbd6.d"
FAILED: src/runtime/initmod.alignment_32_32_debug.ll C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/src/runtime/initmod.alignment_32_32_debug.ll 
cmd.exe /C "cd /D C:\build_bot\worker\halide-testbranch-main-llvm19-x86-64-windows-cmake\halide-build\src\runtime && C:\build_bot\worker\llvm-19-x86-64-windows\llvm-install\bin\clang.exe -O3 -std=c++17 -ffreestanding -fno-blocks -fno-exceptions -fno-unwind-tables -fno-vectorize -fno-threadsafe-statics -fno-rtti -Wall -Wc++20-designator -Wcast-qual -Werror -Wignored-qualifiers -Wno-comment -Wno-psabi -Wno-unknown-warning-option -Wno-unused-function -Wvla -Wsign-compare -Wimplicit-fallthrough -fpic -g -DDEBUG_RUNTIME -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -DHALIDE_VERSION=18.0.0 -DHALIDE_VERSION_MAJOR=18 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -m32 -target le32-unknown-nacl-unknown -emit-llvm -S -MD -MF initmod.alignment_32_32_debug.d -o initmod.alignment_32_32_debug.ll C:\build_bot\worker\halide-testbranch-main-llvm19-x86-64-windows-cmake\halide-source\src\runtime\alignment_32.cpp && C:\Python311\Lib\site-packages\cmake\data\bin\cmake.exe -E cmake_transform_depfile Ninja gccdepfile C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-source C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-source/src/runtime C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/src/runtime C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/src/runtime/initmod.alignment_32_32_debug.d C:/build_bot/worker/halide-testbranch-main-llvm19-x86-64-windows-cmake/halide-build/CMakeFiles/d/179c2da0b3b79af46befa8fa914970247bbecd012831c182921e39862d0fbbd6.d"
error: unknown target triple 'le32-unknown-nacl-unknown'

@mcourteaux mcourteaux mentioned this pull request Jul 16, 2024
@steven-johnson
Copy link
Contributor

No, totally unrelated, go ahead and land this

@steven-johnson steven-johnson merged commit a05f459 into halide:main Jul 16, 2024
18 of 19 checks passed
@steven-johnson
Copy link
Contributor

Does this error look like it could be related to this change? https://buildbot.halide-lang.org/master/#/builders/102/builds/448/steps/12/logs/stdio

@mcourteaux
Copy link
Contributor Author

mcourteaux commented Jul 17, 2024

Does this error look like it could be related to this change? https://buildbot.halide-lang.org/master/#/builders/102/builds/448/steps/12/logs/stdio

I'll run this test on my MacBook and see if I can figure out what's up. I doubt that this is related to this PR tho.

@mcourteaux
Copy link
Contributor Author

mcourteaux commented Jul 17, 2024

Could not reproduce with current top-of-tree of Halide with LLVM 18.1.7 (I do realize the buildbot was using LLVM 19, but I don't have that one installed right now...):

~/zec/3rd/halide/tutorial main*
❯ MTL_DEBUG_LAYER=1 HL_JIT_TARGET=host-metal-debug HL_TARGET=host-metal-debug ../build/tutorial/lesson_12_using_the_gpu
Running pipeline on CPU:
Running pipeline on GPU:
2024-07-17 21:58:26.609 lesson_12_using_the_gpu[66079:2010606] Metal API Validation Enabled
Target: x86-64-osx-avx-avx2-f16c-fma-metal-sse41
Testing GPU correctness:
Testing performance on CPU:
1.6190 milliseconds
Testing performance on GPU:
3.3234 milliseconds

Will try again with a debug build. Update: same for debug build.

@steven-johnson
Copy link
Contributor

Did you try it with METAL_DEVICE_WRAPPER_TYPE=1 ?

@steven-johnson
Copy link
Contributor

Definitely repros locally for me with LLVM 19

@mcourteaux
Copy link
Contributor Author

METAL_DEVICE_WRAPPER_TYPE=1

I did now, and it still doesn't reproduce (I'm under LLVM 18 of course, still).

@mcourteaux
Copy link
Contributor Author

Did you revert this PR locally and see if it changes? I'd be surprised if it is related actually.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants