Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/aoti/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <executorch/runtime/platform/log.h>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace executorch {
namespace backends {
Expand Down
10 changes: 7 additions & 3 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,13 @@ find_package_torch()

# CUDA-specific AOTI functionality
set(_aoti_cuda_sources
runtime/cuda_backend.cpp runtime/shims/memory.cpp
runtime/shims/tensor_attribute.cpp runtime/guard.cpp
runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
runtime/cuda_backend.cpp
runtime/shims/memory.cpp
runtime/shims/tensor_attribute.cpp
runtime/guard.cpp
runtime/shims/cuda_guard.cpp
runtime/shims/int4mm.cu
runtime/platform/platform.cpp
)
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
target_include_directories(
Expand Down
39 changes: 20 additions & 19 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@
*/

#include <cuda_runtime.h>
#include <dlfcn.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <unistd.h>
#include <cstdio>

#include <filesystem>
Expand All @@ -23,16 +21,19 @@
// Include our shim layer headers
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <executorch/backends/aoti/common_shims.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>

namespace executorch::backends::cuda {

#define LOAD_SYMBOL(handle, member, name, so_handle) \
do { \
handle->member = reinterpret_cast<name##Func>(dlsym(so_handle, #name)); \
ET_CHECK_OR_RETURN_ERROR( \
handle->member != nullptr, AccessFailed, "Failed to load " #name); \
#define LOAD_SYMBOL(handle, member, name, so_handle) \
do { \
auto symbol_res = get_function(so_handle, #name); \
if (!symbol_res.ok()) { \
return symbol_res.error(); \
} \
handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
} while (0)

using namespace std;
Expand Down Expand Up @@ -122,10 +123,10 @@ class ET_EXPERIMENTAL CudaBackend final
// Generate dynamic temporary file path
filesystem::path temp_dir = filesystem::temp_directory_path();
filesystem::path so_path =
temp_dir / (so_blob_key + to_string(getpid()) + ".so");
temp_dir / (so_blob_key + to_string(get_process_id()) + ".so");

// Create a temporary file
ofstream outfile(so_path.c_str(), ios::binary);
ofstream outfile(so_path, ios::binary);

// Write the ELF buffer to the temporary file
ET_LOG(
Expand All @@ -144,24 +145,23 @@ class ET_EXPERIMENTAL CudaBackend final
// Finish writing the file to disk
outfile.close();

// Load the ELF using dlopen
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
ET_CHECK_OR_RETURN_ERROR(
so_handle != nullptr,
AccessFailed,
"Failed to load shared library: %s",
dlerror());
// Load the lib
Result<void*> lib_handle_res = load_library(so_path);
if (!lib_handle_res.ok()) {
return lib_handle_res.error();
}
void* lib_handle = lib_handle_res.get();

processed->Free();

// Create handle and load function pointers into it
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
handle->so_handle = so_handle;
handle->so_handle = lib_handle;
handle->so_path = so_path.string();

// Load function pointers specific to this handle's shared library
ET_CHECK_OK_OR_RETURN_ERROR(
load_function_pointers_into_handle(so_handle, handle));
load_function_pointers_into_handle(lib_handle, handle));

AOTInductorModelContainerHandle container_handle = nullptr;

Expand Down Expand Up @@ -332,8 +332,9 @@ class ET_EXPERIMENTAL CudaBackend final
// AOTInductorModelContainerDelete(handle->container_handle);

// Now close the shared library
auto err = Error::Ok;
if (handle->so_handle != nullptr) {
dlclose(handle->so_handle);
err = close_library(handle->so_handle);
}

// Remove the temporary shared library file
Expand Down
125 changes: 125 additions & 0 deletions backends/cuda/runtime/platform/platform.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/log.h>
#include <filesystem>
#include <string>

#ifdef _WIN32
#include <malloc.h>
#include <windows.h>
#else // Posix
#include <dlfcn.h>
#include <unistd.h>
#include <cstdlib>
#endif

namespace executorch {
namespace backends {
namespace cuda {

/// Loads the shared library at `path`.
///
/// @param path Filesystem path to the library (.so / .dll).
/// @return Opaque library handle on success; Error::AccessFailed if the
///         library could not be loaded (details are written to the error log).
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path) {
#ifdef _WIN32
  // Use the wide-character API explicitly: on Windows path::c_str() is
  // wchar_t*, so this handles non-ASCII paths regardless of the process ANSI
  // code page, and it compiles the same whether or not UNICODE is defined
  // (the plain LoadLibrary macro expands to LoadLibraryW or LoadLibraryA
  // depending on that setting).
  HMODULE lib_handle = LoadLibraryW(path.c_str());
  if (lib_handle == NULL) {
    ET_LOG(
        Error,
        "Failed to load %s with error: %lu",
        path.string().c_str(),
        GetLastError());
    return executorch::runtime::Error::AccessFailed;
  }
  return static_cast<void*>(lib_handle);
#else
  const std::string path_str = path.string();
  void* lib_handle = dlopen(path_str.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (lib_handle == nullptr) {
    ET_LOG(
        Error, "Failed to load %s with error: %s", path_str.c_str(), dlerror());
    return executorch::runtime::Error::AccessFailed;
  }
  return lib_handle;
#endif
}

/// Closes a library handle previously returned by load_library().
///
/// @param lib_handle Handle to close; must come from load_library().
/// @return Error::Ok on success; Error::Internal if the platform unload
///         call fails (details are written to the error log).
executorch::runtime::Error close_library(void* lib_handle) {
#ifdef _WIN32
  if (!FreeLibrary((HMODULE)lib_handle)) {
    // Route through ET_LOG like every other error path in this file instead
    // of printf-to-stdout, so the failure is visible via the runtime logger.
    ET_LOG(Error, "FreeLibrary failed with error %lu", GetLastError());
    return executorch::runtime::Error::Internal;
  }
#else
  // ET_LOG appends its own newline; no trailing "\n" in the format string.
  if (dlclose(lib_handle) != 0) {
    ET_LOG(Error, "dlclose failed: %s", dlerror());
    return executorch::runtime::Error::Internal;
  }
#endif
  return executorch::runtime::Error::Ok;
}

/// Looks up the symbol `fn_name` in an open library handle.
///
/// @param lib_handle Handle returned by load_library().
/// @param fn_name Unmangled symbol name to resolve.
/// @return Symbol address on success; Error::Internal if the symbol is not
///         found (details are written to the error log).
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name) {
#ifdef _WIN32
  FARPROC fn = GetProcAddress((HMODULE)lib_handle, fn_name.c_str());
  if (fn == nullptr) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %lu",
        fn_name.c_str(),
        GetLastError());
    return executorch::runtime::Error::Internal;
  }
  // Converting a function pointer to void* is conditionally-supported in ISO
  // C++, but it is the documented, supported GetProcAddress usage pattern on
  // Windows, where data and function pointers have the same representation.
  return reinterpret_cast<void*>(fn);
#else
  // POSIX dlsym returns a void* directly, and POSIX guarantees the
  // round-trip back to a function pointer is valid.
  void* fn = dlsym(lib_handle, fn_name.c_str());
  if (fn == nullptr) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %s",
        fn_name.c_str(),
        dlerror());
    return executorch::runtime::Error::Internal;
  }
  return fn;
#endif
}

/// Returns the current process identifier (GetCurrentProcessId on Windows,
/// getpid elsewhere); used by the backend to build unique temp-file names.
int32_t get_process_id() {
#ifdef _WIN32
  return static_cast<int32_t>(GetCurrentProcessId());
#else
  return static_cast<int32_t>(getpid());
#endif
}

void* aligned_alloc(size_t alignment, size_t size) {
#ifdef _WIN32
return _aligned_malloc(size, alignment);
#else
return std::aligned_alloc(alignment, size);
#endif
}

// Releases memory obtained from aligned_alloc(). Must always be paired with
// it: on Windows the memory comes from _aligned_malloc and cannot legally be
// passed to free(). Safe to call with nullptr (both branches accept it).
void aligned_free(void* ptr) {
#ifdef _WIN32
  _aligned_free(ptr);
#else
  std::free(ptr);
#endif
}

} // namespace cuda
} // namespace backends
} // namespace executorch
38 changes: 38 additions & 0 deletions backends/cuda/runtime/platform/platform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <string>

namespace executorch {
namespace backends {
namespace cuda {

// Cross-platform (Windows / POSIX) dynamic-library and aligned-memory
// helpers used by the CUDA AOTI backend in place of direct
// dlopen/LoadLibrary and posix_memalign calls.

/// Loads the shared library at `path`.
/// @return Opaque handle on success; Error::AccessFailed on failure.
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path);

/// Closes a handle returned by load_library().
/// @return Error::Ok on success; Error::Internal on failure.
executorch::runtime::Error close_library(void* lib_handle);

/// Resolves the symbol `fn_name` in `lib_handle`.
/// @return Symbol address on success; Error::Internal if not found.
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name);

/// Current process id (GetCurrentProcessId / getpid).
int32_t get_process_id();

/// Allocates `size` bytes aligned to `alignment` (a power of two).
/// Release with aligned_free(), never plain free(): on Windows the memory
/// comes from _aligned_malloc.
void* aligned_alloc(size_t alignment, size_t size);

/// Frees memory obtained from aligned_alloc(). nullptr is a no-op.
void aligned_free(void* ptr);

} // namespace cuda
} // namespace backends
} // namespace executorch
12 changes: 4 additions & 8 deletions backends/cuda/runtime/shims/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

#include <executorch/backends/aoti/common_shims.h>
#include <executorch/backends/aoti/utils.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
#include <executorch/backends/cuda/runtime/utils.h>
#include <executorch/runtime/platform/log.h>
#include <cstdint>
#include <cstdlib> // For posix_memalign
#include <memory>
#include <unordered_map>
#include <unordered_set>
Expand Down Expand Up @@ -230,15 +230,11 @@ AOTITorchError aoti_torch_empty_strided(
cudaMallocAsync(&ptr, static_cast<size_t>(nbytes), cudaStreamDefault));
} else if (device_type == static_cast<int32_t>(SupportedDevices::CPU)) {
// Ensure 16-byte alignment for CPU memory to match CUDA requirements
int result = posix_memalign(&ptr, 16, nbytes);
ET_CHECK_OR_RETURN_ERROR(
result == 0,
MemoryAllocationFailed,
"Failed to allocate aligned CPU memory");
ptr = aligned_alloc(16, nbytes);
ET_CHECK_OR_RETURN_ERROR(
ptr != nullptr,
MemoryAllocationFailed,
"Failed to call posix_memalign");
"Failed to allocate aligned CPU memory");
} else {
ET_CHECK_OR_RETURN_ERROR(
false,
Expand Down Expand Up @@ -339,7 +335,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
Internal,
"Expected host memory but got managed!")
// This is CPU memory - free immediately
free(data_ptr);
aligned_free(data_ptr);
data_ptr = nullptr;
}

Expand Down
Loading