Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/aoti/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <executorch/runtime/platform/log.h>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace executorch {
namespace backends {
Expand Down
10 changes: 7 additions & 3 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,13 @@ find_package_torch()

# CUDA-specific AOTI functionality
set(_aoti_cuda_sources
runtime/cuda_backend.cpp runtime/shims/memory.cpp
runtime/shims/tensor_attribute.cpp runtime/guard.cpp
runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
runtime/cuda_backend.cpp
runtime/shims/memory.cpp
runtime/shims/tensor_attribute.cpp
runtime/guard.cpp
runtime/shims/cuda_guard.cpp
runtime/shims/int4mm.cu
runtime/platform/platform.cpp
)
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
target_include_directories(
Expand Down
39 changes: 20 additions & 19 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@
*/

#include <cuda_runtime.h>
#include <dlfcn.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <unistd.h>
#include <cstdio>

#include <filesystem>
Expand All @@ -23,16 +21,19 @@
// Include our shim layer headers
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <executorch/backends/aoti/common_shims.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>

namespace executorch::backends::cuda {

#define LOAD_SYMBOL(handle, member, name, so_handle) \
do { \
handle->member = reinterpret_cast<name##Func>(dlsym(so_handle, #name)); \
ET_CHECK_OR_RETURN_ERROR( \
handle->member != nullptr, AccessFailed, "Failed to load " #name); \
#define LOAD_SYMBOL(handle, member, name, so_handle) \
do { \
auto symbol_res = get_function(so_handle, #name); \
if (!symbol_res.ok()) { \
return symbol_res.error(); \
} \
handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
} while (0)

using namespace std;
Expand Down Expand Up @@ -122,10 +123,10 @@ class ET_EXPERIMENTAL CudaBackend final
// Generate dynamic temporary file path
filesystem::path temp_dir = filesystem::temp_directory_path();
filesystem::path so_path =
temp_dir / (so_blob_key + to_string(getpid()) + ".so");
temp_dir / (so_blob_key + to_string(get_process_id()) + ".so");

// Create a temporary file
ofstream outfile(so_path.c_str(), ios::binary);
ofstream outfile(so_path, ios::binary);

// Write the ELF buffer to the temporary file
ET_LOG(
Expand All @@ -144,24 +145,23 @@ class ET_EXPERIMENTAL CudaBackend final
// Finish writing the file to disk
outfile.close();

// Load the ELF using dlopen
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
ET_CHECK_OR_RETURN_ERROR(
so_handle != nullptr,
AccessFailed,
"Failed to load shared library: %s",
dlerror());
// Load the lib
Result<void*> lib_handle_res = load_library(so_path);
if (!lib_handle_res.ok()) {
return lib_handle_res.error();
}
void* lib_handle = lib_handle_res.get();

processed->Free();

// Create handle and load function pointers into it
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
handle->so_handle = so_handle;
handle->so_handle = lib_handle;
handle->so_path = so_path.string();

// Load function pointers specific to this handle's shared library
ET_CHECK_OK_OR_RETURN_ERROR(
load_function_pointers_into_handle(so_handle, handle));
load_function_pointers_into_handle(lib_handle, handle));

AOTInductorModelContainerHandle container_handle = nullptr;

Expand Down Expand Up @@ -332,8 +332,9 @@ class ET_EXPERIMENTAL CudaBackend final
// AOTInductorModelContainerDelete(handle->container_handle);

// Now close the shared library
auto err = Error::Ok;
if (handle->so_handle != nullptr) {
dlclose(handle->so_handle);
err = close_library(handle->so_handle);
}

// Remove the temporary shared library file
Expand Down
125 changes: 125 additions & 0 deletions backends/cuda/runtime/platform/platform.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/log.h>
#include <filesystem>
#include <string>

#ifdef _WIN32
#include <malloc.h>
#include <windows.h>
#else // Posix
#include <dlfcn.h>
#include <unistd.h>
#include <cstdlib>
#endif

namespace executorch {
namespace backends {
namespace cuda {

/// Loads the shared library at `path`.
///
/// @param path Filesystem path to the library (.so / .dll).
/// @return Opaque library handle on success; Error::AccessFailed if the
///         library could not be loaded (details are written to the error log).
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path) {
#ifdef _WIN32
  // Use the wide-character API explicitly: on Windows path::c_str() is
  // wchar_t*, so this handles non-ASCII paths regardless of the process ANSI
  // code page, and it compiles the same whether or not UNICODE is defined
  // (the plain LoadLibrary macro expands to LoadLibraryW or LoadLibraryA
  // depending on that setting).
  HMODULE lib_handle = LoadLibraryW(path.c_str());
  if (lib_handle == NULL) {
    ET_LOG(
        Error,
        "Failed to load %s with error: %lu",
        path.string().c_str(),
        GetLastError());
    return executorch::runtime::Error::AccessFailed;
  }
  return static_cast<void*>(lib_handle);
#else
  const std::string path_str = path.string();
  void* lib_handle = dlopen(path_str.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (lib_handle == nullptr) {
    ET_LOG(
        Error, "Failed to load %s with error: %s", path_str.c_str(), dlerror());
    return executorch::runtime::Error::AccessFailed;
  }
  return lib_handle;
#endif
}

/// Closes a library handle previously returned by load_library().
///
/// @param lib_handle Handle to close; must come from load_library().
/// @return Error::Ok on success; Error::Internal if the platform unload
///         call fails (details are written to the error log).
executorch::runtime::Error close_library(void* lib_handle) {
#ifdef _WIN32
  if (!FreeLibrary((HMODULE)lib_handle)) {
    // Route through ET_LOG like every other error path in this file instead
    // of printf-to-stdout, so the failure is visible via the runtime logger.
    ET_LOG(Error, "FreeLibrary failed with error %lu", GetLastError());
    return executorch::runtime::Error::Internal;
  }
#else
  // ET_LOG appends its own newline; no trailing "\n" in the format string.
  if (dlclose(lib_handle) != 0) {
    ET_LOG(Error, "dlclose failed: %s", dlerror());
    return executorch::runtime::Error::Internal;
  }
#endif
  return executorch::runtime::Error::Ok;
}

/// Looks up the symbol `fn_name` in an open library handle.
///
/// @param lib_handle Handle returned by load_library().
/// @param fn_name Unmangled symbol name to resolve.
/// @return Symbol address on success; Error::Internal if the symbol is not
///         found (details are written to the error log).
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name) {
#ifdef _WIN32
  FARPROC fn = GetProcAddress((HMODULE)lib_handle, fn_name.c_str());
  if (fn == nullptr) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %lu",
        fn_name.c_str(),
        GetLastError());
    return executorch::runtime::Error::Internal;
  }
  // Converting a function pointer to void* is conditionally-supported in ISO
  // C++, but it is the documented, supported GetProcAddress usage pattern on
  // Windows, where data and function pointers have the same representation.
  return reinterpret_cast<void*>(fn);
#else
  // POSIX dlsym returns a void* directly, and POSIX guarantees the
  // round-trip back to a function pointer is valid.
  void* fn = dlsym(lib_handle, fn_name.c_str());
  if (fn == nullptr) {
    ET_LOG(
        Error,
        "Failed loading symbol %s with error %s",
        fn_name.c_str(),
        dlerror());
    return executorch::runtime::Error::Internal;
  }
  return fn;
#endif
}

/// Returns the current process identifier (GetCurrentProcessId on Windows,
/// getpid elsewhere); used by the backend to build unique temp-file names.
int32_t get_process_id() {
#ifdef _WIN32
  return static_cast<int32_t>(GetCurrentProcessId());
#else
  return static_cast<int32_t>(getpid());
#endif
}

void* aligned_alloc(size_t alignment, size_t size) {
#ifdef _WIN32
return _aligned_malloc(size, alignment);
#else
return std::aligned_alloc(alignment, size);
#endif
}

// Releases memory obtained from aligned_alloc(). Must always be paired with
// it: on Windows the memory comes from _aligned_malloc and cannot legally be
// passed to free(). Safe to call with nullptr (both branches accept it).
void aligned_free(void* ptr) {
#ifdef _WIN32
  _aligned_free(ptr);
#else
  std::free(ptr);
#endif
}

} // namespace cuda
} // namespace backends
} // namespace executorch
38 changes: 38 additions & 0 deletions backends/cuda/runtime/platform/platform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <string>

namespace executorch {
namespace backends {
namespace cuda {

// Cross-platform (Windows / POSIX) dynamic-library and aligned-memory
// helpers used by the CUDA AOTI backend in place of direct
// dlopen/LoadLibrary and posix_memalign calls.

/// Loads the shared library at `path`.
/// @return Opaque handle on success; Error::AccessFailed on failure.
executorch::runtime::Result<void*> load_library(
    const std::filesystem::path& path);

/// Closes a handle returned by load_library().
/// @return Error::Ok on success; Error::Internal on failure.
executorch::runtime::Error close_library(void* lib_handle);

/// Resolves the symbol `fn_name` in `lib_handle`.
/// @return Symbol address on success; Error::Internal if not found.
executorch::runtime::Result<void*> get_function(
    void* lib_handle,
    const std::string& fn_name);

/// Current process id (GetCurrentProcessId / getpid).
int32_t get_process_id();

/// Allocates `size` bytes aligned to `alignment` (a power of two).
/// Release with aligned_free(), never plain free(): on Windows the memory
/// comes from _aligned_malloc.
void* aligned_alloc(size_t alignment, size_t size);

/// Frees memory obtained from aligned_alloc(). nullptr is a no-op.
void aligned_free(void* ptr);

} // namespace cuda
} // namespace backends
} // namespace executorch
12 changes: 4 additions & 8 deletions backends/cuda/runtime/shims/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

#include <executorch/backends/aoti/common_shims.h>
#include <executorch/backends/aoti/utils.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
#include <executorch/backends/cuda/runtime/utils.h>
#include <executorch/runtime/platform/log.h>
#include <cstdint>
#include <cstdlib> // For posix_memalign
#include <memory>
#include <unordered_map>
#include <unordered_set>
Expand Down Expand Up @@ -230,15 +230,11 @@ AOTITorchError aoti_torch_empty_strided(
cudaMallocAsync(&ptr, static_cast<size_t>(nbytes), cudaStreamDefault));
} else if (device_type == static_cast<int32_t>(SupportedDevices::CPU)) {
// Ensure 16-byte alignment for CPU memory to match CUDA requirements
int result = posix_memalign(&ptr, 16, nbytes);
ET_CHECK_OR_RETURN_ERROR(
result == 0,
MemoryAllocationFailed,
"Failed to allocate aligned CPU memory");
ptr = aligned_alloc(16, nbytes);
ET_CHECK_OR_RETURN_ERROR(
ptr != nullptr,
MemoryAllocationFailed,
"Failed to call posix_memalign");
"Failed to allocate aligned CPU memory");
} else {
ET_CHECK_OR_RETURN_ERROR(
false,
Expand Down Expand Up @@ -339,7 +335,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
Internal,
"Expected host memory but got managed!")
// This is CPU memory - free immediately
free(data_ptr);
aligned_free(data_ptr);
data_ptr = nullptr;
}

Expand Down
Loading