Update

manuelcandales · manuelcandales · commit 6a6ba0470291 · 2025-10-15T15:32:06.000-04:00
[ghstack-poisoned]
diff --git a/backends/apple/metal/runtime/shims/et_metal.h b/backends/apple/metal/runtime/shims/et_metal.h
@@ -77,6 +77,35 @@ enum class SyncType {
 // =======================
 // ETMetalShaderLibrary - ExecuTorch Metal shader library management
 // =======================
+
+/**
+ * @class ETMetalShaderLibrary
+ * @brief Manages Metal shader library compilation and kernel function retrieval.
+ *
+ * This class provides a high-level interface for compiling Metal shading language
+ * source code into a Metal library and creating compute pipeline states for
+ * kernel functions. It handles the creation and caching of Metal compute pipeline
+ * states and functions, which should be reused across multiple kernel dispatches.
+ *
+ * The class automatically compiles the provided shader source code upon construction
+ * and maintains an internal cache of compute pipeline states for different kernel
+ * functions to avoid redundant compilation.
+ *
+ * Example usage:
+ * @code
+ * std::string shaderSource = R"(
+ *   #include <metal_stdlib>
+ *   using namespace metal;
+ *   kernel void my_kernel(device float* data [[buffer(0)]],
+ *                        uint tid [[thread_position_in_grid]]) {
+ *     data[tid] = data[tid] * 2.0;
+ *   }
+ * )";
+ *
+ * ETMetalShaderLibrary library(shaderSource);
+ * auto kernelFunction = library.getKernelFunction("my_kernel");
+ * @endcode
+ */
 class ETMetalShaderLibrary {
  public:
   ETMetalShaderLibrary(const std::string& source);
@@ -103,6 +132,45 @@ class ETMetalShaderLibrary {
 // =======================
 // ETMetalKernelFunction - ExecuTorch Metal kernel function execution
 // =======================
+
+/**
+ * @class ETMetalKernelFunction
+ * @brief Represents a Metal compute kernel function ready for execution.
+ *
+ * This class encapsulates a Metal compute pipeline state and function, providing
+ * a high-level interface for setting kernel arguments and dispatching compute
+ * work to the GPU. It handles the encoding of compute commands and manages the
+ * interaction with Metal's compute command encoder.
+ *
+ * The class supports different dispatch patterns:
+ * - Single-dimension dispatch for linear workloads
+ * - Multi-dimensional dispatch for grid-based workloads
+ * - Custom thread group sizes for performance optimization
+ *
+ * Kernel arguments can be set using tensors (which will be mapped to Metal buffers)
+ * or scalar values. The class handles the encoding of these arguments
+ * into the compute command encoder.
+ *
+ * Example usage:
+ * @code
+ * // Get kernel function from library
+ * auto kernelFunction = library.getKernelFunction("vector_add");
+ *
+ * // Start encoding commands
+ * kernelFunction->startEncoding();
+ *
+ * // Set tensor arguments
+ * kernelFunction->setArg(0, inputTensorA);
+ * kernelFunction->setArg(1, inputTensorB);
+ * kernelFunction->setArg(2, outputTensor);
+ *
+ * // Set scalar argument
+ * kernelFunction->setArg(3, static_cast<int64_t>(numElements));
+ *
+ * // Dispatch for linear workload
+ * kernelFunction->dispatchSingle(numElements);
+ * @endcode
+ */
 class ETMetalKernelFunction {
  public:
   ETMetalKernelFunction(MTLComputePipelineState_t cps, MTLFunction_t func);
@@ -132,6 +200,45 @@ class ETMetalKernelFunction {
 // =======================
 // ETMetalStream - Metal command buffer and synchronization management
 // =======================
+
+/**
+ * @class ETMetalStream
+ * @brief Manages Metal compute command streams and provides GPU synchronization.
+ *
+ * This class serves as the central management hub for Metal GPU operations, providing
+ * a stream-based abstraction similar to CUDA streams. It handles command buffer lifecycle,
+ * compute command encoder management, and various synchronization patterns required for
+ * efficient GPU computation.
+ *
+ * Key features:
+ * - Lazy command buffer and encoder creation for optimal resource usage
+ * - Thread-safe operations using serial dispatch queues
+ * - Multiple synchronization modes (COMMIT, COMMIT_AND_WAIT, COMMIT_AND_CONTINUE)
+ * - Kernel coalescing to batch multiple operations efficiently
+ * - MPSGraph integration for high-level neural network operations
+ * - Memory operations (copy, fill) with GPU acceleration via blit encoders
+ *
+ * The stream follows PyTorch's MPS stream design patterns, providing similar semantics
+ * for command buffer management and synchronization.
+ *
+ * Example usage:
+ * @code
+ * // Get current stream (typically the default stream)
+ * ETMetalStream* stream = getCurrentMetalStream();
+ *
+ * // Execute kernel operations (handled automatically)
+ * auto kernelFunction = library.getKernelFunction("my_kernel");
+ * kernelFunction->startEncoding();
+ * kernelFunction->setArg(0, inputTensor);
+ * kernelFunction->dispatchSingle(numElements);
+ *
+ * // Synchronize to ensure completion
+ * stream->synchronize(SyncType::COMMIT_AND_WAIT);
+ *
+ * // Copy between GPU buffers using blit encoder
+ * stream->copy(srcBuffer, dstBuffer, numBytes, 0, 0, SyncType::COMMIT);
+ * @endcode
+ */
 class ETMetalStream {
  public:
   ETMetalStream();
diff --git a/backends/apple/metal/runtime/shims/et_metal.mm b/backends/apple/metal/runtime/shims/et_metal.mm
@@ -743,6 +743,26 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
 
 void ETMetalStream::copy(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer, size_t length,
                         size_t srcOffset, size_t dstOffset, SyncType syncType) {
+
+    if (length == 0) {
+        return;
+
+    // Check that offsets are within buffer bounds before copying
+    if (!srcBuffer || !dstBuffer) {
+        ET_LOG(Error, "ETMetalStream::copy: Source or destination buffer is nil");
+        return;
+    }
+    NSUInteger srcBufferLength = [srcBuffer length];
+    NSUInteger dstBufferLength = [dstBuffer length];
+    if (srcOffset + length > srcBufferLength) {
+        ET_LOG(Error, "ETMetalStream::copy: Source offset (%zu) + length (%zu) exceeds source buffer size (%zu)", srcOffset, length, srcBufferLength);
+        return;
+    }
+    if (dstOffset + length > dstBufferLength) {
+        ET_LOG(Error, "ETMetalStream::copy: Destination offset (%zu) + length (%zu) exceeds destination buffer size (%zu)", dstOffset, length, dstBufferLength);
+        return;
+    }
+
     dispatch_sync(serialQueue_, ^{
         @autoreleasepool {
             endKernelCoalescing();
@@ -792,8 +812,6 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
                            targetOperations:nil
                           resultsDictionary:results
                         executionDescriptor:nil];
-
-            //synchronize(syncType);
         }
     });
 }
diff --git a/backends/apple/metal/runtime/shims/shim_mps.mm b/backends/apple/metal/runtime/shims/shim_mps.mm
@@ -97,31 +97,33 @@ AOTITorchError aoti_torch_mps_get_kernel_function(
         return Error::InvalidArgument;
     }
 
-    try {
-        auto* library = reinterpret_cast<ETMetalShaderLibrary*>(library_handle);
-        auto function_shared_ptr = library->getKernelFunction(std::string(kernel_name));
-        if (!function_shared_ptr) {
-            ET_LOG(Error, "aoti_torch_mps_get_kernel_function: Failed to get kernel function '%s'", kernel_name);
-            return Error::Internal;
-        }
+    @autoreleasepool {
+        try {
+            auto* library = reinterpret_cast<ETMetalShaderLibrary*>(library_handle);
+            auto function_shared_ptr = library->getKernelFunction(std::string(kernel_name));
+            if (!function_shared_ptr) {
+                ET_LOG(Error, "aoti_torch_mps_get_kernel_function: Failed to get kernel function '%s'", kernel_name);
+                return Error::Internal;
+            }
 
-        auto* raw_function = function_shared_ptr.get();
+            auto* raw_function = function_shared_ptr.get();
 
-        // Store the shared_ptr to keep the object alive
-        storeFunctionHandle(raw_function, function_shared_ptr);
+            // Store the shared_ptr to keep the object alive
+            storeFunctionHandle(raw_function, function_shared_ptr);
 
-        // Return raw pointer to match existing API
-        *function_handle = reinterpret_cast<AOTIMetalKernelFunctionHandle>(raw_function);
+            // Return raw pointer to match existing API
+            *function_handle = reinterpret_cast<AOTIMetalKernelFunctionHandle>(raw_function);
 
-        ET_LOG(Debug, "aoti_torch_mps_get_kernel_function: Got kernel function '%s' -> %p", kernel_name, raw_function);
-        return Error::Ok;
+            ET_LOG(Debug, "aoti_torch_mps_get_kernel_function: Got kernel function '%s' -> %p", kernel_name, raw_function);
+            return Error::Ok;
 
-    } catch (const std::exception& e) {
-        ET_LOG(Error, "aoti_torch_mps_get_kernel_function exception: %s", e.what());
-        return Error::Internal;
-    } catch (...) {
-        ET_LOG(Error, "aoti_torch_mps_get_kernel_function: unknown exception");
-        return Error::Internal;
+        } catch (const std::exception& e) {
+            ET_LOG(Error, "aoti_torch_mps_get_kernel_function exception: %s", e.what());
+            return Error::Internal;
+        } catch (...) {
+            ET_LOG(Error, "aoti_torch_mps_get_kernel_function: unknown exception");
+            return Error::Internal;
+        }
     }
 }
 
@@ -133,19 +135,21 @@ AOTITorchError aoti_torch_mps_start_encoding(
         return Error::InvalidArgument;
     }
 
-    try {
-        auto* function = reinterpret_cast<ETMetalKernelFunction*>(func);
-        function->startEncoding();
+    @autoreleasepool {
+        try {
+            auto* function = reinterpret_cast<ETMetalKernelFunction*>(func);
+            function->startEncoding();
 
-        ET_LOG(Debug, "aoti_torch_mps_start_encoding: Started encoding for function %p", function);
-        return Error::Ok;
+            ET_LOG(Debug, "aoti_torch_mps_start_encoding: Started encoding for function %p", function);
+            return Error::Ok;
 
-    } catch (const std::exception& e) {
-        ET_LOG(Error, "aoti_torch_mps_start_encoding exception: %s", e.what());
-        return Error::Internal;
-    } catch (...) {
-        ET_LOG(Error, "aoti_torch_mps_start_encoding: unknown exception");
-        return Error::Internal;
+        } catch (const std::exception& e) {
+            ET_LOG(Error, "aoti_torch_mps_start_encoding exception: %s", e.what());
+            return Error::Internal;
+        } catch (...) {
+            ET_LOG(Error, "aoti_torch_mps_start_encoding: unknown exception");
+            return Error::Internal;
+        }
     }
 }
 
@@ -268,6 +272,11 @@ AOTITorchError aoti_torch_mps_dispatch_array(
         return Error::InvalidArgument;
     }
 
+    if (!length) {
+        ET_LOG(Error, "aoti_torch_mps_dispatch_array_with_group_size: null length pointer");
+        return Error::InvalidArgument;
+    }
+
     try {
         auto* function = reinterpret_cast<ETMetalKernelFunction*>(func);
         function->dispatchArray(length, length_size);
@@ -296,6 +305,11 @@ AOTITorchError aoti_torch_mps_dispatch_array_with_group_size(
         return Error::InvalidArgument;
     }
 
+    if (!length) {
+        ET_LOG(Error, "aoti_torch_mps_dispatch_array_with_group_size: null length pointer");
+        return Error::InvalidArgument;
+    }
+
     try {
         auto* function = reinterpret_cast<ETMetalKernelFunction*>(func);
         function->dispatchArrayWithGroupSize(length, length_size, group_size, group_size_size);