Skip to content

Commit 43703ac

Browse files
committed
[FFI][REFACTOR] Establish Stream Context in ffi
This PR sets up the stream context in ffi and migrates the existing per-device-API stream context management to the ffi env API. The new API will help us streamline stream-related integration for most libraries.
1 parent 6bc94d0 commit 43703ac

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+305
-157
lines changed

ffi/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ if (TVM_FFI_USE_EXTRA_CXX_API)
7373
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/library_module.cc"
7474
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/library_module_system_lib.cc"
7575
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/library_module_dynamic_lib.cc"
76+
"${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/stream_context.cc"
7677
)
7778
endif()
7879

ffi/include/tvm/ffi/extra/c_env_api.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,39 @@
2929
extern "C" {
3030
#endif
3131

32+
// ----------------------------------------------------------------------------
33+
// Stream context
34+
// A minimalistic thread-local context that records the stream currently in use.
35+
// We explicitly do not handle allocation/de-allocation of streams here.
36+
// ----------------------------------------------------------------------------
37+
typedef void* TVMFFIStreamHandle;
38+
39+
/*!
40+
* \brief FFI function to set the current stream for a device
41+
*
42+
* \param device_type The type of the device.
43+
* \param device_id The id of the device.
44+
* \param stream The stream to set.
45+
* \param opt_out_original_stream Output original stream if the address is not nullptr.
46+
* \note The stream is stored as a weak reference; this API does not own,
*       allocate, or release the stream.
47+
* \return 0 when success, nonzero when failure happens
48+
*/
49+
TVM_FFI_DLL int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id,
50+
TVMFFIStreamHandle stream,
51+
TVMFFIStreamHandle* opt_out_original_stream);
52+
53+
/*!
54+
* \brief FFI function to get the current stream for a device
55+
*
56+
* \param device_type The type of the device.
57+
* \param device_id The id of the device.
58+
* \return The current stream of the device.
59+
*/
60+
TVM_FFI_DLL TVMFFIStreamHandle TVMFFIEnvGetCurrentStream(int32_t device_type, int32_t device_id);
61+
62+
// ----------------------------------------------------------------------------
63+
// Module symbol management
64+
// ----------------------------------------------------------------------------
3265
/*!
3366
* \brief FFI function to lookup a function from a module's imports.
3467
*
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
/*
20+
* \file src/ffi/extra/stream_context.cc
21+
*
22+
* \brief A minimalistic stream context based on ffi values.
23+
*/
24+
25+
#include <tvm/ffi/extra/c_env_api.h>
26+
#include <tvm/ffi/function.h>
27+
28+
#include <vector>
29+
30+
namespace tvm {
31+
namespace ffi {
32+
33+
class StreamContext {
34+
public:
35+
void SetStream(int32_t device_type, int32_t device_id, TVMFFIStreamHandle stream,
36+
TVMFFIStreamHandle* out_original_stream) {
37+
if (static_cast<size_t>(device_type) >= stream_table_.size()) {
38+
stream_table_.resize(device_type + 1);
39+
}
40+
if (static_cast<size_t>(device_id) >= stream_table_[device_type].size()) {
41+
stream_table_[device_type].resize(device_id + 1, nullptr);
42+
}
43+
if (out_original_stream != nullptr) {
44+
*out_original_stream = stream_table_[device_type][device_id];
45+
}
46+
stream_table_[device_type][device_id] = stream;
47+
}
48+
49+
TVMFFIStreamHandle GetStream(int32_t device_type, int32_t device_id) {
50+
if (static_cast<size_t>(device_type) < stream_table_.size() &&
51+
static_cast<size_t>(device_id) < stream_table_[device_type].size()) {
52+
return stream_table_[device_type][device_id];
53+
}
54+
return nullptr;
55+
}
56+
57+
static StreamContext* ThreadLocal() {
58+
static thread_local StreamContext inst;
59+
return &inst;
60+
}
61+
62+
private:
63+
std::vector<std::vector<TVMFFIStreamHandle>> stream_table_;
64+
};
65+
66+
} // namespace ffi
67+
} // namespace tvm
68+
69+
int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, TVMFFIStreamHandle stream,
70+
TVMFFIStreamHandle* out_original_stream) {
71+
TVM_FFI_SAFE_CALL_BEGIN();
72+
tvm::ffi::StreamContext::ThreadLocal()->SetStream(device_type, device_id, stream,
73+
out_original_stream);
74+
TVM_FFI_SAFE_CALL_END();
75+
}
76+
77+
TVMFFIStreamHandle TVMFFIEnvGetCurrentStream(int32_t device_type, int32_t device_id) {
  TVM_FFI_LOG_EXCEPTION_CALL_BEGIN();
  // Read-only lookup on the calling thread's table; yields nullptr when no
  // stream has been recorded for this device.
  tvm::ffi::StreamContext* ctx = tvm::ffi::StreamContext::ThreadLocal();
  return ctx->GetStream(device_type, device_id);
  TVM_FFI_LOG_EXCEPTION_CALL_END(TVMFFIEnvGetCurrentStream);
}

include/tvm/runtime/device_api.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ class TVM_DLL DeviceAPI {
225225
* \param dev The device to set stream.
226226
* \param stream The stream to be set.
227227
*/
228-
virtual void SetStream(Device dev, TVMStreamHandle stream) {}
228+
virtual void SetStream(Device dev, TVMStreamHandle stream);
229229
/*!
230230
* \brief Get the current stream
231231
* \param dev The device to get stream.

python/tvm/contrib/cutlass/attention_operation.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,7 @@ def instantiate_attention_template(attrs):
147147
}
148148
149149
CHECK(Attention::check_supported(p));
150-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
151-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
150+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${query}->device.device_id));
152151
153152
kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
154153
@@ -186,8 +185,7 @@ def instantiate_flash_attention_template(attrs):
186185
int v_batch_stride = v_row_stride * ${num_keys};
187186
int o_batch_stride = o_row_stride * ${num_queries};
188187
189-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
190-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
188+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${query}->device.device_id));
191189
192190
flash_attn::flash_attention_forward(
193191
static_cast<const cutlass::half_t*>(${query}->data),
@@ -237,8 +235,7 @@ def instantiate_flash_attention_template(attrs):
237235
int v_batch_stride = v_row_stride * ${num_keys};
238236
int o_batch_stride = o_row_stride * ${num_queries};
239237
240-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
241-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
238+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${query}->device.device_id));
242239
243240
flash_attn::flash_attention_forward(
244241
static_cast<const cutlass::half_t*>(${qkv}->data),
@@ -294,8 +291,7 @@ def instantiate_flash_attention_var_len_template(attrs):
294291
int v_row_stride = v_head_stride * ${num_kv_heads};
295292
int o_row_stride = o_head_stride * ${num_q_heads};
296293
297-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
298-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
294+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${query}->device.device_id));
299295
300296
flash_attn::flash_attention_var_len_forward(
301297
static_cast<const cutlass::half_t*>(${query}->data),

python/tvm/contrib/cutlass/conv2d_operation.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -424,8 +424,7 @@ def instantiate_conv2d_template(attrs):
424424
TVM_FFI_ICHECK(status == cutlass::Status::kSuccess);
425425
${split_k_update}
426426
427-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
428-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
427+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${data_arg}->device.device_id));
429428
430429
status = conv2d_op(stream);
431430
TVM_FFI_ICHECK(status == cutlass::Status::kSuccess);

python/tvm/contrib/cutlass/gemm_operation.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,7 @@ def instantiate_gemm_template(attrs):
345345
status = gemm_op.initialize(arguments, workspace.get());
346346
TVM_FFI_ICHECK(status == cutlass::Status::kSuccess);
347347
348-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
349-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
348+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${A_arg}->device.device_id));
350349
351350
status = gemm_op(stream);
352351
TVM_FFI_ICHECK(status == cutlass::Status::kSuccess);
@@ -428,8 +427,8 @@ def emit_fp16A_intB_matmul(attrs):
428427
int n = ${B_arg}->shape[1] * ${float_per_int};
429428
int k = ${B_arg}->shape[0];
430429
431-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
432-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
430+
cudaStream_t stream = static_cast<cudaStream_t>(
431+
TVMFFIEnvGetCurrentStream(kDLCUDA, ${A_arg}->device.device_id));
433432
""",
434433
attrs,
435434
)
@@ -447,12 +446,14 @@ def emit_fp16A_intB_matmul(attrs):
447446

448447
template_residual = """
449448
${template_common}
450-
gemm_fp16_int_bias_act_residual<${weight_dtype}, QuantOp>(static_cast<cutlass::half_t*>(${A_arg}->data),
449+
gemm_fp16_int_bias_act_residual<${weight_dtype}, QuantOp>(
450+
static_cast<cutlass::half_t*>(${A_arg}->data),
451451
static_cast<${weight_dtype}*>(${B_arg}->data),
452452
static_cast<cutlass::half_t*>(${scales_arg}->data),
453453
${bias},
454454
static_cast<cutlass::half_t*>(${residual_arg}->data),
455-
static_cast<cutlass::half_t*>(out0->data), "${activation}", "${binary_op}", "${unary_op}",
455+
static_cast<cutlass::half_t*>(out0->data),
456+
"${activation}", "${binary_op}", "${unary_op}",
456457
m, n, k, ${group_size}, nullptr, 0, stream);
457458
"""
458459

python/tvm/contrib/cutlass/gen_tensor_op.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ def instantiate_template(func_name, annotations, func_args):
487487
if k in annotations:
488488
attrs[k] = annotations[k]
489489

490-
headers = ["tvm/ffi/function.h"]
490+
headers = ["tvm/ffi/function.h", "tvm/ffi/extra/c_env_api.h"]
491491

492492
if "relu" in func_name:
493493
headers.append("cutlass/epilogue/thread/linear_combination_bias_relu.h")

python/tvm/contrib/cutlass/layer_norm_operation.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@ def instantiate_layer_norm_template(attrs):
3939
cutlass::TensorRef<data_type, RowMajor> _beta((data_type*)${beta}->data, layout_channels);
4040
cutlass::TensorRef<data_type, RowMajor> _output((data_type*)out0->data, layout_2D);
4141
42-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
43-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
42+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${input}->device.device_id));
4443
4544
cutlass::layernorm(size, _output, _input, _gamma, _beta, stream);
4645
"""

python/tvm/contrib/cutlass/rms_norm_operation.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ def instantiate_rms_norm_template(attrs):
3838
cutlass::TensorRef<data_type, RowMajor> _weight((data_type*)${weight}->data, layout_channels);
3939
cutlass::TensorRef<data_type, RowMajor> _output((data_type*)out0->data, layout_2D);
4040
41-
auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream");
42-
cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>());
41+
cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetCurrentStream(kDLCUDA, ${input}->device.device_id));
4342
4443
cutlass::rmsnorm(size, _output, _input, _weight, stream, ${rms_eps});
4544
"""

0 commit comments

Comments
 (0)