apache · tqchen · Sep 28, 2025 · Sep 28, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@
 
 [project]
 name = "apache-tvm-ffi"
-version = "0.1.0b10"
+version = "0.1.0b11"
 description = "tvm ffi"
 
 authors = [{ name = "TVM FFI team" }]

diff --git a/python/tvm_ffi/__init__.py b/python/tvm_ffi/__init__.py
@@ -17,7 +17,7 @@
 """TVM FFI Python package."""
 
 # version
-__version__ = "0.1.0b10"
+__version__ = "0.1.0b11"
 
 # order matters here so we need to skip isort here
 # isort: skip_file

diff --git a/python/tvm_ffi/cython/function.pxi b/python/tvm_ffi/cython/function.pxi
@@ -155,7 +155,7 @@ cdef int TVMFFIPyArgSetterDLPackCExporter_(
     if this.c_dlpack_tensor_allocator != NULL:
         ctx.c_dlpack_tensor_allocator = this.c_dlpack_tensor_allocator
 
-    if ctx.device_id != -1:
+    if ctx.device_type != -1:
         # already queried device, do not do it again, pass NULL to stream
         if (this.c_dlpack_from_pyobject)(arg, &temp_managed_tensor, NULL) != 0:
             return -1

diff --git a/tests/python/test_load_inline.py b/tests/python/test_load_inline.py
@@ -167,7 +167,7 @@ def test_load_inline_cuda() -> None:
               }
             }
 
-            void add_one_cuda(tvm::ffi::Tensor x, tvm::ffi::Tensor y) {
+            void add_one_cuda(tvm::ffi::Tensor x, tvm::ffi::Tensor y, int64_t raw_stream) {
               // implementation of a library function
               TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
               DLDataType f32_dtype{kDLFloat, 32, 1};
@@ -184,6 +184,8 @@ def test_load_inline_cuda() -> None:
               // with torch.Tensors
               cudaStream_t stream = static_cast<cudaStream_t>(
                   TVMFFIEnvGetStream(x->device.device_type, x->device.device_id));
+              TVM_FFI_ICHECK_EQ(reinterpret_cast<int64_t>(stream), raw_stream)
+                << "stream must be the same as raw_stream";
               // launch the kernel
               AddOneKernel<<<nblock, nthread_per_block, 0, stream>>>(static_cast<float*>(x->data),
                                                                      static_cast<float*>(y->data), n);
@@ -193,9 +195,18 @@ def test_load_inline_cuda() -> None:
     )
 
     if torch is not None:
+        # test with raw stream
         x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda")
         y_cuda = torch.empty_like(x_cuda)
-        mod.add_one_cuda(x_cuda, y_cuda)
+        mod.add_one_cuda(x_cuda, y_cuda, 0)
+        torch.testing.assert_close(x_cuda + 1, y_cuda)
+
+        # test with torch stream
+        y_cuda = torch.empty_like(x_cuda)
+        stream = torch.cuda.Stream()
+        with torch.cuda.stream(stream):
+            mod.add_one_cuda(x_cuda, y_cuda, stream.cuda_stream)
+        stream.synchronize()
         torch.testing.assert_close(x_cuda + 1, y_cuda)