From e4370461896eb63ab6464b29e03fa90a27557f89 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 27 Sep 2025 20:35:55 -0400 Subject: [PATCH] [CYTHON] Fix stream passing bug This PR fixes a bug in stream passing that broke the expected stream passing behavior: TVMFFIPyArgSetterDLPackCExporter_ checked `ctx.device_id != -1` to decide whether the device had already been queried, where it should check `ctx.device_type != -1`. It also adds a regression case via load_inline_cuda to guard against this issue (the test currently needs a CUDA environment to run). --- pyproject.toml | 2 +- python/tvm_ffi/__init__.py | 2 +- python/tvm_ffi/cython/function.pxi | 2 +- tests/python/test_load_inline.py | 15 +++++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f73418479..4a6a7346d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ [project] name = "apache-tvm-ffi" -version = "0.1.0b10" +version = "0.1.0b11" description = "tvm ffi" authors = [{ name = "TVM FFI team" }] diff --git a/python/tvm_ffi/__init__.py b/python/tvm_ffi/__init__.py index 807f9a9f9..d88a35e3e 100644 --- a/python/tvm_ffi/__init__.py +++ b/python/tvm_ffi/__init__.py @@ -17,7 +17,7 @@ """TVM FFI Python package.""" # version -__version__ = "0.1.0b10" +__version__ = "0.1.0b11" # order matters here so we need to skip isort here # isort: skip_file diff --git a/python/tvm_ffi/cython/function.pxi b/python/tvm_ffi/cython/function.pxi index 095e3d6a9..2fa75fbef 100644 --- a/python/tvm_ffi/cython/function.pxi +++ b/python/tvm_ffi/cython/function.pxi @@ -155,7 +155,7 @@ cdef int TVMFFIPyArgSetterDLPackCExporter_( if this.c_dlpack_tensor_allocator != NULL: ctx.c_dlpack_tensor_allocator = this.c_dlpack_tensor_allocator - if ctx.device_id != -1: + if ctx.device_type != -1: # already queried device, do not do it again, pass NULL to stream if (this.c_dlpack_from_pyobject)(arg, &temp_managed_tensor, NULL) != 0: return -1 diff --git a/tests/python/test_load_inline.py b/tests/python/test_load_inline.py index cd46bf5f7..229dc6265 100644 --- a/tests/python/test_load_inline.py +++ b/tests/python/test_load_inline.py @@ -167,7 +167,7 @@ def 
test_load_inline_cuda() -> None: } } - void add_one_cuda(tvm::ffi::Tensor x, tvm::ffi::Tensor y) { + void add_one_cuda(tvm::ffi::Tensor x, tvm::ffi::Tensor y, int64_t raw_stream) { // implementation of a library function TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor"; DLDataType f32_dtype{kDLFloat, 32, 1}; @@ -184,6 +184,8 @@ def test_load_inline_cuda() -> None: // with torch.Tensors cudaStream_t stream = static_cast( TVMFFIEnvGetStream(x->device.device_type, x->device.device_id)); + TVM_FFI_ICHECK_EQ(reinterpret_cast(stream), raw_stream) + << "stream must be the same as raw_stream"; // launch the kernel AddOneKernel<<>>(static_cast(x->data), static_cast(y->data), n); @@ -193,9 +195,18 @@ def test_load_inline_cuda() -> None: ) if torch is not None: + # test with raw stream x_cuda = torch.asarray([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda") y_cuda = torch.empty_like(x_cuda) - mod.add_one_cuda(x_cuda, y_cuda) + mod.add_one_cuda(x_cuda, y_cuda, 0) + torch.testing.assert_close(x_cuda + 1, y_cuda) + + # test with torch stream + y_cuda = torch.empty_like(x_cuda) + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + mod.add_one_cuda(x_cuda, y_cuda, stream.cuda_stream) + stream.synchronize() torch.testing.assert_close(x_cuda + 1, y_cuda)