diff --git a/docs/programming_guides/autotuning.md b/docs/programming_guides/autotuning.md index 66d46889f..9cc5a2d94 100644 --- a/docs/programming_guides/autotuning.md +++ b/docs/programming_guides/autotuning.md @@ -293,7 +293,7 @@ def matmul_configs(M, N, K): Tune compile‑time options explicitly: - `target='auto'|'cuda'|'hip'|'metal'` (normalized to a TVM Target) -- `execution_backend='auto'|'tvm_ffi'|'ctypes'|'cython'|'nvrtc'|'torch'` +- `execution_backend='auto'|'tvm_ffi'|'cython'|'nvrtc'|'torch'` - `pass_configs={...}` to toggle TileLang/TVM passes for experiments On CUDA with multiple GPUs, the tuner sets the current device per worker thread diff --git a/testing/python/cpu/test_tilelang_cpu_gemm.py b/testing/python/cpu/test_tilelang_cpu_gemm.py index 55646622e..4113c9d06 100644 --- a/testing/python/cpu/test_tilelang_cpu_gemm.py +++ b/testing/python/cpu/test_tilelang_cpu_gemm.py @@ -101,7 +101,7 @@ def matmul( block_M, block_N, block_K = M // 4, N // 4, K // 4 cpu_func = matmul_jit_test(M, N, K, block_M, block_N, block_K) with tvm.target.Target("c"): - complied_fun = tilelang.compile(cpu_func, -1, execution_backend="ctypes") + complied_fun = tilelang.compile(cpu_func, -1, execution_backend="cython") in_dtype = T.float16 A = torch.randn(M, K, dtype=torch.__getattribute__(in_dtype)) diff --git a/testing/python/jit/test_tilelang_jit_gemm_cython.py b/testing/python/jit/test_tilelang_jit_gemm_cython.py index c5399fc51..546617583 100644 --- a/testing/python/jit/test_tilelang_jit_gemm_cython.py +++ b/testing/python/jit/test_tilelang_jit_gemm_cython.py @@ -256,24 +256,16 @@ def run_cython_kernel_do_bench( ) cython_matmul_kernel = tilelang.compile(program, execution_backend="cython") - ctypes_matmul_kernel = tilelang.compile(program, execution_backend="ctypes") cython_profiler = cython_matmul_kernel.get_profiler() - ctypes_profiler = ctypes_matmul_kernel.get_profiler() cython_latency = cython_profiler.do_bench(func=cython_matmul_kernel) print(f"cython Latency: {cython_latency} ms") - # assert ctypes_latency is not None - tvm_latency = cython_profiler.do_bench() print(f"TVM Latency: {tvm_latency} ms") assert tvm_latency is not None - - ctypes_latency = ctypes_profiler.do_bench(func=ctypes_matmul_kernel) - print(f"ctypes Latency: {ctypes_latency} ms") - assert cython_latency is not None diff --git a/tilelang/autotuner/param.py b/tilelang/autotuner/param.py index 69ad49c79..e3f540177 100644 --- a/tilelang/autotuner/param.py +++ b/tilelang/autotuner/param.py @@ -49,7 +49,7 @@ class CompileArgs: """ out_idx: list[int] | int | None = None - execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "auto" + execution_backend: Literal["auto", "tvm_ffi", "cython", "nvrtc", "torch"] = "auto" target: Literal["auto", "cuda", "hip"] = "auto" target_host: str | Target = None verbose: bool = False @@ -265,7 +265,7 @@ def _load_kernel_from_disk( target: str | Target = "auto", target_host: str | Target = None, out_idx: list[int] | int | None = None, - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi", + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch"] = "tvm_ffi", pass_configs: dict = None, compile_flags: list[str] | str | None = None, func: Callable = None, diff --git a/tilelang/autotuner/tuner.py b/tilelang/autotuner/tuner.py index 8d9503739..95595d5dc 100644 --- a/tilelang/autotuner/tuner.py +++ b/tilelang/autotuner/tuner.py @@ -144,7 +144,7 @@ def set_compile_args( self, out_idx: list[int] | int | None = None, target: Literal["auto", "cuda", "hip", "metal"] = "auto", - execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "auto", + execution_backend: Literal["auto", "tvm_ffi", "cython", "nvrtc", "torch"] = "auto", target_host: str | Target = None, verbose: bool = False, pass_configs: dict[str, Any] | None = None, @@ -725,7 +725,7 @@ def autotune( # This is the new public interface Compilation target for TVM (e.g., "cuda", "llvm"). Defaults to "auto". target_host : Union[str, Target], optional Target host for cross-compilation. Defaults to None. - execution_backend : Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + execution_backend : Literal["auto", "tvm_ffi", "cython", "nvrtc", "torch"], optional Backend for kernel execution and argument passing. Use "auto" to pick a sensible default per target (cuda->tvm_ffi, metal->torch, others->cython). verbose : bool, optional diff --git a/tilelang/cache/__init__.py b/tilelang/cache/__init__.py index 18ac847bf..9774e8a9c 100644 --- a/tilelang/cache/__init__.py +++ b/tilelang/cache/__init__.py @@ -19,7 +19,7 @@ def cached( *args, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] | None = "auto", + execution_backend: Literal["auto", "tvm_ffi", "cython", "nvrtc", "torch"] | None = "auto", verbose: bool | None = False, pass_configs: dict | None = None, compile_flags: list[str] | str | None = None, diff --git a/tilelang/cache/kernel_cache.py b/tilelang/cache/kernel_cache.py index cf6a5591b..ded861a3d 100644 --- a/tilelang/cache/kernel_cache.py +++ b/tilelang/cache/kernel_cache.py @@ -48,7 +48,7 @@ class KernelCache: _instance = None # For implementing singleton pattern _lock = threading.Lock() # For thread safety _memory_cache = {} # In-memory cache dictionary - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi" + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi" def __new__(cls): """ @@ -77,7 +77,7 @@ def _generate_key( self, func: Callable, out_idx: list[int], - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", args=None, target: str | Target = "auto", target_host: str | Target = None, @@ -123,7 +123,7 @@ def cached( *args, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "auto", + execution_backend: Literal["auto", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "auto", verbose: bool = False, pass_configs: dict = None, compile_flags: list[str] | str | None = None, @@ -389,7 +389,7 @@ def _load_kernel_from_disk( target: str | Target = "auto", target_host: str | Target | None = None, out_idx: list[int] | None = None, - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", pass_configs: dict | None = None, compile_flags: list[str] | str | None = None, func: Callable | None = None, diff --git a/tilelang/jit/__init__.py b/tilelang/jit/__init__.py index eac206f72..23163b1fe 100644 --- a/tilelang/jit/__init__.py +++ b/tilelang/jit/__init__.py @@ -49,7 +49,7 @@ def compile( func: PrimFunc[_KP, _T] = None, out_idx: list[int] | int | None = None, - execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "auto", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "auto", target: str | Target = "auto", target_host: str | Target | None = None, verbose: bool = False, @@ -64,7 +64,7 @@ def compile( The TileLang TIR function to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"], optional + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"], optional Execution backend to use for kernel execution. Use "auto" to pick a sensible default per target (cuda->tvm_ffi, metal->torch, others->cython). target : Union[str, Target], optional @@ -118,7 +118,7 @@ def compile( def par_compile( funcs: Iterable[PrimFunc[_KP, _T]], out_idx: list[int] | int | None = None, - execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "auto", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "auto", target: str | Target = "auto", target_host: str | Target | None = None, verbose: bool = False, @@ -135,7 +135,7 @@ def par_compile( The TileLang TIR functions to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"], optional + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"], optional Execution backend to use for kernel execution. Use "auto" to pick a sensible default per target (cuda->tvm_ffi, metal->torch, others->cython). target : Union[str, Target], optional @@ -202,7 +202,7 @@ class JITImpl(Generic[_P, _KP, _T, _Ret]): out_idx : list[int] | int | None Which output tensor(s) of the compiled kernel should be returned to the caller. Accepts a single index, a list of indices, or None to return all. - execution_backend : Literal["dlpack", "ctypes", "cython"] + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] Backend used for exchanging arguments and executing the generated kernel. target : str | tvm.target.Target TVM compilation target (e.g. "cuda", "llvm", or "auto"). @@ -256,7 +256,7 @@ class JITImpl(Generic[_P, _KP, _T, _Ret]): """ out_idx: list[int] | int | None - execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] target: str | Target target_host: str | Target verbose: bool @@ -424,7 +424,7 @@ def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _Ret: return kernel -ExecutionBackend = Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] +ExecutionBackend = Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] @overload @@ -473,7 +473,7 @@ def jit( # This is the new public interface Compilation target for TVM (e.g., "cuda", "llvm"). Defaults to "auto". target_host : Union[str, Target], optional Target host for cross-compilation. Defaults to None. - execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"], optional + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"], optional Backend for kernel execution and argument passing. Use "auto" to pick a sensible default per target (cuda->tvm_ffi, metal->torch, others->cython). verbose : bool, optional diff --git a/tilelang/jit/adapter/__init__.py b/tilelang/jit/adapter/__init__.py index f511608fc..0d9945285 100644 --- a/tilelang/jit/adapter/__init__.py +++ b/tilelang/jit/adapter/__init__.py @@ -1,6 +1,5 @@ from .base import BaseKernelAdapter # noqa: F401 from .tvm_ffi import TVMFFIKernelAdapter # noqa: F401 -from .ctypes import CtypesKernelAdapter # noqa: F401 from .cython import CythonKernelAdapter # noqa: F401 from .nvrtc import NVRTCKernelAdapter # noqa: F401 from .torch import MetalKernelAdapter # noqa: F401 diff --git a/tilelang/jit/adapter/ctypes/__init__.py b/tilelang/jit/adapter/ctypes/__init__.py deleted file mode 100644 index 5e6fdc84d..000000000 --- a/tilelang/jit/adapter/ctypes/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .adapter import CtypesKernelAdapter # noqa: F401 diff --git a/tilelang/jit/adapter/ctypes/adapter.py b/tilelang/jit/adapter/ctypes/adapter.py deleted file mode 100644 index b7cac9d6f..000000000 --- a/tilelang/jit/adapter/ctypes/adapter.py +++ /dev/null @@ -1,302 +0,0 @@ -"""The profiler and convert to torch utils""" - -from __future__ import annotations -import torch -from ..base import BaseKernelAdapter -import ctypes -from typing import Callable, Any -from tilelang import tvm as tvm -from tvm.target import Target -from tvm.relax import TensorType -from tvm import tir -from tilelang.jit.adapter.wrapper import TLWrapper -from tilelang.jit.adapter.libgen import LibraryGenerator -from tilelang.utils.target import determine_target -from tilelang.utils.language import retrieve_func_from_module - - -# TODO(lei): remove ctypes adapter. -class CtypesKernelAdapter(BaseKernelAdapter): - """Adapter class that converts TVM/TIR functions to callable CUDA kernels using ctypes. - - This adapter handles: - 1. Converting TIR functions to compiled CUDA libraries - 2. Managing dynamic shapes in tensor operations - 3. Wrapping C++ kernels for Python/PyTorch usage - """ - - # Class attributes to store compiled kernel information - target = "cuda" - ir_module: tvm.IRModule | None = None - # The global source code of the kernel -> global means the source code of the kernel - # that is not wrapped by the wrapper code - host_kernel_source: str | None = None - device_kernel_source: str | None = None - lib: ctypes.CDLL | None = None # Compiled library handle - # Maps symbolic variables to their corresponding buffer and shape indices - dynamic_symbolic_map: dict[tir.Var, tuple[int, int]] | None = None - # Pass configs for the compiler - pass_configs: dict[str, Any] | None = None - - # Add new cache attributes - param_dtypes: list[torch.dtype] | None = None # Cache for parameter dtypes - param_shapes: list[list] | None = None # Cache for parameter shapes - - def __init__( - self, - params: list[TensorType], - result_idx: list[int], - target: str, - func_or_mod: tir.PrimFunc | tvm.IRModule, - host_mod: tvm.IRModule | None = None, - device_mod: tvm.IRModule | None = None, - host_kernel_source: str | None = None, - device_kernel_source: str | None = None, - verbose: bool = False, - pass_configs: dict[str, Any] | None = None, - compile_flags: list[str] | None = None, - ): - """Initialize the adapter with the given TIR function or module. - - Args: - params: List of tensor types for inputs/outputs - result_idx: Indices of output tensors - target: Target platform (e.g., 'cuda') - func_or_mod: TIR function or module to be compiled - verbose: Enable verbose logging - """ - self.params = params - self.result_idx = self._legalize_result_idx(result_idx) - self.host_kernel_source = host_kernel_source - self.device_kernel_source = device_kernel_source - - if isinstance(func_or_mod, tir.PrimFunc): - self.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) - else: - self.ir_module = func_or_mod - - # Cache parameter information during initialization - # Convert tvm.DataType to torch.dtype for tensor creation - self.param_dtypes = [param.torch_dtype() for param in params] - self.param_shapes = [] - for param in params: - native_shape = [] - for dim in param.shape: - if isinstance(dim, tir.IntImm): - native_shape.append(int(dim)) - elif isinstance(dim, tir.Var): - native_shape.append(dim) # Keep tir.Var for dynamic dimensions - else: - native_shape.append(dim) - self.param_shapes.append(native_shape) - - self.dynamic_symbolic_map = self._process_dynamic_symbolic() - - self.target = Target.canon_target(determine_target(target)) - self.verbose = verbose - self.wrapper = TLWrapper(self.target) - self.lib_generator = LibraryGenerator(self.target, verbose=verbose) - self.lib_generator.assign_pass_configs(pass_configs) - self.lib_generator.assign_compile_flags(compile_flags) - - self.wrapper.assign_optimized_module(self.ir_module) - self.wrapper.assign_pass_configs(pass_configs) - self.wrapper.assign_host_module(host_mod) - self.wrapper.assign_device_module(device_mod) - self.wrapped_source = self.wrapper.wrap(self.get_kernel_source(kernel_only=True)) - - self.lib_generator.update_lib_code(self.wrapped_source) - self.lib_generator.compile_lib() - self.lib = self.lib_generator.load_lib() - self.lib.init() - - self._post_init() - - @classmethod - def from_database( - cls, - params: list[TensorType], - result_idx: list[int], - target: str, - func_or_mod: tir.PrimFunc | tvm.IRModule, - host_kernel_source: str, - device_kernel_source: str, - kernel_lib_path: str, - verbose: bool = False, - pass_configs: dict[str, Any] | None = None, - compile_flags: list[str] | None = None, - ): - adapter = cls.__new__(cls) - adapter.params = params - adapter.result_idx = adapter._legalize_result_idx(result_idx) - adapter.host_kernel_source = host_kernel_source - adapter.device_kernel_source = device_kernel_source - adapter.wrapped_source = device_kernel_source + "\n\n" + host_kernel_source - adapter.pass_configs = pass_configs - - if isinstance(func_or_mod, tir.PrimFunc): - adapter.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) - else: - adapter.ir_module = func_or_mod - - # Cache parameter information during initialization - # Convert tvm.DataType to torch.dtype for tensor creation - adapter.param_dtypes = [param.torch_dtype() for param in params] - adapter.param_shapes = [] - for param in params: - native_shape = [] - for dim in param.shape: - if isinstance(dim, tir.IntImm): - native_shape.append(int(dim)) - elif isinstance(dim, tir.Var): - native_shape.append(dim) # Keep tir.Var for dynamic dimensions - else: - native_shape.append(dim) - adapter.param_shapes.append(native_shape) - - adapter.dynamic_symbolic_map = adapter._process_dynamic_symbolic() - - adapter.target = Target.canon_target(determine_target(target)) - adapter.verbose = verbose - adapter.lib_generator = LibraryGenerator(adapter.target, verbose=verbose) - adapter.lib_generator.assign_pass_configs(pass_configs) - adapter.lib_generator.assign_compile_flags(compile_flags) - adapter.lib = adapter.lib_generator.load_lib(lib_path=kernel_lib_path) - adapter.lib.init() - - adapter._post_init() - return adapter - - def _process_dynamic_symbolic(self) -> dict[tir.Var, tuple[int, int, int]]: - """Extract information about dynamic shapes from the TIR function. - - Maps symbolic variables to their corresponding (id, buffer_index, dimension) - for runtime shape resolution. - id represents shape or stride, 0 represents shape, 1 represents stride - """ - func = self.prim_func - params = func.params - buffer_map = func.buffer_map - dynamic_symbolic_map = {} - for i, param in enumerate(params): - if param in buffer_map: - buffer = buffer_map[param] - for j, shape in enumerate(buffer.shape): - if isinstance(shape, tir.Var) and (shape not in dynamic_symbolic_map) and (shape not in params): - dynamic_symbolic_map[shape] = (0, i, j) - for i, param in enumerate(params): - if param in buffer_map: - buffer = buffer_map[param] - for j, stride in enumerate(buffer.strides): - if isinstance(stride, tir.Var) and (stride not in dynamic_symbolic_map) and (stride not in params): - dynamic_symbolic_map[stride] = (1, i, j) - return dynamic_symbolic_map - - def _forward_from_prebuild_lib(self, *args, stream: int | None = None): - """Low-level function to call the compiled CUDA kernel. - - Converts PyTorch tensor pointers to C void pointers for ctypes interface. - """ - ctypes_args = [ctypes.c_void_p(arr.data_ptr()) if not isinstance(arr, int) else arr for arr in args] - ctypes_args.append(ctypes.c_void_p(stream)) - self.lib.call(*ctypes_args) - - def _wrap_forward_from_prebuild_lib(self, *ins: list[torch.Tensor], stream: int | None = None): - """High-level wrapper for kernel execution. - - Handles: - 1. Input validation - 2. Output tensor allocation - 3. Dynamic shape resolution - 4. CUDA stream management - - Args: - ins: Input PyTorch tensors - stream: Optional CUDA stream for asynchronous execution - - Returns: - Single tensor or list of tensors containing the kernel results - """ - if len(ins) + len(self.result_idx) != len(self.params): - raise ValueError( - f"Expected {len(self.params)} inputs, got {len(ins) + len(self.result_idx)} with {len(ins)} inputs and {len(self.result_idx)} outputs" - ) - ins_idx = 0 - args = [] - - # tensor pointers - for i in range(len(self.params)): - if i in self.result_idx: - dtype = self.param_dtypes[i] - shape = [] - # Now working with native Python list, no FFI calls needed - for s in self.param_shapes[i]: - if isinstance(s, tir.Var): - ref_tensor_idx, ref_shape_idx = self.dynamic_symbolic_map[s] - shape.append(ins[ref_tensor_idx].shape[ref_shape_idx]) - else: # Already converted to Python int during initialization - shape.append(s) - device = ins[0].device if len(ins) > 0 else torch.cuda.current_device() - tensor = torch.empty(*shape, dtype=dtype, device=device) - else: - tensor = ins[ins_idx] - ins_idx += 1 - args.append(tensor) - - # dynamic symbolics - for _, (ref_id, buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): - if ref_id == 0: - args.append(ins[buffer_idx].shape[shape_idx]) - else: - args.append(ins[buffer_idx].stride(shape_idx)) - - # if stream is not None, we need to pass the stream to the library - if stream is None: - if str(self.target).startswith("cuda") and torch.cuda.is_available(): - stream = torch.cuda.current_stream().cuda_stream - else: - stream = 0 - - self._forward_from_prebuild_lib(*args, stream=stream) - - if len(self.result_idx) == 1: - return args[self.result_idx[0]] - else: - return [args[i] for i in self.result_idx] - - def _convert_torch_func(self) -> Callable: - """Returns a PyTorch-compatible function wrapper for the kernel.""" - return self._wrap_forward_from_prebuild_lib - - @property - def prim_func(self) -> tir.PrimFunc: - """Returns the primary TIR function from the IR module.""" - return retrieve_func_from_module(self.ir_module) - - @property - def srcpath(self): - """Returns the source path of the compiled library.""" - return self.lib_generator.srcpath - - @property - def libpath(self): - """Returns the path to the compiled library.""" - return self.lib_generator.libpath - - @property - def lib_code(self): - """Returns the code of the compiled library.""" - return self.lib_generator.lib_code - - @property - def is_dynamic(self): - """Indicates whether the kernel handles dynamic shapes.""" - return self.dynamic_symbolic_map is not None and len(self.dynamic_symbolic_map) > 0 - - def get_kernel_source(self, kernel_only: bool = False): - """Returns the source code of the compiled kernel.""" - if kernel_only: - return self.device_kernel_source - else: - # Wrapper only has host kernel source - return self.host_kernel_source diff --git a/tilelang/jit/execution_backend.py b/tilelang/jit/execution_backend.py index db5e4a8b4..e2604f24b 100644 --- a/tilelang/jit/execution_backend.py +++ b/tilelang/jit/execution_backend.py @@ -35,16 +35,16 @@ def allowed_backends_for_target(target: Target, *, include_unavailable: bool = T if is_cutedsl_target(target): return ["cutedsl"] elif kind == "cuda": - allowed = ["tvm_ffi", "nvrtc", "cython", "ctypes"] + allowed = ["tvm_ffi", "nvrtc", "cython"] elif kind == "hip": - allowed = ["tvm_ffi", "cython", "ctypes"] + allowed = ["tvm_ffi", "cython"] elif kind == "metal": allowed = ["torch"] elif kind == "c": # CPU C backend - allowed = ["cython", "ctypes", "tvm_ffi"] + allowed = ["cython", "tvm_ffi"] else: # Fallback: prefer portable hosts - allowed = ["cython", "ctypes", "tvm_ffi"] + allowed = ["cython", "tvm_ffi"] if not include_unavailable: # Drop NVRTC if not importable diff --git a/tilelang/jit/kernel.py b/tilelang/jit/kernel.py index a788e76ba..de935d996 100644 --- a/tilelang/jit/kernel.py +++ b/tilelang/jit/kernel.py @@ -17,7 +17,6 @@ from tilelang.engine.param import CompiledArtifact, KernelParam from tilelang.jit.adapter import ( BaseKernelAdapter, - CtypesKernelAdapter, CythonKernelAdapter, CuTeDSLKernelAdapter, TVMFFIKernelAdapter, @@ -64,7 +63,7 @@ def __init__( self, func: PrimFunc = None, out_idx: list[int] | int = None, - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"] = "tvm_ffi", target: str | Target = "auto", target_host: str | Target = None, verbose: bool = False, @@ -81,7 +80,7 @@ def __init__( The TileLang TIR function to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch", "cutedsl"], optional + execution_backend : Literal["tvm_ffi", "cython", "nvrtc", "torch", "cutedsl"], optional Execution backend to use for kernel execution. target : Union[str, Target], optional Compilation target, either as a string or a TVM Target object (default: "auto"). @@ -112,7 +111,6 @@ def __init__( # Validate the execution backend. assert execution_backend in [ "tvm_ffi", - "ctypes", "cython", "nvrtc", "torch", @@ -158,7 +156,7 @@ def from_database( target: str | Target, target_host: str | Target, out_idx: list[int] | int, - execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], + execution_backend: Literal["tvm_ffi", "cython", "nvrtc", "torch"], pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None, ): @@ -269,19 +267,6 @@ def _compile_and_create_adapter(self, tilelang_func: PrimFunc, out_idx: list[int pass_configs=pass_configs, compile_flags=compile_flags, ) - elif execution_backend == "ctypes": - adapter = CtypesKernelAdapter( - params=artifact.params, - result_idx=out_idx, - target=target, - func_or_mod=tilelang_func, - host_mod=artifact.host_mod, - device_mod=artifact.device_mod, - device_kernel_source=artifact.kernel_source, - verbose=verbose, - pass_configs=pass_configs, - compile_flags=compile_flags, - ) elif execution_backend == "cython": adapter = CythonKernelAdapter( params=artifact.params, @@ -372,18 +357,6 @@ def _create_adapter_from_database( pass_configs=pass_configs, compile_flags=compile_flags, ) - elif execution_backend == "ctypes": - adapter = CtypesKernelAdapter.from_database( - params=params, - result_idx=result_idx, - target=target, - func_or_mod=func_or_mod, - host_kernel_source=host_kernel_source, - device_kernel_source=device_kernel_source, - kernel_lib_path=kernel_lib_path, - pass_configs=pass_configs, - compile_flags=compile_flags, - ) elif execution_backend == "cython": adapter = CythonKernelAdapter.from_database( params=params, @@ -471,7 +444,7 @@ def get_kernel_source(self, kernel_only: bool = True) -> str: str The source code of the compiled kernel function. """ - if self.execution_backend in {"ctypes", "cython", "nvrtc", "tvm_ffi", "cutedsl"}: + if self.execution_backend in {"cython", "nvrtc", "tvm_ffi", "cutedsl"}: return self.adapter.get_kernel_source(kernel_only=kernel_only) return self.artifact.kernel_source @@ -479,7 +452,7 @@ def get_host_source(self) -> str: """ Returns the source code of the host function. """ - if self.execution_backend in {"ctypes", "cython", "nvrtc", "tvm_ffi", "cutedsl"}: + if self.execution_backend in {"cython", "nvrtc", "tvm_ffi", "cutedsl"}: return self.adapter.get_host_source() assert self.artifact.host_mod is not None, "host_mod is not available" return str(self.artifact.host_mod)