[Enhancement] Introduce T.__ldg
#1414
src/target/codegen_cuda.cc

@@ -2354,6 +2354,23 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
    stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr
           << ")), \"r\"((int)" << guard << ")\n";
    stream << ");\n";
  } else if (op->op.same_as(tl::__ldg())) {
    // Explicit read-only cached load. Preferred form: __ldg(BufferLoad(...)).
    // Fallback form: __ldg(buffer, index)
    const BufferLoadNode *bl = nullptr;
    if (!op->args.empty()) {
      bl = op->args[0].as<BufferLoadNode>();
    }
    if (bl == nullptr) {
      LOG(FATAL) << "T.__ldg expects a BufferLoad as the first argument.";
    }
    const BufferNode *buffer = bl->buffer.get();
    ICHECK_EQ(bl->indices.size(), 1)
        << "T.__ldg currently supports flattened 1D buffer accesses.";
    PrimExpr base = bl->indices[0];
    // Emit __ldg(&buffer_ref)
    auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
    os << "__ldg(&(" << buffer_ref << "))";
Comment on lines +2357 to +2373
Contributor
🧩 Analysis chain
🏁 Scripts executed against tile-ai/tilelang:

# First, confirm the exact code at the specified lines
cat -n src/target/codegen_cuda.cc | sed -n '2357,2373p'
# Search for other __ldg usage patterns in the codebase
rg '__ldg' --type cc --type h -B 2 -A 5
# Check if there are existing dtype checks in similar CUDA operations
rg 'dtype.*bits\(\)' --type cc --type h -B 1 -A 1 | head -50
# Search for CUDA capability checks and SM arch support in the codebase
rg 'SM_[0-9]' --type cc --type h | head -20
# Look for TileLang CI configuration
fd -e yml -e yaml -e json | grep -i 'ci\|test' | head -20

🌐 Web query:
💡 Result: Official NVIDIA docs state __ldg supports a fixed set of built-in scalar and vector types.
Sources: NVIDIA CUDA Programming Guide / C++ Language Extensions (see Read-Only Data Cache / __ldg entries). [1][2]

# Search for __ldg usage and other dtype checks without type specification
rg '__ldg|dtype\.bits\(\)|is_scalar' -A 3 -B 1
# Look at the full context around the __ldg implementation
sed -n '2340,2380p' src/target/codegen_cuda.cc
# Search for dtype validation patterns in codegen_cuda.cc
sed -n '1,100p' src/target/codegen_cuda.cc | grep -i 'dtype\|check'
# Find other CUDA intrinsic handlers to see if they have dtype gating
rg 'LOG\(FATAL\)|ICHECK.*dtype' src/target/codegen_cuda.cc -B 2 -A 1 | head -60
# Look for any existing 32/64-bit checks in CUDA codegen
rg '\.bits\(\).*32|\.bits\(\).*64' -A 1 -B 1
# Check builtin.h and builtin.cc to understand the design intent
sed -n '30,50p' src/op/builtin.h | grep -A 15 "__ldg"
# Look for any architecture baseline in CI config or comments
fd -e yml -e yaml . | head -5 | xargs grep -l 'cuda\|gpu\|compute_capability' 2>/dev/null | head -3
# Check if there are baseline SM arch definitions in the codebase
rg 'compute_capability|sm_[0-9]|SM_[0-9]' --type cc --type h -i | head -10

Remove the misleading "fallback form" comment and add dtype validation for T.__ldg.

The comment at lines 2358–2359 claims a fallback form __ldg(buffer, index), but the codegen only accepts a BufferLoad as the first argument. Update the comment to reflect the actual implementation, replace LOG(FATAL) with ICHECK, and validate the dtype:

 } else if (op->op.same_as(tl::__ldg())) {
- // Explicit read-only cached load. Preferred form: __ldg(BufferLoad(...)).
- // Fallback form: __ldg(buffer, index)
+ // Explicit read-only cached load. Expects a flattened 1D BufferLoad.
const BufferLoadNode *bl = nullptr;
if (!op->args.empty()) {
bl = op->args[0].as<BufferLoadNode>();
}
- if (bl == nullptr) {
- LOG(FATAL) << "T.__ldg expects a BufferLoad as the first argument.";
- }
+ ICHECK(bl) << "T.__ldg expects a BufferLoad as the first argument.";
const BufferNode *buffer = bl->buffer.get();
ICHECK_EQ(bl->indices.size(), 1)
<< "T.__ldg currently supports flattened 1D buffer accesses.";
+ // Validate dtype: baseline support for 32/64-bit scalars.
+ ICHECK(op->dtype.is_scalar())
+ << "T.__ldg currently supports scalar element loads only, but got " << op->dtype;
+ ICHECK(op->dtype.bits() == 32 || op->dtype.bits() == 64)
+ << "T.__ldg only supports 32/64-bit scalar types for now, but got " << op->dtype;
PrimExpr base = bl->indices[0];
// Emit __ldg(&buffer_ref)
auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
os << "__ldg(&(" << buffer_ref << "))";🤖 Prompt for AI Agents |
  } else if (op->op.same_as(builtin::reinterpret())) {
    DataType tgt_dtype = op->dtype;
    DataType src_dtype = op->args[0]->dtype;
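For orientation, here is a hand-written sketch of the kind of device code this branch is meant to produce for a 1D float32 load. This is hypothetical output, not text generated by the PR: the kernel name, signature, and index expression are illustrative, and the real pointer casts come from GetBufferRef.

// Sketch only: an elementwise kernel where x is read through the read-only
// data cache. The emitted expression has the shape __ldg(&(<buffer_ref>)).
extern "C" __global__ void ldg_example_kernel(const float *__restrict__ x,
                                              float *__restrict__ y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // illustrative indexing
  y[i] = __ldg(&(x[i])) + 1.0f;                   // cached read-only load of x[i]
}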
@@ -0,0 +1,30 @@
import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.testing.requires_cuda
def test_language_ldg_codegen():
    N = 128

    @T.prim_func
    def main(
        x: T.Tensor((N,), "float32"),
        y: T.Tensor((N,), "float32"),
    ):
        with T.Kernel(N, threads=32) as pid:
            # Explicitly request read-only cache load for x[pid]
            y[pid] = T.__ldg(x[pid]) + 1.0

    # Compile for CUDA and retrieve generated CUDA source
    kernel = tilelang.compile(main, out_idx=[1], target="cuda")
    src = kernel.get_kernel_source()
    print(src)
    # Assert that codegen uses __ldg on CUDA backend
    # We look for the intrinsic call with address-of argument
    assert "__ldg(" in src, "Expected __ldg call in generated CUDA source"
    assert "__ldg(&" in src or "__ldg(&(" in src, "Expected address-of form in __ldg call"
Comment on lines 6 to 27
Contributor

Avoid the unconditional print(src); keep it commented out for debugging.

Suggested change
-    print(src)
+    # print(src)  # uncomment for debugging
if __name__ == "__main__":
    tilelang.testing.main()
@@ -96,6 +96,7 @@
)
from .logical import any_of, all_of  # noqa: F401
from .builtin import *  # noqa: F401
from .builtin import __ldg as __ldg  # noqa: F401
Contributor

Keep the explicit import, but drop the unused # noqa: F401 directive; Ruff reports it as an unused noqa (RUF100).

Suggested change
-from .builtin import __ldg as __ldg  # noqa: F401
+from .builtin import __ldg as __ldg

🧰 Tools
🪛 Ruff (0.14.8)
99-99: Unused noqa directive. Remove unused noqa directive. (RUF100)
from .utils import index_to_coordinates  # noqa: F401

@@ -59,6 +59,35 @@ def create_list_of_mbarrier(*args: Any) -> Call:
    raise TypeError("create_list_of_mbarrier expects a list or one or more arguments.")


def __ldg(load_or_buf: BufferLoad | tir.Buffer, index: PrimExpr | int | None = None) -> PrimExpr:
    """Explicitly load via CUDA read-only data cache.

    Prefer calling with a BufferLoad: `T.__ldg(x[i])` emits `__ldg(&x[i])` on CUDA.
    On non-CUDA backends, falls back to a regular load.

    Args:
        load_or_buf: A `BufferLoad` like `x[i]`, or a `Buffer`.
        index: Optional index when passing a `Buffer` directly.

    Returns:
        PrimExpr: The loaded value.
    """
    if isinstance(load_or_buf, BufferLoad):
        dtype = load_or_buf.dtype
        return tir.call_intrin(str(dtype), tir.op.Op.get("tl.__ldg"), load_or_buf)
    if isinstance(load_or_buf, tir.Buffer):
        if index is None:
            raise ValueError("T.__ldg(Buffer, index) requires an index when passing a Buffer.")
        idx = index
        if isinstance(index, (list, tuple)):
            if len(index) != 1:
                raise ValueError("T.__ldg currently supports 1D flattened indices.")
            idx = index[0]
        bl = BufferLoad(load_or_buf, [idx])
        return tir.call_intrin(str(load_or_buf.dtype), tir.op.Op.get("tl.__ldg"), bl)
    raise TypeError("T.__ldg expects a BufferLoad or a Buffer.")
Comment on lines +62 to +89
Contributor

Normalize the index to a PrimExpr before constructing the BufferLoad, so plain Python ints (and single-element lists/tuples) are converted explicitly:

 def __ldg(load_or_buf: BufferLoad | tir.Buffer, index: PrimExpr | int | None = None) -> PrimExpr:
 @@
     if isinstance(load_or_buf, tir.Buffer):
         if index is None:
             raise ValueError("T.__ldg(Buffer, index) requires an index when passing a Buffer.")
-        idx = index
+        idx = index
         if isinstance(index, (list, tuple)):
             if len(index) != 1:
                 raise ValueError("T.__ldg currently supports 1D flattened indices.")
             idx = index[0]
-        bl = BufferLoad(load_or_buf, [idx])
+        bl = BufferLoad(load_or_buf, [convert(idx)])
         return tir.call_intrin(str(load_or_buf.dtype), tir.op.Op.get("tl.__ldg"), bl)
     raise TypeError("T.__ldg expects a BufferLoad or a Buffer.")

🧰 Tools
🪛 Ruff (0.14.8)
80-80: Avoid specifying long messages outside the exception class (TRY003)
84-84: Avoid specifying long messages outside the exception class (TRY003)
88-88: Avoid specifying long messages outside the exception class (TRY003)
def get_mbarrier(*args):
    """Retrieve a memory barrier operation.
🧩 Analysis chain
🌐 Web query: CUDA __ldg intrinsic supported data types element size constraints

💡 Result:
Short answer — native CUDA overloads of __ldg exist only for scalar and small vector built-ins; other types must be handled via a template wrapper (or cast) and the hardware performs loads in natural sizes (1/2/4/8 and up to 16‑byte transactions). Native overload list and constraints:
Native CUDA __ldg overloads (built‑in types): char, short, int, long long, unsigned char, unsigned short, unsigned int, unsigned long long, float, double and the vector variants int2/int4, uint2/uint4, float2/float4, double2. For other/user types use a template wrapper (e.g., template <typename T> __device__ T ldg(const T *ptr) that calls __ldg for supported types and falls back to *ptr otherwise). [1]
Element/transaction size: GPU memory transactions ordinarily use 1, 2, 4, 8 byte accesses and can coalesce up to 16 bytes per thread (so user types should be sized and aligned to match 4/8/16‑byte boundaries for best behavior). __ldg maps to cacheable global loads (ld.global.nc) introduced on sm_35 and later. [2][3]
If you want exact behavior for a custom type, use the template overload approach (or memcpy/cast to a supported vector type of matching size/alignment) and ensure sizes are a multiple of 4 bytes (or aligned to natural alignment) to avoid unexpected misalignment/coalescing penalties. [1][2]
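To make that template-wrapper idea concrete, here is a minimal sketch for sm_35+ device code; the wrapper name ldg_or_load and the handful of overloads shown are illustrative, not part of this PR.

// Generic fallback: an ordinary global load for types with no native __ldg overload.
template <typename T>
__device__ __forceinline__ T ldg_or_load(const T *ptr) {
  return *ptr;
}

// Overloads for a few natively supported types route through the read-only
// data cache (compiles to ld.global.nc on sm_35 and later).
__device__ __forceinline__ float ldg_or_load(const float *ptr) { return __ldg(ptr); }
__device__ __forceinline__ double ldg_or_load(const double *ptr) { return __ldg(ptr); }
__device__ __forceinline__ float4 ldg_or_load(const float4 *ptr) { return __ldg(ptr); }

Call sites then use ldg_or_load(&x[i]) regardless of element type, and only the supported types take the cached path.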
Remove or correct the "32-bit element types" constraint in the documentation.
The doc claims __ldg is for "32-bit element types on supported architectures," but CUDA's native __ldg overloads actually support 8, 16, 32, and 64-bit scalar types (char, short, int, long long, float, double) plus vector variants (int2/4, uint2/4, float2/4, double2). More importantly, the CUDA codegen in src/target/codegen_cuda.cc does not validate the element type at all; it will emit __ldg for any dtype, including unsupported ones like fp16 or custom structs. Either enforce type validation against CUDA's actual supported types or update the doc to remove the "32-bit" restriction.
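A minimal sketch of the validation option, assuming TVM's DataType helpers already used elsewhere in this diff; the exact accepted set and the placement inside the tl::__ldg branch are illustrative, not a committed design.

// Reject dtypes with no native __ldg overload. This covers only the scalar
// part of the overload list above; vector variants (float2/float4, int2/int4,
// ...) would need additional cases.
DataType t = op->dtype;
bool ok_int = (t.is_int() || t.is_uint()) &&
              (t.bits() == 8 || t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
bool ok_float = t.is_float() && (t.bits() == 32 || t.bits() == 64);
ICHECK(t.is_scalar() && (ok_int || ok_float))
    << "T.__ldg: unsupported element type " << t
    << "; native __ldg overloads cover 8/16/32/64-bit integers and 32/64-bit floats.";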