diff --git a/BUILD b/BUILD
new file mode 100644
index 000000000000..d4b988f04058
--- /dev/null
+++ b/BUILD
@@ -0,0 +1,628 @@
+# This package imports OpenAI's Triton (https://github.com/openai/triton).
+#
+# There are currently two versions of Triton in google3. The older version
+# lives at //third_party/py/triton; this package is the MLIR-based version,
+# which closely tracks upstream head. We expect to transition users to this
+# version in the coming weeks.
+#
+# There is no SLA associated with this package, and it may be broken by LLVM
+# imports at any time.
+
+load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+# copybara:uncomment load("//tools/build_defs/license:license.bzl", "license")
+load("//:triton.bzl", "if_not_msvc")
+
+package(
+    # copybara:uncomment_begin
+    # default_applicable_licenses = [":license"],
+    # default_compatible_with = ["//buildenv/target:gce"],
+    # default_visibility = [
+    #     "//third_party/tensorflow/compiler/xla:__subpackages__",
+    #     "//third_party/triton:__subpackages__",
+    # ],
+    # copybara:uncomment_end_and_comment_begin
+    default_visibility = ["//visibility:public"],
+    # copybara:comment_end
+    # TODO(csigg): fix and remove
+    features = [
+        "-parse_headers",
+        "-use_header_modules",
+    ],
+)
+
+# copybara:uncomment_begin
+# license(name = "license")
+#
+# licenses(["notice"])
+#
+# exports_files(["LICENSE"])
+# copybara:uncomment_end
+
+config_setting(
+    name = "compiler_is_msvc",
+    flag_values = {
+        "@bazel_tools//tools/cpp:compiler": "msvc-cl",
+    },
+)
+
+td_library(
+    name = "td_files",
+    srcs = glob(["include/triton/**/*.td"]),
+    includes = ["include"],
+    deps = [
+        "@llvm-project//mlir:ArithOpsTdFiles",
+        "@llvm-project//mlir:CastInterfacesTdFiles",
+        "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
+        "@llvm-project//mlir:DestinationStyleOpInterfaceTdFiles",
+        "@llvm-project//mlir:FunctionInterfacesTdFiles",
+        "@llvm-project//mlir:InferTypeOpInterfaceTdFiles",
+        "@llvm-project//mlir:OpBaseTdFiles",
+        "@llvm-project//mlir:PassBaseTdFiles",
+        "@llvm-project//mlir:SideEffectInterfacesTdFiles",
+        "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
+    ],
+)
+
+gentbl_cc_library(
+    name = "triton_dialect_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-dialect-decls"],
+            "include/triton/Dialect/Triton/IR/Dialect.h.inc",
+        ),
+        (
+            ["--gen-dialect-defs"],
+            "include/triton/Dialect/Triton/IR/Dialect.cpp.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "include/triton/Dialect/Triton/IR/TritonDialect.td",
+    deps = ["td_files"],
+)
+
+gentbl_cc_library(
+    name = "triton_ops_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-enum-decls"],
+            "include/triton/Dialect/Triton/IR/OpsEnums.h.inc",
+        ),
+        (
+            ["--gen-enum-defs"],
+            "include/triton/Dialect/Triton/IR/OpsEnums.cpp.inc",
+        ),
+        (
+            ["--gen-op-decls"],
+            "include/triton/Dialect/Triton/IR/Ops.h.inc",
+        ),
+        (
+            ["--gen-op-defs"],
+            "include/triton/Dialect/Triton/IR/Ops.cpp.inc",
+        ),
+        (
+            ["--gen-typedef-decls"],
+            "include/triton/Dialect/Triton/IR/Types.h.inc",
+        ),
+        (
+            ["--gen-typedef-defs"],
+            "include/triton/Dialect/Triton/IR/Types.cpp.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "include/triton/Dialect/Triton/IR/TritonOps.td",
+    deps = ["td_files"],
+)
+
+gentbl_cc_library(
+    name = "triton_interfaces_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-attr-interface-decls"],
+            "include/triton/Dialect/Triton/IR/AttrInterfaces.h.inc",
+        ),
+        (
+            ["--gen-attr-interface-defs"],
+            "include/triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc",
+        ),
+    ],
+    tblgen =
"@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/Triton/IR/TritonInterfaces.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_transforms_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=Triton", + ], + "include/triton/Dialect/Triton/Transforms/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/Triton/Transforms/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_combine_inc_gen", + # The generated file is #included without relative path. + strip_include_prefix = "lib/Dialect/Triton/Transforms", + tbl_outs = [ + ( + ["--gen-rewriters"], + "lib/Dialect/Triton/Transforms/TritonCombine.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "lib/Dialect/Triton/Transforms/Combine.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_dialect_inc_gen", + tbl_outs = [ + ( + ["--gen-dialect-decls"], + "include/triton/Dialect/TritonGPU/IR/Dialect.h.inc", + ), + ( + ["--gen-dialect-defs"], + "include/triton/Dialect/TritonGPU/IR/Dialect.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_ops_inc_gen", + tbl_outs = [ + ( + ["--gen-op-decls"], + "include/triton/Dialect/TritonGPU/IR/Ops.h.inc", + ), + ( + ["--gen-op-defs"], + "include/triton/Dialect/TritonGPU/IR/Ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_attr_inc_gen", + tbl_outs = [ + ( + ["--gen-attrdef-decls"], + "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc", + ), + ( + ["--gen-attrdef-defs"], + "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_transforms_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonGPU", + ], + "include/triton/Dialect/TritonGPU/Transforms/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/Transforms/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonGPUToLLVM", + ], + "include/triton/Conversion/TritonGPUToLLVM/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Conversion/TritonGPUToLLVM/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_conversion_triton_to_triton_gpu_passes_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonToTritonGPU", + ], + "include/triton/Conversion/TritonToTritonGPU/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Conversion/TritonToTritonGPU/Passes.td", + deps = ["td_files"], +) + +cc_library( + name = "TritonAnalysis", + srcs = glob(["lib/Analysis/*.cpp"]), + hdrs = glob(["include/triton/Analysis/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":triton_gpu_attr_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:FuncDialect", + 
"@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + ], +) + +cc_library( + name = "TritonDialect", + srcs = glob(["lib/Dialect/Triton/IR/*.cpp"]), + hdrs = glob(["include/triton/Dialect/Triton/IR/*.h"]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":triton_dialect_inc_gen", + ":triton_interfaces_inc_gen", + ":triton_ops_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + ], +) + +cc_library( + name = "TritonTransforms", + srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]), + hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":triton_combine_inc_gen", + ":triton_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], + alwayslink = True, # TritonDialect uses getCanonicalizationPatterns(). +) + +cc_library( + name = "TritonGPUDialect", + srcs = glob(["lib/Dialect/TritonGPU/IR/*.cpp"]), + hdrs = glob([ + "include/triton/Analysis/*.h", + "include/triton/Dialect/TritonGPU/IR/*.h", + ]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":triton_gpu_attr_inc_gen", + ":triton_gpu_dialect_inc_gen", + ":triton_gpu_ops_inc_gen", + ":triton_gpu_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:DestinationStyleOpInterface", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonGPUTransforms", + srcs = glob([ + "lib/Dialect/TritonGPU/Transforms/*.cpp", + "lib/Dialect/TritonGPU/Transforms/*.h", + ]), + hdrs = glob(["include/triton/Dialect/TritonGPU/Transforms/*.h"]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":triton_gpu_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonGPUToLLVM", + srcs = glob([ + "lib/Conversion/TritonGPUToLLVM/*.h", + 
"lib/Conversion/TritonGPUToLLVM/**/*.cpp", + ]) + [ + "include/triton/Conversion/MLIRTypes.h", + ], + hdrs = glob([ + "include/triton/Tools/Sys/*.hpp", + "include/triton/Conversion/TritonGPUToLLVM/*.h", + ]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = [ + "include", + "lib/Conversion/TritonGPUToLLVM", + ], + deps = [ + ":TritonAnalysis", + ":TritonDialect", + ":TritonGPUDialect", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ArithToLLVM", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowToLLVM", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUToROCDLTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:IndexDialect", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:MathToLLVM", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLDialect", + "@llvm-project//mlir:SCFToControlFlow", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonToTritonGPU", + srcs = glob([ + "lib/Conversion/TritonToTritonGPU/*.h", + "lib/Conversion/TritonToTritonGPU/*.cpp", + ]), + hdrs = glob(["include/triton/Conversion/TritonToTritonGPU/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:IndexDialect", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonLLVMIR", + srcs = glob([ + "lib/Target/LLVMIR/*.cpp", + ]) + [ + "include/triton/Tools/Sys/GetEnv.hpp", + ], + hdrs = glob(["include/triton/Target/LLVMIR/*.h"]), + includes = ["include"], + deps = [ + ":TritonGPUToLLVM", + ":TritonTransforms", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:IRReader", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:BuiltinToLLVMIRTranslation", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:NVVMToLLVMIRTranslation", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLToLLVMIRTranslation", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:Transforms", + # copybara:uncomment_begin + # "//third_party/py/triton/google:find_cuda", + # copybara:uncomment_end + ], +) + +cc_library( + name = "TritonPTX", + srcs = glob([ + "lib/Target/PTX/*.cpp", + ]), + hdrs = glob(["include/triton/Target/PTX/*.h"]), + includes = ["include"], + deps = [ + ":TritonLLVMIR", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + ], +) + 
+cc_library( + name = "TritonHSACO", + srcs = glob([ + "lib/Target/HSACO/*.cpp", + ]), + hdrs = glob(["include/triton/Target/HSACO/*.h"]), + includes = ["include"], + deps = [ + ":TritonLLVMIR", + ":TritonTools", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:ExecutionEngine", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Scalar", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + ], +) + +cc_library( + name = "TritonTools", + hdrs = ["include/triton/Tools/Sys/GetEnv.hpp"], + includes = ["include"], +) + +cc_binary( + name = "triton-opt", + srcs = [ + "bin/RegisterTritonDialects.h", + "bin/triton-opt.cpp", + "include/triton/Conversion/TritonGPUToLLVM/Passes.h", + "include/triton/Conversion/TritonToTritonGPU/Passes.h", + ], + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUToLLVM", + ":TritonGPUTransforms", + ":TritonToTritonGPU", + ":TritonTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:ir_headers", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + # copybara:uncomment "//third_party/triton/test:TritonTestAnalysis", + ], +) + +cc_binary( + name = "triton-translate", + srcs = [ + "bin/triton-translate.cpp", + "include/triton/Conversion/TritonGPUToLLVM/Passes.h", + "include/triton/Conversion/TritonToTritonGPU/Passes.h", + ], + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUToLLVM", + ":TritonGPUTransforms", + ":TritonHSACO", + ":TritonLLVMIR", + ":TritonPTX", + ":TritonToTritonGPU", + ":TritonTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:ir_headers", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + ], +) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index 7c52e99b8aaf..11598fcd44b2 100644 --- 
a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -93,6 +93,11 @@ bool expensiveLoadOrStore(Operation *op, Attribute &targetEncoding) {
   // same
   if (isSingleValue(op->getOperand(0)))
     return false;
+  // TODO(manany): Investigate with OpenAI why the change in
+  // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a
+  // (which introduces Case 2 below) breaks
+  // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd
+  return true;
   // Case 2: Tensor of pointers has more threads than elements
   // we can presume a high hit-rate that makes it cheap to load
   auto ptrType = op->getOperand(0).getType().cast<RankedTensorType>();
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index 6431b6ae8d89..bc29f18ec773 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
   auto *shortPtr =
       static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(shortPtr);
-  shortPtr->setValue(true);
+  shortPtr->setValue(false);
   std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc);
   // max PTX version
   int ptxMajor = maxPTX / 10;
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
deleted file mode 100644
index 04da9b52953e..000000000000
--- a/python/MANIFEST.in
+++ /dev/null
@@ -1,4 +0,0 @@
-graft src
-graft triton/third_party
-graft triton/runtime/backends/
-graft triton/language/extra
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/python/examples/copy_strided.py b/python/examples/copy_strided.py
deleted file mode 100644
index 34cf12630205..000000000000
--- a/python/examples/copy_strided.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import triton
-import triton.language as tl
-
-
-# triton kernel
-@triton.jit
-def kernel(X, stride_xm,
-           Z, stride_zn,
-           BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
-    off_m = tl.arange(0, BLOCK_M)
-    off_n = tl.arange(0, BLOCK_N)
-    Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * 1
-    Zs = Z + off_m[:, None] * 1 + off_n[None, :] * stride_zn
-    tl.store(Zs, tl.load(Xs))
-
-
-ret = triton.compile(kernel, signature="*fp32,i32,*fp32,i32", constants={"BLOCK_M": 64, "BLOCK_N": 64})
-print(ret.asm["ttgir"])
diff --git a/python/examples/empty.py b/python/examples/empty.py
deleted file mode 100644
index df313fb85869..000000000000
--- a/python/examples/empty.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.jit
-def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr):
-    pass
-
-
-X = torch.randn(1, device="cuda")
-pgm = kernel[(1,)](X, 1, 1, BLOCK=1024)
diff --git a/python/pyproject.toml b/python/pyproject.toml
deleted file mode 100644
index 6430c0c154dc..000000000000
--- a/python/pyproject.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-[build-system]
-requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18"]
-
-[tool.autopep8]
-aggressive = 1
-ignore = "E501,E701,E731,W690"
-max_line_length = 88
diff --git a/python/setup.py b/python/setup.py
deleted file mode 100644
index 68c6cfee9c5c..000000000000
--- a/python/setup.py
+++ /dev/null
@@ -1,305 +0,0 @@
-import os
-import platform
-import re
-import shutil
-import subprocess
-import sys
-import sysconfig
-import tarfile
-import tempfile
-import urllib.request
-from pathlib import Path
-from typing import NamedTuple
-
-from setuptools import Extension, setup
-from
setuptools.command.build_ext import build_ext - - -# Taken from https://github.com/pytorch/pytorch/blob/master/tools/setup_helpers/env.py -def check_env_flag(name: str, default: str = "") -> bool: - return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] - - -def get_build_type(): - if check_env_flag("DEBUG"): - return "Debug" - elif check_env_flag("REL_WITH_DEB_INFO"): - return "RelWithDebInfo" - elif check_env_flag("TRITON_REL_BUILD_WITH_ASSERTS"): - return "TritonRelBuildWithAsserts" - else: - # TODO: change to release when stable enough - return "TritonRelBuildWithAsserts" - -# --- third party packages ----- - - -class Package(NamedTuple): - package: str - name: str - url: str - include_flag: str - lib_flag: str - syspath_var_name: str - -# pybind11 - - -def get_pybind11_package_info(): - name = "pybind11-2.10.0" - url = "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" - return Package("pybind11", name, url, "PYBIND11_INCLUDE_DIR", "", "PYBIND11_SYSPATH") - -# llvm - - -def get_llvm_package_info(): - # download if nothing is installed - system = platform.system() - if system == "Darwin": - system_suffix = "apple-darwin" - elif system == "Linux": - vglibc = tuple(map(int, platform.libc_ver()[1].split('.'))) - vglibc = vglibc[0] * 100 + vglibc[1] - linux_suffix = 'ubuntu-18.04' if vglibc > 217 else 'centos-7' - system_suffix = f"linux-gnu-{linux_suffix}" - else: - return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") - use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False") - release_suffix = "assert" if use_assert_enabled_llvm else "release" - name = f'llvm+mlir-17.0.0-x86_64-{system_suffix}-{release_suffix}' - version = "llvm-17.0.0-c5dede880d17" - url = f"https://github.com/ptillet/triton-llvm-releases/releases/download/{version}/{name}.tar.xz" - return Package("llvm", name, url, "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") - - -def get_thirdparty_packages(triton_cache_path): - packages = [get_pybind11_package_info(), get_llvm_package_info()] - thirdparty_cmake_args = [] - for p in packages: - package_root_dir = os.path.join(triton_cache_path, p.package) - package_dir = os.path.join(package_root_dir, p.name) - if p.syspath_var_name in os.environ: - package_dir = os.environ[p.syspath_var_name] - version_file_path = os.path.join(package_dir, "version.txt") - if p.syspath_var_name not in os.environ and\ - (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url): - try: - shutil.rmtree(package_root_dir) - except Exception: - pass - os.makedirs(package_root_dir, exist_ok=True) - print(f'downloading and extracting {p.url} ...') - ftpstream = urllib.request.urlopen(p.url) - file = tarfile.open(fileobj=ftpstream, mode="r|*") - file.extractall(path=package_root_dir) - # write version url to package_dir - with open(os.path.join(package_dir, "version.txt"), "w") as f: - f.write(p.url) - if p.include_flag: - thirdparty_cmake_args.append(f"-D{p.include_flag}={package_dir}/include") - if p.lib_flag: - thirdparty_cmake_args.append(f"-D{p.lib_flag}={package_dir}/lib") - return thirdparty_cmake_args - -# ---- package data --- - - -def download_and_copy_ptxas(): - base_dir = os.path.dirname(__file__) - src_path = "bin/ptxas" - version = "12.1.105" - url = f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-64/cuda-nvcc-{version}-0.tar.bz2" - dst_prefix = os.path.join(base_dir, "triton") - dst_suffix = os.path.join("third_party", "cuda", 
src_path) - dst_path = os.path.join(dst_prefix, dst_suffix) - is_linux = platform.system() == "Linux" - download = False - if is_linux: - download = True - if os.path.exists(dst_path): - curr_version = subprocess.check_output([dst_path, "--version"]).decode("utf-8").strip() - curr_version = re.search(r"V([.|\d]+)", curr_version).group(1) - download = curr_version != version - if download: - print(f'downloading and extracting {url} ...') - ftpstream = urllib.request.urlopen(url) - file = tarfile.open(fileobj=ftpstream, mode="r|*") - with tempfile.TemporaryDirectory() as temp_dir: - file.extractall(path=temp_dir) - src_path = os.path.join(temp_dir, src_path) - os.makedirs(os.path.split(dst_path)[0], exist_ok=True) - shutil.copy(src_path, dst_path) - return dst_suffix - - -# ---- cmake extension ---- - - -class CMakeExtension(Extension): - def __init__(self, name, path, sourcedir=""): - Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) - self.path = path - - -class CMakeBuild(build_ext): - - user_options = build_ext.user_options + [('base-dir=', None, 'base directory of Triton')] - - def initialize_options(self): - build_ext.initialize_options(self) - self.base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) - - def finalize_options(self): - build_ext.finalize_options(self) - - def run(self): - try: - out = subprocess.check_output(["cmake", "--version"]) - except OSError: - raise RuntimeError( - "CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions) - ) - - match = re.search(r"version\s*(?P\d+)\.(?P\d+)([\d.]+)?", out.decode()) - cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor")) - if (cmake_major, cmake_minor) < (3, 18): - raise RuntimeError("CMake >= 3.18.0 is required") - - for ext in self.extensions: - self.build_extension(ext) - - def get_cmake_dir(self): - plat_name = sysconfig.get_platform() - python_version = sysconfig.get_python_version() - dir_name = f"cmake.{plat_name}-{sys.implementation.name}-{python_version}" - cmake_dir = Path(self.base_dir) / "python" / "build" / dir_name - cmake_dir.mkdir(parents=True, exist_ok=True) - return cmake_dir - - def build_extension(self, ext): - lit_dir = shutil.which('lit') - user_home = os.getenv("HOME") or os.getenv("USERPROFILE") or \ - os.getenv("HOMEPATH") or None - if not user_home: - raise RuntimeError("Could not find user home directory") - triton_cache_path = os.path.join(user_home, ".triton") - # lit is used by the test suite - thirdparty_cmake_args = get_thirdparty_packages(triton_cache_path) - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) - # create build directories - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - # python directories - python_include_dir = sysconfig.get_path("platinclude") - cmake_args = [ - "-DLLVM_ENABLE_WERROR=ON", - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, - "-DTRITON_BUILD_TUTORIALS=OFF", - "-DTRITON_BUILD_PYTHON_MODULE=ON", - "-DPython3_EXECUTABLE:FILEPATH=" + sys.executable, - "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", - "-DPYTHON_INCLUDE_DIRS=" + python_include_dir, - ] - if lit_dir is not None: - cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir) - cmake_args.extend(thirdparty_cmake_args) - - # configuration - cfg = get_build_type() - build_args = ["--config", cfg] - - if platform.system() == "Windows": - cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] - if sys.maxsize > 2**32: - 
cmake_args += ["-A", "x64"] - build_args += ["--", "/m"] - else: - cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] - max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count())) - build_args += ['-j' + max_jobs] - - if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"): - cmake_args += ["-DCMAKE_C_COMPILER=clang", - "-DCMAKE_CXX_COMPILER=clang++", - "-DCMAKE_LINKER=lld", - "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"] - - env = os.environ.copy() - cmake_dir = self.get_cmake_dir() - subprocess.check_call(["cmake", self.base_dir] + cmake_args, cwd=cmake_dir, env=env) - subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir) - - -download_and_copy_ptxas() - - -setup( - name="triton", - version="2.1.0", - author="Philippe Tillet", - author_email="phil@openai.com", - description="A language and compiler for custom Deep Learning operations", - long_description="", - packages=[ - "triton", - "triton/_C", - "triton/common", - "triton/compiler", - "triton/debugger", - "triton/language", - "triton/language/extra", - "triton/ops", - "triton/ops/blocksparse", - "triton/runtime", - "triton/runtime/backends", - "triton/third_party/cuda/bin", - "triton/third_party/cuda/include", - "triton/third_party/cuda/lib", - "triton/tools", - ], - install_requires=[ - "filelock", - ], - include_package_data=True, - ext_modules=[CMakeExtension("triton", "triton/_C/")], - cmdclass={"build_ext": CMakeBuild}, - zip_safe=False, - # for PyPI - keywords=["Compiler", "Deep Learning"], - url="https://github.com/openai/triton/", - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Topic :: Software Development :: Build Tools", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - ], - test_suite="tests", - extras_require={ - "build": [ - "cmake>=3.18", - "lit", - ], - "tests": [ - "autopep8", - "flake8", - "isort", - "numpy", - "pytest", - "scipy>=1.7.1", - ], - "tutorials": [ - "matplotlib", - "pandas", - "tabulate", - ], - }, -) diff --git a/python/src/extra/cuda.ll b/python/src/extra/cuda.ll deleted file mode 100644 index 0ab2f6896bdd..000000000000 --- a/python/src/extra/cuda.ll +++ /dev/null @@ -1,17 +0,0 @@ -; ~/.triton/llvm/llvm+mlir-17.0.0-x86_64-linux-gnu-ubuntu-18.04-release/bin/llvm-as ./src/extra/cuda.ll -o ./triton/language/extra/cuda.bc - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "nvptx64-nvidia-cuda" - - -define i64 @globaltimer() #0 { - %1 = call i64 asm sideeffect "mov.u64 $0, %globaltimer;", "=l"() nounwind - ret i64 %1 -} - -define i32 @smid() #0 { - %1 = call i32 asm "mov.u32 $0, %smid;", "=r"() nounwind - ret i32 %1 -} - -attributes #0 = { alwaysinline nounwind } diff --git a/python/src/main.cc b/python/src/main.cc deleted file mode 100644 index 801a83a4b19f..000000000000 --- a/python/src/main.cc +++ /dev/null @@ -1,11 +0,0 @@ -#include - -void init_superblocking(pybind11::module &m); -void init_torch_utils(pybind11::module &m); -void init_triton(pybind11::module &m); -void init_cutlass(pybind11::module &m); - -PYBIND11_MODULE(libtriton, m) { - m.doc() = "Python bindings to the C++ Triton API"; - init_triton(m); -} diff --git a/python/src/triton.cc b/python/src/triton.cc deleted file mode 100644 index 
66c112d09bc0..000000000000 --- a/python/src/triton.cc +++ /dev/null @@ -1,1703 +0,0 @@ -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Verifier.h" - -#include "mlir/Conversion/Passes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/Passes.h" - -#include "mlir/Parser/Parser.h" -#include "mlir/Support/FileUtilities.h" - -#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" -#include "mlir/Dialect/Index/IR/IndexDialect.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "triton/Analysis/Allocation.h" -#include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" -#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" -#include "triton/Dialect/Triton/IR/Dialect.h" -#include "triton/Dialect/Triton/IR/Types.h" -#include "triton/Dialect/Triton/Transforms/Passes.h" -#include "triton/Dialect/TritonGPU/Transforms/Passes.h" -#include "triton/Target/HSACO/HSACOTranslation.h" -#include "triton/Target/LLVMIR/LLVMIRTranslation.h" -#include "triton/Target/PTX/PTXTranslation.h" -#include "triton/Tools/Sys/GetEnv.hpp" -#include "triton/Tools/Sys/GetPlatform.hpp" - -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/FileUtilities.h" -#include "llvm/Support/raw_ostream.h" - -#include "llvm/Support/SourceMgr.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -enum backend_t { - HOST, - CUDA, - ROCM, -}; - -void init_triton_runtime(py::module &&m) { - // wrap backend_t - py::enum_(m, "backend") - .value("HOST", HOST) - .value("CUDA", CUDA) - .value("ROCM", ROCM) - .export_values(); -} - -/*****************************************************************************/ -/* Python bindings for triton::ir */ -/*****************************************************************************/ - -void init_triton_ir(py::module &&m) { - using ret = py::return_value_policy; - using namespace pybind11::literals; - - py::enum_(m, "PADDING_OPTION") - .value("PAD_ZERO", mlir::triton::PaddingOption::PAD_ZERO) - .value("PAD_NAN", mlir::triton::PaddingOption::PAD_NAN) - .export_values(); - - py::enum_(m, "CACHE_MODIFIER") - .value("NONE", mlir::triton::CacheModifier::NONE) - .value("CA", mlir::triton::CacheModifier::CA) - .value("CG", mlir::triton::CacheModifier::CG) - .export_values(); - - py::enum_(m, "EVICTION_POLICY") - .value("NORMAL", mlir::triton::EvictionPolicy::NORMAL) - .value("EVICT_FIRST", mlir::triton::EvictionPolicy::EVICT_FIRST) - .value("EVICT_LAST", mlir::triton::EvictionPolicy::EVICT_LAST) - .export_values(); - - py::enum_(m, "ATOMIC_OP") - .value("ADD", mlir::triton::RMWOp::ADD) - .value("FADD", mlir::triton::RMWOp::FADD) - .value("AND", mlir::triton::RMWOp::AND) - .value("OR", mlir::triton::RMWOp::OR) - .value("XOR", mlir::triton::RMWOp::XOR) - .value("XCHG", mlir::triton::RMWOp::XCHG) - .value("MAX", mlir::triton::RMWOp::MAX) - .value("MIN", mlir::triton::RMWOp::MIN) - .value("UMIN", mlir::triton::RMWOp::UMIN) - .value("UMAX", mlir::triton::RMWOp::UMAX); - - py::class_(m, "context") - .def(py::init<>()) - .def("load_triton", [](mlir::MLIRContext &self) { - self.getOrLoadDialect(); - self.getOrLoadDialect(); - self.getOrLoadDialect(); - 
self.getOrLoadDialect(); - // we load LLVM because the frontend uses LLVM.undef for - // some placeholders - self.getOrLoadDialect(); - }); - // .def(py::init([](){ - // mlir::MLIRContext context; - // context.getOrLoadDialect(); - // // TODO: should we return a (raw/unique) pointer here? - // return context; - // })); - - // py::class_(m, "value") - // .def("multiple_of", [](ir::value *self, int val) { - // if (auto *instr = dynamic_cast(self)) { - // instr->set_metadata(ir::metadata::multiple_of, val); - // } else - // throw std::runtime_error("multiple_of"); - // }) - // .def("max_contiguous", [](ir::value *self, int val) { - // if (auto *instr = dynamic_cast(self)) { - // instr->set_metadata(ir::metadata::max_contiguous, val); - // } else - // throw std::runtime_error("max_contiguous"); - // }) - // .def("set_fdiv_ieee_rounding", [](ir::value *self, bool val) { - // if (auto *instr = dynamic_cast(self)) - // instr->set_fdiv_ieee_rounding(val); - // else - // throw std::runtime_error("set_fdiv_ieee_rounding"); - // }) - // .def("ops", [](ir::value *self) { - // if (auto *instr = dynamic_cast(self)) { - // return instr->ops(); - // } - // throw std::runtime_error("cannot use ops()"); - // }) - // .def("replace_all_uses_with", &ir::value::replace_all_uses_with) - // .def("erase_from_parent", [](ir::value *self) { - // if (auto *instr = dynamic_cast(self)) - // return instr->erase_from_parent(); - // throw std::runtime_error("cannot use erase_from_parent"); - // }) - // .def_property("name", &ir::value::get_name, &ir::value::set_name) - // .def_property_readonly("type", &ir::value::get_type); - - // // // Do we need under in TritonIR ? - // // py::class_(m, "undef") - // // .def("get", &ir::undef_value::get, ret::reference); - - py::class_(m, "type") - .def("is_integer", &mlir::Type::isInteger) - .def("is_fp16", &mlir::Type::isF16) - .def("__str__", [](mlir::Type &self) { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return os.str(); - }); - - py::class_(m, "function_type") - .def("param_types", [](mlir::FunctionType &self) { - return std::vector(self.getInputs().begin(), - self.getInputs().end()); - }); - - py::class_(m, "value") - .def("set_attr", - [](mlir::Value &self, std::string &name, - mlir::Attribute &attr) -> void { - if (mlir::Operation *definingOp = self.getDefiningOp()) - definingOp->setAttr(name, attr); - else { - auto arg = self.cast(); - int id = arg.getArgNumber(); - std::string attrName = name + "_arg" + std::to_string(id); - mlir::Block *owner = arg.getOwner(); - if (owner->isEntryBlock() && - !mlir::isa(owner->getParentOp())) { - owner->getParentOp()->setAttr(attrName, attr); - } - } - }) - .def("get_context", &mlir::Value::getContext) - .def("replace_all_uses_with", - [](mlir::Value &self, mlir::Value &newValue) { - self.replaceAllUsesWith(newValue); - }) - .def("get_type", &mlir::Value::getType); - - py::class_(m, "block_argument"); - - py::class_(m, "region") - .def("get_parent_region", &mlir::Region::getParentRegion, ret::reference) - .def("size", [](mlir::Region &self) { return self.getBlocks().size(); }) - .def("empty", &mlir::Region::empty); - - py::class_(m, "block") - .def("arg", - [](mlir::Block &self, int index) -> mlir::BlockArgument { - return self.getArgument(index); - }) - .def("add_argument", - [](mlir::Block &self, mlir::Type ty) { - auto loc = mlir::UnknownLoc::get(ty.getContext()); - self.addArgument(ty, loc); - }) - .def("get_num_arguments", &mlir::Block::getNumArguments) - .def("dump", &mlir::Block::dump) - 
.def("move_before", &mlir::Block::moveBefore) - .def("insert_before", &mlir::Block::insertBefore) - .def("get_parent", &mlir::Block::getParent, ret::reference) - .def("merge_block_before", - [](mlir::Block &self, mlir::Block &dst) { - // ref: RewriterBase::mergeBlocks() - if (self.getNumArguments() != 0) - throw std::runtime_error( - "This block has arguments, don't merge"); - dst.getOperations().splice(dst.begin(), self.getOperations()); - self.dropAllUses(); - self.erase(); - }) - .def("replace_use_in_block_with", - [](mlir::Block &self, mlir::Value &v, mlir::Value &newVal) { - v.replaceUsesWithIf(newVal, [&](mlir::OpOperand &operand) { - mlir::Operation *user = operand.getOwner(); - mlir::Block *currentBlock = user->getBlock(); - while (currentBlock) { - if (currentBlock == &self) - return true; - // Move up one level - currentBlock = - currentBlock->getParent()->getParentOp()->getBlock(); - } - return false; - }); - }) - .def("__str__", - [](mlir::Block &self) { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return str; - }) - .def("has_terminator", - [](mlir::Block &self) { - return !self.empty() && - self.back().hasTrait(); - }) - .def("has_return", - [](mlir::Block &self) { - return !self.empty() && - self.back().hasTrait(); - }) - .def("erase", [](mlir::Block &self) { self.erase(); }); - - // using eattr = ir::attribute_kind_t; - // py::enum_(m, "attribute_kind") - // .value("readonly", eattr::readonly) - // .value("writeonly", eattr::writeonly) - // .value("noalias", eattr::noalias) - // .value("aligned", eattr::aligned) - // .value("multiple_of", eattr::multiple_of) - // .value("retune", eattr::retune) - // .value("not_implemented", eattr::not_implemented); - - py::class_(m, "attribute"); - py::class_(m, "integer_attr"); - py::class_(m, "bool_attr"); - - // Ops - py::class_(m, "OpState") - .def("set_attr", - [](mlir::OpState &self, std::string &name, - mlir::Attribute &attr) -> void { self->setAttr(name, attr); }) - .def( - "get_num_results", - [](mlir::OpState &self) -> unsigned { return self->getNumResults(); }) - .def("get_result", - [](mlir::OpState &self, unsigned idx) -> mlir::Value { - return self->getResult(idx); - }) - .def( - "get_region", - [](mlir::OpState &self, unsigned idx) -> mlir::Region & { - return self->getRegion(idx); - }, - ret::reference) - .def( - "get_body", - [](mlir::scf::ForOp &self, unsigned idx) -> mlir::Block * { - return self.getBody(idx); - }, - ret::reference) - .def("dump", [](mlir::OpState &self) { self->dump(); }) - .def("__str__", - [](mlir::OpState &self) -> std::string { - std::string str; - llvm::raw_string_ostream os(str); - self->print(os); - return str; - }) - .def("append_operand", - [](mlir::OpState &self, mlir::Value &val) { - self->insertOperands(self->getNumOperands(), val); - }) - .def("verify", [](mlir::OpState &self) -> bool { - return mlir::succeeded(mlir::verify(self.getOperation())); - }); - // scf Ops - py::class_(m, "ForOp") - .def("get_induction_var", &mlir::scf::ForOp::getInductionVar); - - py::class_(m, "IfOp") - .def("get_then_block", &mlir::scf::IfOp::thenBlock, ret::reference) - .def("get_else_block", &mlir::scf::IfOp::elseBlock, ret::reference) - .def("get_then_yield", &mlir::scf::IfOp::thenYield) - .def("get_else_yield", &mlir::scf::IfOp::elseYield); - py::class_(m, "YieldOp"); - py::class_(m, "WhileOp") - .def("get_before", &mlir::scf::WhileOp::getBefore, ret::reference) - .def("get_after", &mlir::scf::WhileOp::getAfter, ret::reference); - py::class_(m, "ConditionOp"); - - // 
dynamic_attr is used to transfer ownership of the MLIR context to the - // module - py::class_(m, "module", py::dynamic_attr()) - .def("dump", &mlir::ModuleOp::dump) - .def("str", - [](mlir::ModuleOp &self) -> std::string { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return str; - }) - .def("push_back", - [](mlir::ModuleOp &self, mlir::triton::FuncOp &funcOp) -> void { - self.push_back(funcOp); - }) - .def("has_function", - [](mlir::ModuleOp &self, std::string &funcName) -> bool { - if (self.lookupSymbol(funcName)) - return true; - return false; - }) - .def("get_function", - [](mlir::ModuleOp &self, - std::string &funcName) -> mlir::triton::FuncOp { - return self.lookupSymbol(funcName); - }) - .def("get_single_function", - [](mlir::ModuleOp &self) -> mlir::triton::FuncOp { - llvm::SmallVector funcs; - self.walk( - [&](mlir::triton::FuncOp func) { funcs.push_back(func); }); - if (funcs.size() != 1) - throw std::runtime_error("Expected a single function"); - return funcs[0]; - }); - - m.def("make_attr", - [](const std::vector &values, mlir::MLIRContext &context) { - return mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get( - {static_cast(values.size())}, - mlir::IntegerType::get(&context, 32)), - values) - .cast(); - }); - - m.def( - "parse_mlir_module", - [](const std::string &inputFilename, mlir::MLIRContext &context) { - // initialize registry - // note: we initialize llvm for undef - mlir::DialectRegistry registry; - registry.insert< - mlir::triton::TritonDialect, mlir::triton::gpu::TritonGPUDialect, - mlir::math::MathDialect, mlir::arith::ArithDialect, - mlir::index::IndexDialect, mlir::scf::SCFDialect, - mlir::cf::ControlFlowDialect, mlir::LLVM::LLVMDialect>(); - context.appendDialectRegistry(registry); - context.loadAllAvailableDialects(); - - // parse module - mlir::OwningOpRef module = - mlir::parseSourceFile(inputFilename, &context); - if (!module) - throw std::runtime_error("Parse MLIR file failed."); - // locations are incompatible with ptx < 7.5 ! - module->walk([](mlir::Operation *op) { - op->setLoc(mlir::UnknownLoc::get(op->getContext())); - }); - - return module->clone(); - }, - ret::take_ownership); - - py::class_(m, "function") - // .def_property_readonly("attrs", &ir::function::attrs) - // .def("add_attr", &ir::function::add_attr); - .def("args", - [](mlir::triton::FuncOp &self, unsigned idx) -> mlir::BlockArgument { - return self.getArgument(idx); - }) - .def( - "add_entry_block", - [](mlir::triton::FuncOp &self) -> mlir::Block * { - return self.addEntryBlock(); - }, - ret::reference) - .def( - "set_arg_attr", - [](mlir::triton::FuncOp &self, int arg_no, const std::string &name, - int val) { - // set arg attributes "name" to value "val" - auto attrTy = mlir::IntegerType::get(self.getContext(), 32); - self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val)); - }, - ret::reference) - .def("finalize", - [](mlir::triton::FuncOp &self) -> void { - // Remove dead code - // 1. 
Unreachable code after return - self.walk([&](mlir::Block *block) { - mlir::Operation *retOp = nullptr; - // It's better to not use walk here because we only want to - // check operations in the current block - for (auto &op : block->getOperations()) { - if (mlir::isa(op)) - if (retOp == nullptr) { - retOp = &op; - break; - } - } - if (retOp && retOp != &block->back()) { - auto pos = retOp->getIterator(); - pos++; - auto *newBlock = block->splitBlock(pos); - newBlock->erase(); - } - }); - }) - .def_property_readonly("type", &mlir::triton::FuncOp::getFunctionType) - .def("reset_type", &mlir::triton::FuncOp::setType); - - py::class_(m, "InsertPoint"); - - py::class_(m, "builder", py::dynamic_attr()) - .def(py::init()) - // // getters - .def_property_readonly("context", &mlir::OpBuilder::getContext, - ret::reference) - .def("create_module", - [](mlir::OpBuilder &self) -> mlir::ModuleOp { - auto loc = self.getUnknownLoc(); - return self.create(loc); - }) - .def("ret", - [](mlir::OpBuilder &self, std::vector &vals) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, vals); - }) - .def("call", - [](mlir::OpBuilder &self, mlir::triton::FuncOp &func, - std::vector &args) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - auto callOp = self.create(loc, func, args); - return callOp; - }) - // insertion block/point - .def("set_insertion_point_to_start", - [](mlir::OpBuilder &self, mlir::Block &block) -> void { - self.setInsertionPointToStart(&block); - }) - .def("set_insertion_point_to_end", - [](mlir::OpBuilder &self, mlir::Block &block) { - self.setInsertionPointToEnd(&block); - }) - .def("set_insertion_point_after", - [](mlir::OpBuilder &self, mlir::Operation &op) { - self.setInsertionPointAfter(&op); - }) - .def( - "get_insertion_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - return self.getInsertionBlock(); - }, - ret::reference) - .def("get_insertion_point", &mlir::OpBuilder::saveInsertionPoint) - .def("restore_insertion_point", &mlir::OpBuilder::restoreInsertionPoint) - // .def("set_insert_point", [](ir::builder *self, - // std::pair pt) { - // ir::basic_block *bb = pt.first; - // ir::instruction *instr = pt.second; - // if (instr) { - // if (bb != instr->get_parent()) - // throw std::runtime_error("invalid insertion point, instr not in - // bb"); - // self->set_insert_point(instr); - // } else { - // assert(bb); - // self->set_insert_point(bb); - // } - // }) - // Attr - .def("get_bool_attr", &mlir::OpBuilder::getBoolAttr) - .def("get_int32_attr", &mlir::OpBuilder::getI32IntegerAttr) - // Use arith.ConstantOp to create constants - // Constants - .def("get_int1", - [](mlir::OpBuilder &self, bool v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI1Type())); - }) - .def("get_int8", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI8Type())); - }) - .def("get_int16", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI16Type())); - }) - .def("get_int32", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI32Type())); - }) - .def("get_int64", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI64Type())); - }) - .def("get_bf16", - [](mlir::OpBuilder 
&self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto type = self.getBF16Type(); - return self.create( - loc, - mlir::APFloat(type.getFloatSemantics(), std::to_string(v)), - type); - }) - .def("get_fp16", - [](mlir::OpBuilder &self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF16FloatAttr(v)); - }) - .def("get_fp32", - [](mlir::OpBuilder &self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF32FloatAttr(v)); - }) - .def("get_fp64", - [](mlir::OpBuilder &self, double v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF64FloatAttr(v)); - }) - .def("get_null_value", - [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (auto floatTy = type.dyn_cast()) - return self.create( - loc, mlir::APFloat(floatTy.getFloatSemantics(), 0), floatTy); - else if (auto intTy = type.dyn_cast()) - return self.create(loc, 0, intTy); - else - throw std::runtime_error("Not implemented"); - }) - .def("get_all_ones_value", - [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - uint64_t val = 0xFFFFFFFFFFFFFFFF; - if (auto intTy = type.dyn_cast()) - return self.create(loc, val, intTy); - else - throw std::runtime_error("Not implemented"); - }) - - // Types - .def("get_void_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getNoneType(); - }) - .def("get_int1_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getI1Type(); - }) // or ret::copy? - .def("get_int8_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI8Type(); }) - .def("get_int16_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(16); - }) - .def( - "get_int32_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI32Type(); }) - .def( - "get_int64_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI64Type(); }) - .def("get_fp8e4_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(); - }) - .def("get_fp8e5_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(); - }) - .def( - "get_half_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF16Type(); }) - .def("get_bf16_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getBF16Type(); - }) - .def( - "get_float_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF32Type(); }) - .def( - "get_double_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF64Type(); }) - .def("get_ptr_ty", - [](mlir::OpBuilder &self, mlir::Type &type, - int addrSpace) -> mlir::Type { - return mlir::triton::PointerType::get(type, addrSpace); - }) - .def("get_block_ty", - [](mlir::OpBuilder &self, mlir::Type &elementType, - std::vector &shape) -> mlir::Type { - return mlir::RankedTensorType::get(shape, elementType); - }) - .def("get_function_ty", - [](mlir::OpBuilder &self, std::vector inTypes, - std::vector outTypes) -> mlir::Type { - return self.getFunctionType(inTypes, outTypes); - }) - - // Ops - .def("get_or_insert_function", - [](mlir::OpBuilder &self, mlir::ModuleOp &module, - std::string &funcName, mlir::Type &funcType, - std::string &visibility, bool noinline) -> mlir::triton::FuncOp { - if (mlir::Operation *funcOperation = module.lookupSymbol(funcName)) - return llvm::dyn_cast(funcOperation); - auto loc = self.getUnknownLoc(); - if (auto funcTy = funcType.dyn_cast()) { - llvm::SmallVector attrs = { - 
mlir::NamedAttribute(self.getStringAttr("sym_visibility"), - self.getStringAttr(visibility)), - mlir::NamedAttribute(self.getStringAttr("noinline"), - self.getBoolAttr(noinline))}; - return self.create(loc, funcName, funcTy, - attrs); - } - throw std::runtime_error("invalid function type"); - }) - .def( - "create_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - mlir::Region *parent = self.getBlock()->getParent(); - return self.createBlock(parent); - }, - ret::reference) - .def( - "create_block_with_parent", - [](mlir::OpBuilder &self, mlir::Region &parent, - std::vector &argTypes) -> mlir::Block * { - auto argLoc = self.getUnknownLoc(); - llvm::SmallVector argLocs(argTypes.size(), - argLoc); - return self.createBlock(&parent, {}, argTypes, argLocs); - }, - ret::reference) - .def( - "new_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - return new mlir::Block(); - }, - ret::reference) - // Unstructured control flow - .def("create_cond_branch", - [](mlir::OpBuilder &self, mlir::Value condition, - mlir::Block *trueDest, mlir::Block *falseDest) { - auto loc = self.getUnknownLoc(); - self.create(loc, condition, trueDest, - falseDest); - return; - }) - .def("create_branch", - [](mlir::OpBuilder &self, mlir::Block *dest, - std::vector &args) { - auto loc = self.getUnknownLoc(); - self.create(loc, dest, args); - return; - }) - // Structured control flow - .def("create_for_op", - [](mlir::OpBuilder &self, mlir::Value &lb, mlir::Value &ub, - mlir::Value &step, - std::vector &initArgs) -> mlir::scf::ForOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, lb, ub, step, initArgs); - }) - .def("create_if_op", - [](mlir::OpBuilder &self, std::vector &retTypes, - mlir::Value &condition, bool withElse) -> mlir::scf::IfOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, retTypes, condition, - withElse); - }) - .def("create_yield_op", - [](mlir::OpBuilder &self, - std::vector &yields) -> mlir::scf::YieldOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, yields); - }) - .def("create_while_op", - [](mlir::OpBuilder &self, std::vector &retTypes, - std::vector &initArgs) -> mlir::scf::WhileOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, retTypes, initArgs); - }) - .def("create_condition_op", - [](mlir::OpBuilder &self, mlir::Value &cond, - std::vector &args) -> mlir::scf::ConditionOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, cond, args); - }) - - // miscellaneous - .def("create_make_range", - [](mlir::OpBuilder &self, int start, int end) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto retType = - mlir::RankedTensorType::get({end - start}, self.getI32Type()); - return self.create(loc, retType, start, - end); - }) - - // Cast instructions - // Conversions for custom FP types (FP8) - .def("create_fp_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - // Conversions for standard LLVM builtin types - .def("create_bitcast", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_si_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_ui_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - 
auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_to_si", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_to_ui", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_ext", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_trunc", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_int_cast", - [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType, - bool isSigned) -> mlir::Value { - auto loc = self.getUnknownLoc(); - // get element type if necessary - mlir::Type srcType = src.getType(); - auto srcTensorType = srcType.dyn_cast(); - auto dstTensorType = dstType.dyn_cast(); - mlir::Type srcEltType = srcType; - mlir::Type dstEltType = dstType; - if (dstTensorType && srcTensorType) { - dstEltType = dstTensorType.getElementType(); - srcEltType = srcTensorType.getElementType(); - } - unsigned srcWidth = srcEltType.getIntOrFloatBitWidth(); - unsigned dstWidth = dstEltType.getIntOrFloatBitWidth(); - if (srcWidth == dstWidth) - return self.create(loc, dstType, src); - else if (srcWidth > dstWidth) - return self.create(loc, dstType, src); - else if (isSigned) - return self.create(loc, dstType, src); - else - return self.create(loc, dstType, src); - }) - .def("create_to_index", - [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getIndexType(), input); - }) - .def("create_index_to_si", - [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI64Type(), input); - }) - .def("create_fmul", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fdiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_frem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fadd", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fsub", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_mul", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_sdiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_udiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return 
self.create(loc, lhs, rhs); - }) - .def("create_srem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_urem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_add", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_sub", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_shl", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_lshr", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_ashr", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - // AddPtr (similar to GEP) - .def("create_addptr", - [](mlir::OpBuilder &self, mlir::Value &ptr, - mlir::Value &offset) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, ptr.getType(), ptr, - offset); - }) - // Comparison (int) - .def("create_icmpSLE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sle, lhs, rhs); - }) - .def("create_icmpSLT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::slt, lhs, rhs); - }) - .def("create_icmpSGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sge, lhs, rhs); - }) - .def("create_icmpSGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sgt, lhs, rhs); - }) - .def("create_icmpULE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ule, lhs, rhs); - }) - .def("create_icmpULT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ult, lhs, rhs); - }) - .def("create_icmpUGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::uge, lhs, rhs); - }) - .def("create_icmpUGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ugt, lhs, rhs); - }) - .def("create_icmpEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return 
self.create( - loc, mlir::arith::CmpIPredicate::eq, lhs, rhs); - }) - .def("create_icmpNE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ne, lhs, rhs); - }) - // Comparison (float) - .def("create_fcmpOLT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OLT, lhs, rhs); - }) - .def("create_fcmpOGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OGT, lhs, rhs); - }) - .def("create_fcmpOLE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OLE, lhs, rhs); - }) - .def("create_fcmpOGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OGE, lhs, rhs); - }) - .def("create_fcmpOEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OEQ, lhs, rhs); - }) - .def("create_fcmpONE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ONE, lhs, rhs); - }) - .def("create_fcmpULT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ULT, lhs, rhs); - }) - .def("create_fcmpUGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UGT, lhs, rhs); - }) - .def("create_fcmpULE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ULE, lhs, rhs); - }) - .def("create_fcmpUGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UGE, lhs, rhs); - }) - .def("create_fcmpUEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UEQ, lhs, rhs); - }) - .def("create_fcmpUNE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UNE, lhs, rhs); - }) - // // Logical - .def("create_and", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_xor", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_or", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - // Input/Output - 
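// The bindings below wrap Triton's memory ops: create_load/create_store take a
// pointer tensor plus cache-modifier and eviction-policy hints; the
// create_tensor_pointer_* variants operate on block pointers (built by
// create_make_block_ptr further down) and accept a boundary-check list; the
// create_masked_* variants additionally thread a mask through, with
// create_masked_load falling back to a default-constructed mlir::Value when no
// `other` operand is supplied. Like the rest of this builder, every op is
// created at getUnknownLoc() rather than a real source location.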
.def("create_load", - [](mlir::OpBuilder &self, mlir::Value &ptrs, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptrs, cacheModifier, evictionPolicy, isVolatile); - }) - .def("create_store", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &value, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptrs, value, cacheModifier, - evictionPolicy); - }) - .def("create_tensor_pointer_load", - [](mlir::OpBuilder &self, mlir::Value &ptr, - std::vector &boundaryCheck, - std::optional paddingOption, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptr, boundaryCheck, paddingOption, cacheModifier, - evictionPolicy, isVolatile); - }) - .def("create_tensor_pointer_store", - [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &val, - std::vector &boundaryCheck, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptr, val, boundaryCheck, - cacheModifier, evictionPolicy); - }) - .def("create_masked_load", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &mask, - std::optional &other, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptrs, mask, other.value_or(mlir::Value()), cacheModifier, - evictionPolicy, isVolatile); - }) - .def("create_masked_store", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &val, - mlir::Value &mask, mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptrs, val, mask, - cacheModifier, evictionPolicy); - }) - .def("create_view", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType() - .dyn_cast() - .getElementType(); - return self.create( - loc, mlir::RankedTensorType::get(shape, argType), arg); - }) - .def( - "create_expand_dims", - [](mlir::OpBuilder &self, mlir::Value &arg, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType().dyn_cast(); - auto argEltType = argType.getElementType(); - std::vector retShape = argType.getShape(); - retShape.insert(retShape.begin() + axis, 1); - return self.create( - loc, mlir::RankedTensorType::get(retShape, argEltType), arg, - axis); - }) - .def("create_cat", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto lhsType = lhs.getType().dyn_cast(); - auto rhsType = rhs.getType().dyn_cast(); - if (!(lhsType.getShape().size() == 1 && - rhsType.getShape().size() == 1)) - throw std::runtime_error( - "shape not supported by cat. 
Expecting rank-1 inputs"); - std::vector shape{lhsType.getShape()[0] + - rhsType.getShape()[0]}; - return self.create( - loc, - mlir::RankedTensorType::get(shape, lhsType.getElementType()), - lhs, rhs); - }) - .def("create_trans", - [](mlir::OpBuilder &self, mlir::Value &arg) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType().dyn_cast(); - auto argEltType = argType.getElementType(); - std::vector retShape = argType.getShape(); - std::reverse(retShape.begin(), retShape.end()); - return self.create( - loc, mlir::RankedTensorType::get(retShape, argEltType), arg); - }) - .def("create_broadcast", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (auto argType = - arg.getType().dyn_cast()) - return self.createOrFold( - loc, - mlir::RankedTensorType::get(shape, argType.getElementType()), - arg); - throw std::runtime_error( - "arg is not of RankedTensorType, use create_splat"); - }) - .def("create_splat", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType(); - auto ret = self.createOrFold( - loc, mlir::RankedTensorType::get(shape, argType), arg); - return ret; - }) - // // atomic - .def("create_atomic_cas", - [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &cmp, - mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - mlir::Type dstType; - if (auto srcTensorType = - ptr.getType().dyn_cast()) { - mlir::Type dstElemType = srcTensorType.getElementType() - .cast() - .getPointeeType(); - dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), - dstElemType); - } else { - auto ptrType = mlir::getElementTypeOrSelf(ptr) - .cast(); - dstType = ptrType.getPointeeType(); - } - return self.create(loc, dstType, ptr, - cmp, val); - }) - .def("create_atomic_rmw", - [](mlir::OpBuilder &self, mlir::triton::RMWOp rmwOp, - mlir::Value &ptr, mlir::Value &val, - mlir::Value &mask) -> mlir::Value { - auto loc = self.getUnknownLoc(); - mlir::Type dstType; - if (auto srcTensorType = - ptr.getType().dyn_cast()) { - mlir::Type dstElemType = srcTensorType.getElementType() - .cast() - .getPointeeType(); - dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), - dstElemType); - } else { - auto ptrType = mlir::getElementTypeOrSelf(ptr) - .cast(); - dstType = ptrType.getPointeeType(); - } - return self.create(loc, dstType, rmwOp, - ptr, val, mask); - }) - // External - .def("create_extern_elementwise", - [](mlir::OpBuilder &self, const std::string &libName, - const std::string &libPath, const std::string &symbol, - std::vector &argList, mlir::Type retType, - bool isPure) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (isPure) - return self.create( - loc, retType, argList, libName, libPath, symbol); - else - return self.create( - loc, retType, argList, libName, libPath, symbol); - }) - // Built-in instruction - .def("create_get_program_id", - [](mlir::OpBuilder &self, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI32Type(), self.getI32IntegerAttr(axis)); - }) - .def("create_get_num_programs", - [](mlir::OpBuilder &self, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI32Type(), self.getI32IntegerAttr(axis)); - }) - .def("create_dot", - [](mlir::OpBuilder &self, mlir::Value &a, mlir::Value &b, - mlir::Value &c, bool allowTF32) -> mlir::Value { - auto loc = 
self.getUnknownLoc(); - return self.create(loc, c.getType(), a, b, c, - allowTF32); - }) - .def("create_exp", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_cos", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_sin", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_log", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_sqrt", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_fabs", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_iabs", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_reduce", - [](mlir::OpBuilder &self, std::vector operands, - int axis) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - return self.create(loc, operands, axis); - }) - .def("create_reduce_ret", - [](mlir::OpBuilder &self, py::args args) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - llvm::SmallVector return_values; - for (const auto &arg : args) { - return_values.push_back(py::cast(arg)); - } - return self.create(loc, - return_values); - }) - .def("create_ptr_to_int", - [](mlir::OpBuilder &self, mlir::Value &val, - mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, type, val); - }) - .def("create_int_to_ptr", - [](mlir::OpBuilder &self, mlir::Value &val, - mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, type, val); - }) - .def("create_select", - [](mlir::OpBuilder &self, mlir::Value &condition, - mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, condition, - trueValue, falseValue); - }) - .def("create_print", - [](mlir::OpBuilder &self, const std::string &prefix, - const std::vector &values) -> void { - auto loc = self.getUnknownLoc(); - self.create( - loc, - mlir::StringAttr::get(self.getContext(), - llvm::StringRef(prefix)), - values); - }) - .def("create_assert", - [](mlir::OpBuilder &self, mlir::Value &condition, - const std::string &message, const std::string &fileName, - const std::string &funcName, unsigned lineNo) -> void { - auto loc = self.getUnknownLoc(); - auto messageAttr = mlir::StringAttr::get(self.getContext(), - llvm::StringRef(message)); - auto fileNameAttr = mlir::StringAttr::get( - self.getContext(), llvm::StringRef(fileName)); - auto funcNameAttr = mlir::StringAttr::get( - self.getContext(), llvm::StringRef(funcName)); - auto lineNoAttr = self.getI32IntegerAttr(lineNo); - self.create(loc, condition, messageAttr, - fileNameAttr, funcNameAttr, - lineNoAttr); - }) - // Undef - .def("create_undef", - [](mlir::OpBuilder &self, mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create<::mlir::LLVM::UndefOp>(loc, type); - }) - // Force GPU barrier - .def("create_barrier", - [](mlir::OpBuilder &self) { - auto loc = self.getUnknownLoc(); - self.create(loc); - }) - // Make a block pointer 
(tensor pointer in Triton IR) - .def("create_make_block_ptr", - [](mlir::OpBuilder &self, mlir::Value &base, - std::vector &shape, - std::vector &strides, - std::vector &offsets, - std::vector &tensorShape, - std::vector &order) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, base, shape, strides, offsets, tensorShape, order); - }) - // Advance a block pointer - .def("create_advance", - [](mlir::OpBuilder &self, mlir::Value &ptr, - std::vector &offsets) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, ptr.getType(), - ptr, offsets); - }); - - py::class_(m, "pass_manager") - .def(py::init()) - .def("enable_debug", - [](mlir::PassManager &self) { - auto printingFlags = mlir::OpPrintingFlags(); - printingFlags.elideLargeElementsAttrs(16); - self.enableIRPrinting( - /*shouldPrintBeforePass=*/nullptr, - /*shouldPrintAfterPass=*/ - [](mlir::Pass *pass, mlir::Operation *) { - return ::triton::tools::getBoolEnv("MLIR_ENABLE_DUMP"); - }, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/true, - /*printAfterOnlyOnFailure*/ false, llvm::dbgs(), - printingFlags); - }) - .def("run", - [](mlir::PassManager &self, mlir::ModuleOp &mod) { - // TODO: maybe dump module to file and print error for better - // diagnostics - if (mlir::failed(self.run(mod.getOperation()))) - throw std::runtime_error("PassManager::run failed"); - }) - .def( - "add_sccp_pass", - [](mlir::PassManager &self) { self.addPass(mlir::createSCCPPass()); }) - .def("add_tritongpu_coalesce_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUCoalescePass()); - }) - .def("add_symbol_dce_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createSymbolDCEPass()); - }) - .def("add_inliner_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createInlinerPass()); - }) - .def("add_canonicalizer_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createCanonicalizerPass()); - }) - .def("add_cse_pass", - [](mlir::PassManager &self) { self.addPass(mlir::createCSEPass()); }) - .def("add_licm_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createLoopInvariantCodeMotionPass()); - }) - .def("add_triton_combine_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::triton::createCombineOpsPass()); - }) - .def("add_rewrite_tensor_pointer_pass", - [](mlir::PassManager &self, int computeCapability) { - self.addPass(mlir::triton::createRewriteTensorPointerPass( - computeCapability)); - }) - .def("add_convert_triton_to_tritongpu_pass", - [](mlir::PassManager &self, int numWarps) { - self.addPass( - mlir::triton::createConvertTritonToTritonGPUPass(numWarps)); - }) - .def("add_tritongpu_pipeline_pass", - [](mlir::PassManager &self, int numStages) { - self.addPass(mlir::createTritonGPUPipelinePass(numStages)); - }) - .def("add_tritongpu_prefetch_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUPrefetchPass()); - }) - .def("add_tritongpu_accelerate_matmul_pass", - [](mlir::PassManager &self, int computeCapability) { - self.addPass( - mlir::createTritonGPUAccelerateMatmulPass(computeCapability)); - }) - .def("add_tritongpu_optimize_dot_operands_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUOptimizeDotOperandsPass()); - }) - .def("add_tritongpu_remove_layout_conversions_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPURemoveLayoutConversionsPass()); - }) - .def("add_tritongpu_reorder_instructions_pass", - [](mlir::PassManager &self) { - 
self.addPass(mlir::createTritonGPUReorderInstructionsPass()); - }) - .def("add_tritongpu_decompose_conversions_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUDecomposeConversionsPass()); - }) - .def("add_triton_gpu_to_llvm", - [](mlir::PassManager &self) { - self.addPass(mlir::triton::createConvertTritonGPUToLLVMPass()); - }) - .def("add_scf_to_cfg", [](mlir::PassManager &self) { - self.addPass(mlir::createConvertSCFToCFPass()); - }); -} - -void init_triton_translation(py::module &m) { - using ret = py::return_value_policy; - - m.def("get_shared_memory_size", [](mlir::ModuleOp mod) { - auto shared = mod->getAttrOfType("triton_gpu.shared"); - return shared.getInt(); - }); - - m.def( - "translate_triton_gpu_to_llvmir", - [](mlir::ModuleOp op, int computeCapability, bool isROCM) { - py::gil_scoped_release allow_threads; - llvm::LLVMContext llvmContext; - auto llvmModule = ::mlir::triton::translateTritonGPUToLLVMIR( - &llvmContext, op, computeCapability, isROCM); - if (!llvmModule) - llvm::report_fatal_error("Failed to translate TritonGPU to LLVM IR."); - - std::string str; - llvm::raw_string_ostream os(str); - llvmModule->print(os, nullptr); - os.flush(); - return str; - }, - ret::take_ownership); - - m.def( - "translate_llvmir_to_ptx", - [](const std::string llvmIR, int capability, int version) -> std::string { - py::gil_scoped_release allow_threads; - // create LLVM module from C++ - llvm::LLVMContext context; - std::unique_ptr buffer = - llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); - llvm::SMDiagnostic error; - std::unique_ptr module = - llvm::parseIR(buffer->getMemBufferRef(), error, context); - if (!module) { - llvm::report_fatal_error( - "failed to parse IR: " + error.getMessage() + - "lineno: " + std::to_string(error.getLineNo())); - } - - // translate module to PTX - auto ptxCode = - triton::translateLLVMIRToPTX(*module, capability, version); - return ptxCode; - }, - ret::take_ownership); - - m.def("compile_ptx_to_cubin", - [](const std::string &ptxCode, const std::string &ptxasPath, - int capability) -> py::object { - std::string cubin; - { - py::gil_scoped_release allow_threads; - - // compile ptx with ptxas - llvm::SmallString<64> fsrc; - llvm::SmallString<64> flog; - llvm::sys::fs::createTemporaryFile("compile-ptx-src", "", fsrc); - llvm::sys::fs::createTemporaryFile("compile-ptx-log", "", flog); - std::string fbin = std::string(fsrc) + ".o"; - llvm::FileRemover logRemover(flog); - llvm::FileRemover binRemover(fbin); - const char *_fsrc = fsrc.c_str(); - const char *_flog = flog.c_str(); - const char *_fbin = fbin.c_str(); - std::ofstream ofs(_fsrc); - ofs << ptxCode << std::endl; - ofs.close(); - std::string cmd; - int err; - cmd = ptxasPath + " -v --gpu-name=sm_" + - std::to_string(capability) + (capability == 90 ? 
"a " : " ") + - _fsrc + " -o " + _fsrc + ".o 2> " + _flog; - - err = system(cmd.c_str()); - if (err != 0) { - err >>= 8; - std::ifstream _log(_flog); - std::string log(std::istreambuf_iterator(_log), {}); - if (err == 255) { - throw std::runtime_error( - "Internal Triton PTX codegen error: \n" + log); - } else if (err == 128 + SIGSEGV) { - throw std::runtime_error("Please run `ptxas " + - fsrc.str().str() + - "` to confirm that this is a " - "bug in `ptxas`\n" + - log); - } else { - throw std::runtime_error("`ptxas` failed with error code " + - std::to_string(err) + ": \n" + log); - } - return {}; - } else { - llvm::FileRemover srcRemover(fsrc); - std::ifstream _cubin(_fbin, std::ios::binary); - cubin = std::string(std::istreambuf_iterator(_cubin), {}); - _cubin.close(); - // Do not return here, exit the gil scope and return below - } - } - py::bytes bytes(cubin); - return std::move(bytes); - }); - - m.def("add_external_libs", - [](mlir::ModuleOp &op, const std::vector &names, - const std::vector &paths) { - ::mlir::triton::addExternalLibs(op, names, paths); - }); - - m.def( - "translate_llvmir_to_hsaco", - [](const std::string llvmIR, std::string gfx_arch, std::string gfx_triple, - std::string gfx_features) -> std::tuple { - // create LLVM module from C++ - llvm::LLVMContext context; - std::unique_ptr buffer = - llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); - llvm::SMDiagnostic error; - std::unique_ptr module = - llvm::parseIR(buffer->getMemBufferRef(), error, context); - // translate module to HSACO - auto hsacoCode = triton::translateLLVMIRToHSACO( - *module, gfx_arch, gfx_triple, gfx_features); - return hsacoCode; - }, - ret::take_ownership); -} - -void init_triton(py::module &m) { - py::module subm = m.def_submodule("triton"); - // init_triton_codegen(subm.def_submodule("code_gen")); - init_triton_runtime(subm.def_submodule("runtime")); - init_triton_ir(subm.def_submodule("ir")); - init_triton_translation(subm); -} diff --git a/python/test/regression/test_functional_regressions.py b/python/test/regression/test_functional_regressions.py deleted file mode 100644 index 02e9d2323f18..000000000000 --- a/python/test/regression/test_functional_regressions.py +++ /dev/null @@ -1,136 +0,0 @@ -import numpy as np -import torch -from numpy.random import RandomState - -import triton -import triton.language as tl - - -def test_chained_matmul(): - # Regression test for issue #1601 - def chained_matmul_reference(a, b, c): - intermediate = torch.einsum('MK,NK->MN', a, b) - return torch.einsum('MN,NK->MK', intermediate, c) - - @triton.jit - def chained_matmul_kernel( - A, # shape: (m, k) - B, # shape: (n, k) - C, # shape: (n, k) - out, # shape: (m, k) - m, n, k: tl.constexpr, - block_m: tl.constexpr, - block_n: tl.constexpr, - block_k: tl.constexpr): - - tl.static_assert(block_k == k, - f"expected block_k == k but got {block_k} != {k}") - - block_ix = tl.program_id(0) - a_tile = (block_ix * block_m + tl.arange(0, block_m))[:, None] * block_k \ - + tl.arange(0, block_k)[None, :] - - a = tl.load(A + a_tile, mask=a_tile < m * k, other=0.0) - - acc = tl.zeros([block_m, block_k], dtype=tl.float32) - - for loop_block_start in range(0, n, block_n): - bc_tile = (loop_block_start + tl.arange(0, block_n))[:, None] * block_k \ - + tl.arange(0, block_k)[None, :] - b = tl.load(B + bc_tile, mask=bc_tile < n * k, other=0.0) - - intermediate = tl.dot(a, tl.trans(b)) - intermediate_mask = ((loop_block_start + tl.arange(0, block_n)) < n)[None, :] \ - * (tl.arange(0, block_m) < m)[:, None] - - intermediate = 
tl.where(intermediate_mask, intermediate, 0.0) - - c = tl.load(C + bc_tile, mask=bc_tile < n * k) - - acc += tl.dot(intermediate.to(A.dtype.element_ty), c) - - tl.store(out + a_tile, acc.to(A.dtype.element_ty), mask=a_tile < m * k) - - m, n, k = 32, 64, 128 - block_m, block_n, block_k = 16, 32, k - - grid = (triton.cdiv(m, block_m),) - a = torch.randint(low=0, high=2, size=(m, k), dtype=torch.float16, - device='cuda') - b = torch.randint(low=0, high=2, size=(n, k), dtype=torch.float16, - device='cuda') - c = torch.randint_like(b, low=0, high=2) - triton_result = torch.zeros_like(a) - - torch_result = chained_matmul_reference(a, b, c) - chained_matmul_kernel[grid](a, b, c, triton_result, m, n, k, - block_m=block_m, block_n=block_n, - block_k=block_k) - - assert (torch_result == triton_result).all() - - -def test_vecmat(): - @triton.jit - def batched_vecmat( - # inputs - A, # shape: [dim_m, dim_k] - B, # shape: [dim_m, dim_n, dim_k] - # dimensions - dim_m, dim_n, dim_k, - # outputs - output, - # block information - block_m: tl.constexpr, block_n: tl.constexpr, block_k: tl.constexpr - ): - m_index = tl.program_id(0) - n_index = tl.program_id(1) - # Output tile - output_tile = (m_index * block_m + tl.arange(0, block_m))[:, None] * dim_n \ - + (n_index * block_n + tl.arange(0, block_n))[None, :] - - vecmat = tl.zeros([block_m, block_n], dtype=A.dtype.element_ty) - k_blocks = dim_k // block_k - for k_index in range(k_blocks): - # Load A tile - a_tile = (m_index * block_m + tl.arange(0, block_m))[:, None] * dim_k \ - + (k_index * block_k + tl.arange(0, block_k))[None, :] - a = tl.load(A + a_tile) - - # Load B tile, transposed to [n, m, k] in order to broadcast A on a - # leading dimension. - b_tile = (m_index * block_m + tl.arange(0, block_m))[None, :, None] * dim_n * dim_k \ - + (n_index * block_n + tl.arange(0, block_n))[:, None, None] * dim_k \ - + (k_index * block_k + tl.arange(0, block_k))[None, None, :] - b = tl.load(B + b_tile) - - expanded_a, _ = tl.broadcast(a, b) - vecmat += tl.trans(tl.sum(expanded_a * b, axis=2)) - - tl.store(output + output_tile, vecmat) - - M, N, K = 128, 128, 128 - block_m, block_n, block_k = 16, 32, 64 - - rs = RandomState(17) - A_vec = rs.randint(0, 4, (M, K)).astype('float32') - B_vec = rs.randint(0, 4, (M, N, K)).astype('float32') - A = A_vec - B = B_vec - - A_tri = torch.tensor(A, device='cuda') - B_tri = torch.tensor(B, device='cuda') - C_tri = torch.zeros((M, N), dtype=torch.float32, device='cuda') - - grid = (M // block_m, N // block_n) - - batched_vecmat[grid](A_tri, B_tri, M, N, K, C_tri, - block_m=block_m, block_n=block_n, block_k=block_k, - num_warps=4, num_stages=1) - - A_expanded = A[:, np.newaxis, :] - A_broadcasted = np.broadcast_to(A_expanded, (M, N, K)) - AB = A_broadcasted * B - C_ref = np.sum(AB, axis=2) - - np.testing.assert_allclose(C_ref, C_tri.cpu().numpy(), rtol=0.01, atol=1e-3) diff --git a/python/test/regression/test_performance.py b/python/test/regression/test_performance.py deleted file mode 100644 index 341248fd06c4..000000000000 --- a/python/test/regression/test_performance.py +++ /dev/null @@ -1,212 +0,0 @@ -import subprocess -import sys - -import pytest -import torch - -import triton -import triton.language as tl -import triton.ops -from triton.testing import get_dram_gbps, get_max_tensorcore_tflops - -DEVICE_NAME = {7: 'v100', 8: 'a100'}[torch.cuda.get_device_capability()[0]] - -####################### -# Utilities -####################### - - -def print_perf(cur_ms, cur_util, ref_util): - # print on the same line cur_ms, cur_util 
and ref_util with 3 decimal places - print(f'{cur_ms:.3f} ms \t cur: {cur_util:.3f} \t ref: {ref_util:.3f} \t dif={cur_util - ref_util:.3f}', end='\t') - - -def nvsmi(attrs): - attrs = ','.join(attrs) - cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(',') - ret = [int(x) for x in ret] - return ret - - -####################### -# Matrix Multiplication -####################### - -sm_clocks = {'v100': 1350, 'a100': 1350} -mem_clocks = {'v100': 877, 'a100': 1215} - -matmul_data = { - 'v100': { - # square - (512, 512, 512): {'float16': 0.158}, - (1024, 1024, 1024): {'float16': 0.466}, - (2048, 2048, 2048): {'float16': 0.695}, - (4096, 4096, 4096): {'float16': 0.831}, - (8192, 8192, 8192): {'float16': 0.849}, - # tall-skinny - (16, 1024, 1024): {'float16': 0.0128}, - (16, 4096, 4096): {'float16': 0.0883}, - (16, 8192, 8192): {'float16': 0.101}, - (64, 1024, 1024): {'float16': 0.073}, - (64, 4096, 4096): {'float16': 0.270}, - (64, 8192, 8192): {'float16': 0.459}, - (1024, 64, 1024): {'float16': 0.0692}, - (4096, 64, 4096): {'float16': 0.264}, - (8192, 64, 8192): {'float16': 0.452}, - }, - # NOTE: - # A100 in the CI server is slow-ish for some reason. - # On some other servers, we are getting about 90% peak for 8kx8x8k float16 - 'a100': { - (512, 512, 512): {'float16': 0.084, 'float32': 0.13, 'int8': 0.05}, - (1024, 1024, 1024): {'float16': 0.332, 'float32': 0.35, 'int8': 0.169}, - (2048, 2048, 2048): {'float16': 0.641, 'float32': 0.57, 'int8': 0.34}, - (4096, 4096, 4096): {'float16': 0.785, 'float32': 0.75, 'int8': 0.46}, - (8192, 8192, 8192): {'float16': 0.805, 'float32': 0.85, 'int8': 0.51}, - # tall-skinny - (16, 1024, 1024): {'float16': 0.0077, 'float32': 0.0127, 'int8': 0.005}, - (16, 4096, 4096): {'float16': 0.044, 'float32': 0.0457, 'int8': 0.0259}, - (16, 8192, 8192): {'float16': 0.07, 'float32': 0.0648, 'int8': 0.0431}, - (64, 1024, 1024): {'float16': 0.030, 'float32': 0.0509, 'int8': 0.0169}, - (64, 4096, 4096): {'float16': 0.163, 'float32': 0.162, 'int8': 0.097}, - (64, 8192, 8192): {'float16': 0.285, 'float32': 0.257, 'int8': 0.174}, - (1024, 64, 1024): {'float16': 0.033, 'float32': 0.0458, 'int8': 0.017}, - (4096, 64, 4096): {'float16': 0.16, 'float32': 0.177, 'int8': 0.102}, - (8192, 64, 8192): {'float16': 0.254, 'float32': 0.230, 'int8': 0.177}, - } -} - - -@pytest.mark.parametrize('M, N, K, dtype_str', - [(M, N, K, dtype_str) - for M, N, K in matmul_data[DEVICE_NAME].keys() - for dtype_str in ['float16']]) -def test_matmul(M, N, K, dtype_str): - if dtype_str in ['float32', 'int8'] and DEVICE_NAME != 'a100': - pytest.skip('Only test float32 & int8 on a100') - dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str] - torch.manual_seed(0) - ref_gpu_util = matmul_data[DEVICE_NAME][(M, N, K)][dtype_str] - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] - max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3) - if dtype == torch.int8: - a = torch.randint(-128, 127, (M, K), dtype=dtype, device='cuda') - b = torch.randint(-128, 127, (N, K), dtype=dtype, device='cuda') - b = b.t() # only test row-col layout - else: - a = torch.randn((M, K), dtype=dtype, device='cuda') - b = torch.randn((K, N), dtype=dtype, device='cuda') - fn = lambda: triton.ops.matmul(a, b) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=300) - cur_gpu_perf = 2. 
* M * N * K / ms * 1e-9 - cur_gpu_util = cur_gpu_perf / max_gpu_perf - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) - - -####################### -# Element-Wise -####################### - - -@triton.jit -def _add(x_ptr, y_ptr, output_ptr, n_elements, - BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - - -elementwise_data = { - 'v100': { - 1024 * 16: 0.0219, - 1024 * 64: 0.0791, - 1024 * 256: 0.243, - 1024 * 1024: 0.530, - 1024 * 4096: 0.796, - 1024 * 16384: 0.905, - 1024 * 65536: 0.939, - }, - 'a100': { - 1024 * 16: 0.010, - 1024 * 64: 0.040, - 1024 * 256: 0.132, - 1024 * 1024: 0.353, - 1024 * 4096: 0.605, - 1024 * 16384: 0.758, - 1024 * 65536: 0.850, - } -} - - -@pytest.mark.parametrize('N', elementwise_data[DEVICE_NAME].keys()) -def test_elementwise(N): - torch.manual_seed(0) - ref_gpu_util = elementwise_data[DEVICE_NAME][N] - max_gpu_perf = get_dram_gbps() - z = torch.empty((N, ), dtype=torch.float16, device='cuda') - x = torch.randn_like(z) - y = torch.randn_like(z) - grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), ) - fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500) - cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6 - cur_gpu_util = cur_gpu_perf / max_gpu_perf - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) - -####################### -# Flash-Attention -####################### - - -flash_attention_data = { - "a100": { - (4, 48, 4096, 64, 'forward', 'float16'): 0.37, - (4, 48, 4096, 64, 'backward', 'float16'): 0.25, - } -} - - -@pytest.mark.parametrize("Z, H, N_CTX, D_HEAD", [[4, 48, 4096, 64]]) -@pytest.mark.parametrize("mode", ['forward', 'backward']) -@pytest.mark.parametrize("dtype_str", ['float16']) -def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str): - is_backward = mode == 'backward' - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - pytest.skip("Flash attention only supported for compute capability < 80") - torch.manual_seed(20) - dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str] - # init data - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - # benchmark - fn = lambda: triton.ops.attention(q, k, v, sm_scale) - if is_backward: - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500) - # compute flops - flops_per_matmul = 2. 
* Z * H * N_CTX * N_CTX * D_HEAD * 0.5 - total_flops = 2 * flops_per_matmul - if is_backward: - total_flops *= 2.5 # 2.0(bwd) + 0.5(recompute) - cur_gpu_perf = total_flops / ms * 1e-9 - # maximum flops - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] - max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3) - cur_gpu_util = cur_gpu_perf / max_gpu_perf - ref_gpu_util = flash_attention_data[DEVICE_NAME][(Z, H, N_CTX, D_HEAD, mode, dtype_str)] - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) diff --git a/python/test/unit/debugger/test_debugger.py b/python/test/unit/debugger/test_debugger.py deleted file mode 100644 index 741fcab3becd..000000000000 --- a/python/test/unit/debugger/test_debugger.py +++ /dev/null @@ -1,69 +0,0 @@ -import random - -import torch - -import triton -import triton.language as tl -from triton.debugger.debugger import program_ids_from_grid - - -def test_addition(): - - @triton.jit(interpret=True) - def add_kernel( - x_ptr, - y_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - - a = torch.rand((128,), device="cuda") - b = torch.rand((128,), device="cuda") - expected = a + b - output = torch.empty((128,), device="cuda") - - def grid(meta): - return (triton.cdiv(128, meta["BLOCK_SIZE"]),) - - add_kernel[grid](a, b, output, 128, BLOCK_SIZE=32) - - assert torch.allclose(expected, output, atol=1e-2, rtol=0) - - -def test_program_ids_from_grid(): - random.seed(123) - grid = (3, 4) - expected_combinations = 3 * 4 - unique_combinations = set(program_ids_from_grid(grid)) - assert len(unique_combinations) == expected_combinations - - first_run = list(program_ids_from_grid(grid)) - second_run = list(program_ids_from_grid(grid)) - assert first_run != second_run - - -def test_atomic(): - @triton.jit(interpret=True) - def atomic( - x_ptr, - ): - pid = tl.program_id(axis=0) - tl.atomic_add(x_ptr + pid, 1) - t = tl.atomic_xchg(x_ptr + pid, 3) - t += 1 # 2 - tl.atomic_cas(x_ptr + pid, 3, t) # match - tl.atomic_cas(x_ptr + pid, 40, 9) # no match - nb_dim = 16 - a = torch.zeros((nb_dim, ), dtype=torch.int32, device="cuda") - - atomic[(nb_dim, )](a) - assert torch.allclose(a, torch.full_like(a, 2)) diff --git a/python/test/unit/language/assert_helper.py b/python/test/unit/language/assert_helper.py deleted file mode 100644 index 8419d5a34218..000000000000 --- a/python/test/unit/language/assert_helper.py +++ /dev/null @@ -1,131 +0,0 @@ -import sys - -import torch -from torch.testing import assert_close - -import triton -import triton.language as tl - - -@triton.jit -def kernel_device_assert(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_assert(x == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_device_assert_scalar(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - # Trivial assert - tl.device_assert(0 == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=False) -def kernel_device_assert_no_debug(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_assert(x == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_assert(X, Y, 
BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - assert x == 0, "x != 0" - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_static_assert(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.static_assert(BLOCK == 128, "BLOCK != 128") - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_assert(func: str): - shape = (128, ) - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda') - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if func == "device_assert": - kernel_device_assert[(1,)](x, y, BLOCK=shape[0]) - kernel_device_assert_scalar[(1,)](x, y, BLOCK=shape[0]) - elif func == "no_debug": - # TRITON_DEBUG=True can override the debug flag - kernel_device_assert_no_debug[(1,)](x, y, BLOCK=shape[0]) - elif func == "assert": - kernel_assert[(1,)](x, y, BLOCK=shape[0]) - elif func == "static_assert": - kernel_static_assert[(1,)](x, y, BLOCK=shape[0]) - assert_close(y, x) - - -@triton.jit -def jit_device_assert_none(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit(debug=True) -def jit_device_assert_true(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit(debug=False) -def jit_device_assert_false(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit -def kernel_device_assert_nested(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=True) -def kernel_device_assert_nested_true(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=False) -def kernel_device_assert_nested_false(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_assert_nested(caller: str, callee: str): - shape = (128, ) - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda') - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if caller == "none": - kernel_device_assert_nested[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - elif caller == "true": - kernel_device_assert_nested_true[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - elif caller == "false": - kernel_device_assert_nested_false[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - assert_close(y, x) - - -if __name__ == "__main__": - if len(sys.argv) == 3: - test_assert_nested(sys.argv[1], sys.argv[2]) - else: - test_assert(sys.argv[1]) diff --git a/python/test/unit/language/print_helper.py b/python/test/unit/language/print_helper.py deleted file mode 100644 index afdd12960737..000000000000 --- a/python/test/unit/language/print_helper.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys - -import torch -from torch.testing import assert_close - -import triton -import triton.language as tl - - -@triton.jit -def kernel_device_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_print("", x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - 
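    # `print` inside a @triton.jit kernel is intercepted by the Triton frontend
    # rather than calling Python's builtin; it is expected to lower to the same
    # device-side print as tl.device_print in the kernel above, so this kernel
    # only checks that the plain spelling round-trips as well.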
print("", x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_static_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.static_print(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_print(func: str, data_type: str): - shape = (128, ) - # limit the range of integers so that the sum does not overflow - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda').to(getattr(torch, data_type)) - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if func == "device_print": - kernel_device_print[(1,)](x, y, BLOCK=shape[0]) - elif func == "print": - kernel_print[(1,)](x, y, BLOCK=shape[0]) - elif func == "static_print": - kernel_static_print[(1,)](x, y, BLOCK=shape[0]) - assert_close(y, x) - - -if __name__ == "__main__": - test_print(sys.argv[1], sys.argv[2]) diff --git a/python/test/unit/language/test_annotations.py b/python/test/unit/language/test_annotations.py deleted file mode 100644 index 88df39fac52d..000000000000 --- a/python/test/unit/language/test_annotations.py +++ /dev/null @@ -1,21 +0,0 @@ - -from __future__ import annotations - -import torch - -import triton -import triton.language as tl - - -def test_annotations(): - - @triton.jit - def _kernel(X: torch.Tensor, N: int, BLOCK_SIZE: tl.constexpr): - pass - - x = torch.empty(1, device='cuda') - _kernel[(1,)](x, x.shape[0], 32) - try: - _kernel[(1,)](x.shape[0], x.shape[0], 32) - except AttributeError: - pass diff --git a/python/test/unit/language/test_block_pointer.py b/python/test/unit/language/test_block_pointer.py deleted file mode 100644 index 147249076181..000000000000 --- a/python/test/unit/language/test_block_pointer.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import torch - -import triton -import triton.language as tl - - -@triton.jit -def block_copy_kernel(a_ptr, b_ptr, N, BLOCK_SIZE: tl.constexpr, padding_option: tl.constexpr): - pid = tl.program_id(0) - # We only copy half of the data to see if the padding works - a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(N // 2, ), strides=(1, ), offsets=(pid * BLOCK_SIZE, ), - block_shape=(BLOCK_SIZE, ), order=(0, )) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(N, ), strides=(1, ), offsets=(pid * BLOCK_SIZE, ), - block_shape=(BLOCK_SIZE, ), order=(0, )) - a = tl.load(a_block_ptr, boundary_check=(0, ), padding_option=padding_option) - tl.store(b_block_ptr, a, boundary_check=(0, )) - - -@pytest.mark.parametrize("dtype_str, n, padding_option", - [(dtype_str, n, padding) for dtype_str in ("bool", "int16", "float16") - for n in (64, 128, 256, 512, 1024) - for padding in ("zero", "nan")]) -def test_block_copy(dtype_str, n, padding_option): - capability = torch.cuda.get_device_capability() - if capability[0] >= 9: - pytest.skip("Hopper support is working in progress") - - dtype = getattr(torch, dtype_str) - if dtype_str in ("bool", "int16"): - if padding_option == "nan": - pytest.skip("Padding with NaN is not supported for integer types") - a = torch.randint(0, 2, (n, ), device="cuda", dtype=dtype) - else: - a = torch.randn((n, ), device="cuda", dtype=dtype) - b = torch.zeros((n, ), device="cuda", dtype=dtype) - - grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),) - block_copy_kernel[grid](a_ptr=a, b_ptr=b, N=n, BLOCK_SIZE=64, padding_option=padding_option) - - assert torch.all(a[0: n // 2] == b[0: n // 2]) - if padding_option == "zero": - assert torch.all(b[n // 2: n] == 0) - else: - assert torch.all(torch.isnan(b[n // 2: n])) - - -@triton.jit -def matmul_no_scf_with_advance_kernel( - a_ptr, b_ptr, c_ptr, 
- M, N, K, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr -): - offs_m = tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), - offsets=(0, 0), block_shape=(BLOCK_M, BLOCK_K), order=(1, 0)) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(K, N), strides=(stride_bk, stride_bn), - offsets=(0, 0), block_shape=(BLOCK_K, BLOCK_N), order=(1, 0)) - # Below two lines are just for testing negative offsets for the `advance` API, which could be removed - a_block_ptr = tl.advance(a_block_ptr, (BLOCK_M, -BLOCK_K)) - a_block_ptr = tl.advance(a_block_ptr, (-BLOCK_M, BLOCK_K)) - a = tl.load(a_block_ptr, boundary_check=(1, ), padding_option="zero") - b = tl.load(b_block_ptr, boundary_check=(0, ), padding_option="zero") - - c = tl.dot(a, b) - c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn - tl.store(c_ptrs, c) - - -@pytest.mark.parametrize("shape, num_warps", [ - (shape, num_warps) - for shape in [ - [64, 64, 16], - [64, 64, 32], - [64, 64, 64], - ] - for num_warps in [4, 8] -]) -def test_block_ptr_matmul_no_scf(shape, num_warps): - capability = torch.cuda.get_device_capability() - if capability[0] >= 9: - pytest.skip("Hopper support is working in progress") - - m, n, k = shape - a = torch.randn((m, k), device="cuda", dtype=torch.float16) - b = torch.randn((k, n), device="cuda", dtype=torch.float16) - c = torch.empty((m, n), device="cuda", dtype=torch.float32) - - grid = lambda META: (1, ) - matmul_no_scf_with_advance_kernel[grid](a_ptr=a, b_ptr=b, c_ptr=c, - M=m, N=n, K=k, - stride_am=a.stride(0), stride_ak=a.stride(1), - stride_bk=b.stride(0), stride_bn=b.stride(1), - stride_cm=c.stride(0), stride_cn=c.stride(1), - BLOCK_M=m, BLOCK_N=n, BLOCK_K=k, - num_warps=num_warps) - golden = torch.matmul(a, b) - torch.testing.assert_allclose(c, golden) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py deleted file mode 100644 index b2fdc02daa79..000000000000 --- a/python/test/unit/language/test_core.py +++ /dev/null @@ -1,2992 +0,0 @@ -# flake8: noqa: F821,F841 -import itertools -import os -import re -from typing import Optional, Union - -import numpy as np -import pytest -import torch -from numpy.random import RandomState - -import triton -import triton._C.libtriton.triton as _triton -import triton.language as tl -from triton.runtime.jit import JITFunction, TensorWrapper, reinterpret - -int_dtypes = ['int8', 'int16', 'int32', 'int64'] -uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64'] -float_dtypes = ['float16', 'float32', 'float64'] -dtypes = int_dtypes + uint_dtypes + float_dtypes -dtypes_with_bfloat16 = dtypes + ['bfloat16'] -torch_dtypes = ['bool'] + int_dtypes + ['uint8'] + float_dtypes + ['bfloat16'] - - -def _bitwidth(dtype: str) -> int: - # ex.: "int64" -> 64 - return int(re.search(r'(\d+)$', dtype).group(1)) - - -def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, high=None): - """ - Override `rs` if you're calling this function twice and don't want the same - result for both calls. 
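    Integer dtypes are drawn with rs.randint over the dtype's full range, with
    any zeros bumped to one so that division tests don't blow up; float dtypes
    come from rs.normal(0, 1); 'bfloat16' is emulated by keeping only the high
    16 bits of each float32 bit pattern; bool-like dtypes are thresholded
    normal samples.

    Example (sketch): numpy_random((8, 8), 'uint16', rs=RandomState(3)) returns
    an (8, 8) uint16 array that contains no zeros.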
- """ - if isinstance(shape, int): - shape = (shape, ) - if rs is None: - rs = RandomState(seed=17) - if dtype_str in int_dtypes + uint_dtypes: - iinfo = np.iinfo(getattr(np, dtype_str)) - low = iinfo.min if low is None else max(low, iinfo.min) - high = iinfo.max if high is None else min(high, iinfo.max) - dtype = getattr(np, dtype_str) - x = rs.randint(low, high, shape, dtype=dtype) - x[x == 0] = 1 # Hack. Never return zero so tests of division don't error out. - return x - elif dtype_str in float_dtypes: - return rs.normal(0, 1, shape).astype(dtype_str) - elif dtype_str == 'bfloat16': - return (rs.normal(0, 1, shape).astype('float32').view('uint32') - & np.uint32(0xffff0000)).view('float32') - elif dtype_str in ['bool', 'int1', 'bool_']: - return rs.normal(0, 1, shape) > 0.0 - else: - raise RuntimeError(f'Unknown dtype {dtype_str}') - - -def to_triton(x: np.ndarray, device='cuda', dst_type=None) -> Union[TensorWrapper, torch.Tensor]: - ''' - Note: We need dst_type because the type of x can be different from dst_type. - For example: x is of type `float32`, dst_type is `bfloat16`. - If dst_type is None, we infer dst_type from x. - ''' - t = x.dtype.name - if t in uint_dtypes: - signed_type_name = t.lstrip('u') # e.g. "uint16" -> "int16" - x_signed = x.astype(getattr(np, signed_type_name)) - return reinterpret(torch.tensor(x_signed, device=device), getattr(tl, t)) - else: - if t == 'float32' and dst_type == 'bfloat16': - return torch.tensor(x, device=device).bfloat16() - return torch.tensor(x, device=device) - - -def torch_dtype_name(dtype) -> str: - if isinstance(dtype, triton.language.dtype): - return dtype.name - elif isinstance(dtype, torch.dtype): - # 'torch.int64' -> 'int64' - m = re.match(r'^torch\.(\w+)$', str(dtype)) - return m.group(1) - else: - raise TypeError(f'not a triton or torch dtype: {type(dtype)}') - - -def to_numpy(x): - if isinstance(x, TensorWrapper): - return x.base.cpu().numpy().astype(getattr(np, torch_dtype_name(x.dtype))) - elif isinstance(x, torch.Tensor): - if x.dtype is torch.bfloat16: - return x.cpu().float().numpy() - return x.cpu().numpy() - else: - raise ValueError(f"Not a triton-compatible tensor: {x}") - - -def patch_kernel(template, to_replace): - kernel = triton.JITFunction(template.fn) - for key, value in to_replace.items(): - kernel.src = kernel.src.replace(key, value) - return kernel - - -def check_type_supported(dtype): - ''' - skip test if dtype is not supported on the current device - ''' - cc = torch.cuda.get_device_capability() - if cc[0] < 8 and (dtype is tl.bfloat16 or dtype == "bfloat16" or dtype is torch.bfloat16): - pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80") - - -class MmaLayout: - def __init__(self, version, warps_per_cta): - self.version = version - self.warps_per_cta = str(warps_per_cta) - - def __str__(self): - return f"#triton_gpu.mma<{{versionMajor={self.version[0]}, versionMinor={self.version[1]}, warpsPerCTA={self.warps_per_cta}}}>" - - -class BlockedLayout: - def __init__(self, size_per_thread, threads_per_warp, warps_per_cta, order): - self.sz_per_thread = str(size_per_thread) - self.threads_per_warp = str(threads_per_warp) - self.warps_per_cta = str(warps_per_cta) - self.order = str(order) - - def __str__(self): - return f"#triton_gpu.blocked<{{sizePerThread={self.sz_per_thread}, threadsPerWarp={self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, order={self.order}}}>" - - -class SharedLayout: - def __init__(self, vec, per_phase, max_phase, order): - self.vec = str(vec) - self.per_phase = 
str(per_phase) - self.max_phase = str(max_phase) - self.order = str(order) - - def __str__(self): - return f"#triton_gpu.shared<{{vec={self.vec}, perPhase={self.per_phase}, maxPhase={self.max_phase}, order={self.order}}}>" - - -@pytest.mark.parametrize("dtype_x", list(dtypes) + ["bfloat16"]) -def test_empty_kernel(dtype_x, device='cuda'): - SIZE = 128 - - @triton.jit - def kernel(X, SIZE: tl.constexpr): - pass - check_type_supported(dtype_x) - x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x) - kernel[(1, )](x, SIZE=SIZE, num_warps=4) - - -# generic test functions -def _test_unary(dtype_x, expr, numpy_expr=None, device='cuda'): - check_type_supported(dtype_x) # early return if dtype_x is not supported - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(Z, X, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - z = GENERATE_TEST_HERE - tl.store(Z + off, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) - # inputs - x = numpy_random(SIZE, dtype_str=dtype_x) - if 'log' in expr: - x = np.abs(x) + 0.01 - # reference result - z_ref = eval(expr if numpy_expr is None else numpy_expr) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - z_tri = to_triton(np.empty_like(z_ref), device=device, dst_type=dtype_x) - kernel[(1, )](z_tri, x_tri, SIZE=SIZE, num_warps=4) - # compare - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - - -def _binary_op_dtype_override(a: str, b: str) -> Optional[np.dtype]: - """ - Given two dtype strings, returns the numpy dtype Triton thinks binary - operations on the two types should return. Returns None if the return value - matches numpy. This is generally needed because Triton and pytorch return - narrower floating point types than numpy in mixed operations, and because - Triton follows C/C++ semantics around mixed signed/unsigned operations, and - numpy/pytorch do not. 
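    For example, mixing float16 with any integer type stays np.float16 here
    instead of widening the way numpy would, and ('int32', 'uint32') resolves
    to np.uint32 (the unsigned type wins, as in C). The lookup key is the
    alphabetically ordered pair, so argument order does not matter:

        >>> _binary_op_dtype_override('uint32', 'int32')
        <class 'numpy.uint32'>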
- """ - overrides = { - ('float16', 'int16'): np.float16, - ('float16', 'int32'): np.float16, - ('float16', 'int64'): np.float16, - ('float16', 'uint16'): np.float16, - ('float16', 'uint32'): np.float16, - ('float16', 'uint64'): np.float16, - ('int8', 'uint8'): np.uint8, - ('int8', 'uint16'): np.uint16, - ('int8', 'uint32'): np.uint32, - ('int8', 'uint64'): np.uint64, - ('int16', 'uint16'): np.uint16, - ('int16', 'uint32'): np.uint32, - ('int16', 'uint64'): np.uint64, - ('int32', 'uint32'): np.uint32, - ('int32', 'uint64'): np.uint64, - ('int64', 'uint64'): np.uint64, - } - key = (a, b) if a < b else (b, a) - return overrides.get(key) - - -def _test_binary(dtype_x, dtype_y, expr, numpy_expr=None, mode_x='real', mode_y='real', device='cuda', y_low=None, y_high=None): - check_type_supported(dtype_x) # early return if dtype_x is not supported - check_type_supported(dtype_y) - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(Z, X, Y, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - y = tl.load(Y + off) - z = GENERATE_TEST_HERE - tl.store(Z + off, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) - # inputs - rs = RandomState(17) - x = numpy_random(SIZE, dtype_str=dtype_x, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype_y, rs=rs, low=y_low, high=y_high) - if mode_x == 'nan': - x[:] = float('nan') - if mode_y == 'nan': - y[:] = float('nan') - # reference result - z_ref = eval(expr if numpy_expr is None else numpy_expr) - dtype_z = _binary_op_dtype_override(dtype_x, dtype_y) - if dtype_z is not None: - z_ref = z_ref.astype(dtype_z) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - y_tri = to_triton(y, device=device, dst_type=dtype_y) - z_tri = to_triton(np.empty(SIZE, dtype=z_ref.dtype), device=device) - kernel[(1, )](z_tri, x_tri, y_tri, SIZE=SIZE, num_warps=4) - np.testing.assert_allclose(z_ref, to_numpy(z_tri), err_msg=expr, rtol=0.01) - - -def _mod_operation_ill_conditioned(dtype_x, dtype_y) -> bool: - # The result of x % y is ill-conditioned if x % y is much smaller than x. - # pytorch/CUDA has slightly different (probably better) rounding on - # remainders than stock LLVM. We currently don't expect to match it - # bit-for-bit. - return (dtype_x, dtype_y) in [ - ('int32', 'bfloat16'), - ('int32', 'float16'), - ('int32', 'float32'), - ('int64', 'bfloat16'), - ('int64', 'float16'), - ('int64', 'float32'), - ('int64', 'float64'), - ('uint16', 'bfloat16'), - ('uint16', 'float16'), - ('uint16', 'float32'), - ('uint32', 'bfloat16'), - ('uint32', 'float16'), - ('uint32', 'float32'), - ('uint64', 'bfloat16'), - ('uint64', 'float16'), - ('uint64', 'float32'), - ('uint64', 'float64'), - ] - -# --------------- -# test binary ops -# --------------- - - -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['+', '-', '*', '/', '%'] - for dtype_x in dtypes_with_bfloat16 - for dtype_y in dtypes_with_bfloat16 -]) -def test_bin_op(dtype_x, dtype_y, op, device='cuda'): - expr = f' x {op} y' - if op == '%' and dtype_x in int_dtypes + uint_dtypes and dtype_y in int_dtypes + uint_dtypes: - # LLVM has 'numpy.fmod', not 'numpy.remainder', semantics on integer remainders. - numpy_expr = 'np.fmod(x, y)' - elif op in ('/', '%') and dtype_x in ('int16', 'float16', 'bfloat16') and dtype_y in ('int16', 'float16', 'bfloat16'): - # Triton promotes 16-bit floating-point / and % to 32-bit because there - # are no native div or FRem operations on float16. 
Since we have to - # convert anyway, we may as well take the accuracy bump. - numpy_expr = f'x.astype(np.float32) {op} y.astype(np.float32)' - elif (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - numpy_expr = None - if op == '%' and _mod_operation_ill_conditioned(dtype_x, dtype_y): - with pytest.raises(AssertionError, match='Not equal to tolerance'): - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - elif (op in ('%', '/') and - ((dtype_x in int_dtypes and dtype_y in uint_dtypes) or - (dtype_x in uint_dtypes and dtype_y in int_dtypes))): - with pytest.raises(triton.CompilationError) as exc_info: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - assert re.match('Cannot use .* because they have different signedness', str(exc_info.value.__cause__)) - else: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -@pytest.mark.parametrize("dtype_x, dtype_y", - [(dtype_x, dtype_y) for dtype_x in int_dtypes for dtype_y in int_dtypes] + - [(dtype_x, dtype_y) for dtype_x in uint_dtypes for dtype_y in uint_dtypes] - ) -def test_floordiv(dtype_x, dtype_y, device='cuda'): - # Triton has IEEE, not numpy/torch, semantics for %, and those carry - # through to //, so we have to use a nonstandard expression to get a - # reference result for //. - expr = 'x // y' - numpy_expr = '((x - np.fmod(x, y)) / y)' - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -def test_unsigned_name_mangling(device='cuda'): - # Test that uint32 and int32 are mangled differently by the compiler - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(O1, O2, X, Y, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - y = tl.load(Y + off) - out1 = tl.abs(x) # uint32 -> nop - out2 = tl.abs(-y) # int32 -> should have an effect - tl.store(O1 + off, out1) - tl.store(O2 + off, out2) - - dtype_x = 'uint32' - dtype_y = 'int32' - # inputs - rs = RandomState(17) - x = numpy_random(SIZE, dtype_str=dtype_x, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype_y, rs=rs) - # reference result - expect = (np.abs(x), np.abs(-y)) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - y_tri = to_triton(y, device=device, dst_type=dtype_y) - actual = tuple( - to_triton(np.empty_like(e), device=device) - for e in expect - ) - kernel[(1, )](actual[0], actual[1], x_tri, y_tri, SIZE=SIZE, num_warps=4) - - # Bitwise op, so expect exact equality - assert (expect[0] == to_numpy(actual[0])).all() - assert (expect[1] == to_numpy(actual[1])).all() - - -# --------------- -# test bitwise ops -# --------------- -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['&', '|', '^'] - for dtype_x in dtypes + dtypes_with_bfloat16 - for dtype_y in dtypes + dtypes_with_bfloat16 -]) -def test_bitwise_op(dtype_x, dtype_y, op, device='cuda'): - expr = f'x {op} y' - if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - 
numpy_expr = None - if 'float' in dtype_x + dtype_y: - with pytest.raises(triton.CompilationError) as exc_info: - _test_binary(dtype_x, dtype_y, expr, numpy_expr='np.array([])', device=device) - # The CompilationError must have been caused by a C++ exception with this text. - assert re.match('invalid operands of type', str(exc_info.value.__cause__)) - else: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['<<', '>>'] - for dtype_x in int_dtypes + uint_dtypes - for dtype_y in int_dtypes + uint_dtypes -]) -def test_shift_op(dtype_x, dtype_y, op, device='cuda'): - expr = f'x {op} y' - bw = max(_bitwidth(dtype_x), _bitwidth(dtype_y)) - if dtype_x.startswith('int'): - dtype_z = f'int{bw}' - else: - dtype_z = f'uint{bw}' - numpy_expr = f'x.astype(np.{dtype_z}) {op} y.astype(np.{dtype_z})' - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device, y_low=0, y_high=65) - - -# --------------- -# test compare ops -# --------------- -ops = ['==', '!=', '>', '<', '>=', '<='] - - -@pytest.mark.parametrize("dtype_x, dtype_y, op, mode_x, mode_y", - # real - [ - (dtype_x, dtype_y, op, 'real', 'real') - for op in ops - for dtype_x in dtypes - for dtype_y in dtypes - ] + - # NaNs - [('float32', 'float32', op, mode_x, mode_y) - for op in ops - for mode_x, mode_y in [('nan', 'real'), - ('real', 'nan'), - ('nan', 'nan')] - - ]) -def test_compare_op(dtype_x, dtype_y, op, mode_x, mode_y, device='cuda'): - expr = f'x {op} y' - if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - numpy_expr = None - _test_binary(dtype_x, dtype_y, expr, numpy_expr, mode_x=mode_x, mode_y=mode_y, device=device) - - -# --------------- -# test broadcast -# --------------- -@pytest.mark.parametrize("dtype", dtypes_with_bfloat16) -def test_broadcast(dtype): - @triton.jit - def broadcast_kernel(x_ptr, y_ptr, y_broadcasted_ptr, M: tl.constexpr, N: tl.constexpr): - offset1 = tl.arange(0, M) - offset2 = tl.arange(0, N) - x = tl.load(x_ptr + N * offset1[:, None] + offset2[None, :]) - y = tl.load(y_ptr + offset2) - _, y_broadcasted = tl.broadcast(x, y) - tl.store(y_broadcasted_ptr + N * offset1[:, None] + offset2[None, :], y_broadcasted) - - M = 32 - N = 64 - rs = RandomState(17) - x = numpy_random((M, N), dtype_str=dtype, rs=rs) - y = numpy_random(N, dtype_str=dtype, rs=rs) - _, y_broadcasted_np = np.broadcast_arrays(x, y) - - x_tri = to_triton(x, device='cuda', dst_type=dtype) - y_tri = to_triton(y, device='cuda', dst_type=dtype) - y_broadcasted_tri = to_triton(np.empty((M, N), dtype=y_broadcasted_np.dtype), device='cuda', dst_type=dtype) - - broadcast_kernel[(1,)](x_tri, y_tri, y_broadcasted_tri, M=M, N=N) - assert (y_broadcasted_np == to_numpy(y_broadcasted_tri)).all() - - -# ---------------- -# test expand_dims -# ---------------- -def test_expand_dims(): - @triton.jit - def expand_dims_kernel(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, 0) - tl.static_assert(t.shape == [1, N]) - - t = tl.expand_dims(offset1, 1) - tl.static_assert(t.shape == [N, 1]) - - t = tl.expand_dims(offset1, -1) - tl.static_assert(t.shape == [N, 1]) - - t = tl.expand_dims(offset1, -2) - tl.static_assert(t.shape == [1, 
N]) - - t = tl.expand_dims(offset1, (0, -1)) - tl.static_assert(t.shape == [1, N, 1]) - - t = tl.expand_dims(offset1, (0, 1, 3)) - tl.static_assert(t.shape == [1, 1, N, 1]) - - t = tl.expand_dims(offset1, (-4, 2, -1)) - tl.static_assert(t.shape == [1, N, 1, 1]) - - t = tl.expand_dims(offset1, (3, 1, 2)) - tl.static_assert(t.shape == [N, 1, 1, 1]) - - N = 32 - dummy_tensor = torch.empty((), device="cuda") - expand_dims_kernel[(1,)](dummy_tensor, N) - - -def test_expand_dims_error_cases(): - @triton.jit - def dim_out_of_range1(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, -2) - t = tl.expand_dims(offset1, -3) - - @triton.jit - def dim_out_of_range2(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, 1) - t = tl.expand_dims(offset1, 2) - - @triton.jit - def duplicate_dim1(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, (0, 0)) - - @triton.jit - def duplicate_dim2(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, (0, -3)) - - N = 32 - dummy_tensor = torch.empty((), device="cuda") - - with pytest.raises(triton.CompilationError, match="invalid axis -3"): - dim_out_of_range1[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match="invalid axis 2"): - dim_out_of_range2[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match=r"duplicate axes, normalized axes = \[0, 0\]"): - duplicate_dim1[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match=r"duplicate axes, normalized axes = \[0, 0\]"): - duplicate_dim2[(1,)](dummy_tensor, N) - - -# --------------- -# test where -# --------------- -@pytest.mark.parametrize("dtype", dtypes_with_bfloat16 + ["*int32"]) -def test_where(dtype): - select_ptrs = False - if dtype == "*int32": - dtype = "int64" - select_ptrs = True - check_type_supported(dtype) - - @triton.jit - def where_kernel(cond_ptr, a_ptr, b_ptr, output_ptr, n_elements, - BLOCK_SIZE: tl.constexpr, - TEST_POINTERS: tl.constexpr, - TEST_SCALAR_POINTERS: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - decide = tl.load(cond_ptr + offsets, mask=mask) - if TEST_SCALAR_POINTERS: - ptr = tl.where(tl.load(cond_ptr), a_ptr, b_ptr) - output = tl.load(ptr + offsets, mask=mask) - else: - if TEST_POINTERS: - a = tl.load(a_ptr + offsets, mask=mask).to(tl.pi32_t) - b = tl.load(b_ptr + offsets, mask=mask).to(tl.pi32_t) - else: - a = tl.load(a_ptr + offsets, mask=mask) - b = tl.load(b_ptr + offsets, mask=mask) - output = tl.where(decide, a, b) - tl.store(output_ptr + offsets, output, mask=mask) - - SIZE = 1_000 - rs = RandomState(17) - cond = numpy_random(SIZE, 'bool', rs) - x = numpy_random(SIZE, dtype_str=dtype, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype, rs=rs) - z = np.where(cond, x, y) - - cond_tri = to_triton(cond, device='cuda') - x_tri = to_triton(x, device='cuda', dst_type=dtype) - y_tri = to_triton(y, device='cuda', dst_type=dtype) - z_tri = to_triton(np.empty(SIZE, dtype=z.dtype), device='cuda', dst_type=dtype) - - grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']),) - where_kernel[grid](cond_tri, x_tri, y_tri, z_tri, SIZE, BLOCK_SIZE=1024, TEST_POINTERS=select_ptrs, TEST_SCALAR_POINTERS=False) - assert (z == to_numpy(z_tri)).all() - if select_ptrs: - where_kernel[grid](cond_tri, x_tri, y_tri, z_tri, SIZE, BLOCK_SIZE=1024, TEST_POINTERS=select_ptrs, TEST_SCALAR_POINTERS=True) - z = np.where(cond[0], x, y) - assert (z 
== to_numpy(z_tri)).all() - - -def test_where_broadcast(): - @triton.jit - def where_kernel(cond_ptr, a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): - xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] - yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] - - mask = tl.load(cond_ptr + yoffsets) - vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) - res = tl.where(mask, vals, 0.) - tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) - - @triton.jit - def where_scalar_condition(a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): - xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] - yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] - mask = 0 - vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) - res = tl.where(mask, vals, 0.) - tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) - - SIZE = 32 - dtype = 'float32' - rs = RandomState(17) - x = numpy_random((SIZE, SIZE), dtype_str=dtype, rs=rs) - mask = numpy_random(SIZE, 'bool', rs=rs) - z = np.where(mask, x, 0) - cond_tri = to_triton(mask, device="cuda") - x_tri = to_triton(x, device='cuda', dst_type=dtype) - z_tri = to_triton(np.empty((SIZE, SIZE), dtype=z.dtype), device='cuda', dst_type=dtype) - where_kernel[(1,)](cond_tri, x_tri, z_tri, SIZE) - assert (z == to_numpy(z_tri)).all() - where_scalar_condition[(1,)](x_tri, z_tri, SIZE) - z = np.where(0, x, 0) - assert (z == to_numpy(z_tri)).all() - -# --------------- -# test unary ops -# --------------- - - -@pytest.mark.parametrize("dtype_x, expr", [ - (dtype_x, ' -x') for dtype_x in dtypes_with_bfloat16 -] + [ - (dtype_x, ' ~x') for dtype_x in int_dtypes -]) -def test_unary_op(dtype_x, expr, device='cuda'): - _test_unary(dtype_x, expr, device=device) - -# ---------------- -# test math ops -# ---------------- - - -@pytest.mark.parametrize("dtype_x, expr", [(dtype_x, expr) for dtype_x in ["float32", "float64"] for expr in ['exp', 'log', 'cos', 'sin']]) -def test_math_op(dtype_x, expr, device='cuda'): - _test_unary(dtype_x, f'tl.{expr}(x)', f'np.{expr}(x) ', device=device) - -# ---------------- -# test abs -# ---------------- - - -@pytest.mark.parametrize("dtype_x", [ - (dtype_x) - for dtype_x in dtypes_with_bfloat16 -]) -def test_abs(dtype_x, device='cuda'): - _test_unary(dtype_x, 'tl.abs(x)', 'np.abs(x) ', device=device) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -def test_abs_f8(in_dtype): - - @triton.jit - def abs_kernel(Z, X, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - z = tl.abs(x) - tl.store(Z + off, z) - - f8_tensor = torch.tensor(range(-128, 128), dtype=torch.int8, device='cuda') - # f32_to_f8 doesn't handle nan, so we make sure f8_tensor doesn't contain any nan - all_exp_ones = (f8_tensor & 0b01111100) == 128 - 2**in_dtype.fp_mantissa_width - f8_tensor[all_exp_ones] = 0 - f8 = triton.reinterpret(f8_tensor, in_dtype) - n_elements = f8_tensor.numel() - out_f8 = torch.empty_like(f8_tensor) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - abs_kernel[(1,)](f8, triton.reinterpret(out_f8, in_dtype), n_elements) - - f32_tensor = convert_float_to_float32(f8_tensor, in_dtype) - expect = f32_tensor.abs() - actual_f8 = convert_float_to_float32(out_f8, in_dtype) - torch.testing.assert_allclose(expect, actual_f8) - - -# ---------------- -# test indexing -# ---------------- - - -def make_ptr_str(name, shape): - rank = len(shape) - offsets = [] - stride = 1 - for i in reversed(range(rank)): - idx = ', '.join([':' if ii == i else 'None' for ii in range(rank)]) - offsets += [f'tl.arange(0, {shape[i]})[{idx}]*{stride}'] - stride *= shape[i] 
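    # Illustrative example of the string this helper builds (assuming the
    # 32x32 shapes used below): make_ptr_str('X', [32, 32]) returns
    #   "X + tl.arange(0, 32)[None, :]*1 + tl.arange(0, 32)[:, None]*32"
    # i.e. a row-major offset expression, built innermost dimension first.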
- return f"{name} + {' + '.join(offsets)}" - - -# TODO: handle `%4 = triton_gpu.convert_layout %3 : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>`` -@pytest.mark.parametrize("expr, dtype_str", [ - (f'x[{s}]', d) - for s in ['None, :', ':, None', - 'None, :, :', - ':, :, None'] - for d in ['int32', 'uint32', 'uint16'] -]) -def test_index1d(expr, dtype_str, device='cuda'): - rank_x = expr.count(':') - rank_y = expr.count(',') + 1 - shape_x = [32 for _ in range(rank_x)] - shape_z = [32 for _ in range(rank_y)] - shape_z_rank_mismatch = [32 for _ in range(rank_y + 1)] - shape_z_dim_mismatch = [64 for _ in range(rank_y)] - - # Triton kernel - @triton.jit - def kernel(Z, X, SIZE: tl.constexpr): - m = tl.arange(0, SIZE) - n = tl.arange(0, SIZE) - x = tl.load(X_PTR_EXPR) - z = GENERATE_TEST_HERE - tl.store(Z_PTR_EXPR, z) - - def generate_kernel(shape_x, shape_z): - to_replace = { - 'X_PTR_EXPR': make_ptr_str('X', shape_x), - 'Z_PTR_EXPR': make_ptr_str('Z', shape_z), - 'GENERATE_TEST_HERE': expr, - } - return patch_kernel(kernel, to_replace) - - kernel_match = generate_kernel(shape_x, shape_z) - kernel_dim_mismatch = generate_kernel(shape_x, shape_z_dim_mismatch) - kernel_rank_mismatch = generate_kernel(shape_x, shape_z_rank_mismatch) - - # torch result - x = numpy_random(shape_x, dtype_str=dtype_str) - y = np.zeros(shape_z, dtype=getattr(np, dtype_str)) - z_ref = eval(expr) + y - # triton result - z_tri = to_triton(np.empty_like(z_ref), device=device) - x_tri = to_triton(x) - kernel_match[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) - # compare - assert (z_ref == to_numpy(z_tri)).all() - - def catch_compilation_error(kernel): - try: - kernel[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) - except triton.CompilationError as e: - np.testing.assert_(True) - except BaseException: - np.testing.assert_(False) - - catch_compilation_error(kernel_dim_mismatch) - catch_compilation_error(kernel_rank_mismatch) - - -# --------------- -# test tuples -# --------------- - - -@triton.jit -def tuples_fn(a, b): - return a + b, \ - a - b, \ - a * b - - -def test_tuples(): - device = 'cuda' - - @triton.jit - def with_fn(X, Y, A, B, C): - x = tl.load(X) - y = tl.load(Y) - a, b, c = tuples_fn(x, y) - tl.store(A, a) - tl.store(B, b) - tl.store(C, c) - - @triton.jit - def without_fn(X, Y, A, B, C): - x = tl.load(X) - y = tl.load(Y) - a, b, c = x + y, x - y, x * y - tl.store(A, a) - tl.store(B, b) - tl.store(C, c) - - x = torch.tensor([1.3], device=device, dtype=torch.float32) - y = torch.tensor([1.9], device=device, dtype=torch.float32) - a_tri = torch.tensor([0], device=device, dtype=torch.float32) - b_tri = torch.tensor([0], device=device, dtype=torch.float32) - c_tri = torch.tensor([0], device=device, dtype=torch.float32) - for kernel in [with_fn, without_fn]: - kernel[(1, )](x, y, a_tri, b_tri, c_tri, num_warps=1) - a_ref, b_ref, c_ref = x + y, x - y, x * y - assert a_tri == a_ref - assert b_tri == b_ref - assert c_tri == c_ref - - -@triton.jit(noinline=True) -def noinline_simple_fn(x, y, Z): - z = x + y - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_call_graph_fn1(x): - return x + 1 - - -@triton.jit(noinline=True) -def noinline_call_graph_fn2(y): - return y + 2 - - -@triton.jit(noinline=True) -def noinline_call_graph_fn(x, y, Z): - t0 = noinline_call_graph_fn1(x) - t1 = noinline_call_graph_fn2(y) - z = t0 + t1 - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_shared_fn(x, y, Z): - offs = tl.arange(0, 16)[:, None] * 16 + 
tl.arange(0, 16)[None, :] - z = tl.load(Z + offs) - z = tl.dot(z, z) + x + y - tl.store(Z + offs, z) - - -@triton.jit(noinline=True) -def noinline_dynamic_fn(x, y, Z): - if x >= 1: - x = noinline_call_graph_fn1(x) - else: - x = noinline_call_graph_fn2(x) - if y >= 2: - y = noinline_call_graph_fn2(y) - else: - y = noinline_call_graph_fn1(y) - z = x + y - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_call_multi_values_fn(x, y): - return x + 1, y + 2 - - -@triton.jit(noinline=True) -def noinline_multi_values_fn(x, y, Z): - x, y = noinline_call_multi_values_fn(x, y) - z = x + y - tl.store(Z, z) - - -@pytest.mark.parametrize("mode", ["simple", "call_graph", "shared", "dynamic", "multi_values"]) -def test_noinline(mode): - device = 'cuda' - - @triton.jit - def kernel(X, Y, Z): - x = tl.load(X) - y = tl.load(Y) - GENERATE_TEST_HERE(x, y, Z) - - func_name = f'noinline_{mode}_fn' - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': func_name}) - x = torch.tensor([1.0], device=device, dtype=torch.float32) - y = torch.tensor([2.0], device=device, dtype=torch.float32) - if mode == "shared": - z = torch.ones((16, 16), device=device, dtype=torch.float32) - else: - z = torch.tensor([0.0], device=device, dtype=torch.float32) - kernel[(1,)](x, y, z, num_warps=1) - if mode == "simple": - assert torch.equal(z, x + y) - elif mode == "call_graph" or mode == "dynamic" or mode == "multi_values": - assert torch.equal(z, x + 1 + y + 2) - elif mode == "shared": - ref = torch.full((16, 16), 16, device=device, dtype=torch.float32) - assert torch.equal(z, ref + x + y) - - -# --------------- -# test atomics -# --------------- -@pytest.mark.parametrize("op, dtype_x_str, mode", itertools.chain.from_iterable([ - [ - ('add', 'float16', mode), - ('add', 'uint32', mode), ('add', 'int32', mode), ('add', 'float32', mode), - ('max', 'uint32', mode), ('max', 'int32', mode), ('max', 'float32', mode), - ('min', 'uint32', mode), ('min', 'int32', mode), ('min', 'float32', mode), - ] - for mode in ['all_neg', 'all_pos', 'min_neg', 'max_pos']])) -def test_atomic_rmw(op, dtype_x_str, mode, device='cuda'): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - if dtype_x_str == 'float16': - pytest.skip("Only test atomic float16 ops on devices with sm >= 70") - n_programs = 5 - - # triton kernel - @triton.jit - def kernel(X, Z): - pid = tl.program_id(0) - x = tl.load(X + pid) - old = GENERATE_TEST_HERE - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.atomic_{op}(Z, x)'}) - numpy_op = {'add': np.sum, 'max': np.max, 'min': np.min}[op] - max_neutral = float('-inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).min - min_neutral = float('inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).max - neutral = {'add': 0, 'max': max_neutral, 'min': min_neutral}[op] - - # triton result - rs = RandomState(17) - x = np.array([2**i for i in range(n_programs)], dtype=getattr(np, dtype_x_str)) - if mode == 'all_neg': - x = -np.abs(x) - if mode == 'all_pos': - x = np.abs(x) - if mode == 'min_neg': - idx = rs.randint(n_programs, size=(1, )).item() - x[idx] = -np.max(np.abs(x)) - 1 - if mode == 'max_pos': - idx = rs.randint(n_programs, size=(1, )).item() - x[idx] = np.max(np.abs(x)) + 1 - x_tri = to_triton(x, device=device) - - z_tri = to_triton(np.array([neutral], dtype=getattr(np, dtype_x_str)), device=device) - kernel[(n_programs, )](x_tri, z_tri) - # torch result - z_ref = numpy_op(x).astype(getattr(np, dtype_x_str)) - # compare - exact = op not in 
['add'] - if exact: - assert z_ref.item() == to_numpy(z_tri).item() - else: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - - -def test_atomic_rmw_predicate(device="cuda"): - @triton.jit - def kernel(X): - val = tl.program_id(0) - if val < 64: - tl.atomic_max(X, val) - x = torch.zeros((1,), device=device, dtype=torch.int32) - kernel[(4096,)](x) - assert x.item() == 63 - - -@pytest.mark.parametrize("shape, axis", - [(shape, axis) for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32)] for axis in [0, 1]]) -def test_tensor_atomic_rmw(shape, axis, device="cuda"): - shape0, shape1 = shape - # triton kernel - - @triton.jit - def kernel(Z, X, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): - off0 = tl.arange(0, SHAPE0) - off1 = tl.arange(0, SHAPE1) - x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :]) - z = tl.sum(x, axis=AXIS) - if AXIS == 1: - tl.atomic_add(Z + off0, z) - else: - tl.atomic_add(Z + off1, z) - rs = RandomState(17) - x = numpy_random((shape0, shape1), dtype_str="float32", rs=rs) - # reference result - z_ref = np.sum(x, axis=axis, keepdims=False) - # triton result - x_tri = to_triton(x, device=device) - z_shape = (shape0, ) if axis == 1 else (shape1, ) - z_tri = to_triton(np.zeros(z_shape, dtype="float32"), device=device) - kernel[(1,)](z_tri, x_tri, axis, shape0, shape1) - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4) - - -def test_tensor_atomic_rmw_block(device="cuda"): - shape = (8, 8) - - @triton.jit - def kernel(X, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): - off0 = tl.arange(0, SHAPE0) - off1 = tl.arange(0, SHAPE1) - offs = off0[:, None] * SHAPE1 + off1[None, :] - val = offs.to(tl.float32) - x = X + offs - tl.atomic_min(x, val) - x = torch.ones((8, 8), device=device, dtype=torch.float32) - kernel[(2,)](x, shape[0], shape[1]) - assert torch.min(x).item() == 0.0 - - -def test_atomic_cas(): - # 1. make sure that atomic_cas changes the original value (Lock) - @triton.jit - def change_value(Lock): - tl.atomic_cas(Lock, 0, 1) - - Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) - change_value[(1,)](Lock) - - assert (Lock[0] == 1) - - # 2. only one block enters the critical section - @triton.jit - def serialized_add(data, Lock): - ptrs = data + tl.arange(0, 128) - while tl.atomic_cas(Lock, 0, 1) == 1: - pass - - tl.store(ptrs, tl.load(ptrs) + 1.0) - - # release lock - tl.atomic_xchg(Lock, 0) - - Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) - data = torch.zeros((128,), device='cuda', dtype=torch.float32) - ref = torch.full((128,), 64.0) - serialized_add[(64,)](data, Lock) - np.testing.assert_allclose(to_numpy(data), to_numpy(ref)) - - -# --------------- -# test cast -# --------------- - - -@pytest.mark.parametrize("dtype_x, dtype_z, bitcast", [ - (dtype_x, dtype_z, False) - for dtype_x in dtypes - for dtype_z in dtypes -] + [ - ('float32', 'bfloat16', False), - ('bfloat16', 'float32', False), - ('float32', 'int32', True), - ('float32', 'int1', False), -] + [ - (f'uint{x}', f'int{x}', True) for x in [8, 16, 32, 64] -] + [ - (f'int{x}', f'uint{x}', True) for x in [8, 16, 32, 64] -]) -def test_cast(dtype_x, dtype_z, bitcast, device='cuda'): - # bfloat16 on cc < 80 will not be tested - check_type_supported(dtype_x) - check_type_supported(dtype_z) - - # This is tricky because numpy doesn't have bfloat, and torch doesn't have uints. 
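    # Illustration of the workaround, using the x0 values chosen just below:
    # the two backends cover each other's gaps, e.g.
    #   torch.tensor([43.5], dtype=torch.bfloat16)  # numpy has no bfloat16 dtype
    #   np.array([43], dtype=np.uint64)             # torch has no uint64 dtype
    # so bfloat16 casts are checked against torch, and everything else against numpy.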
- x0 = 43 if dtype_x in int_dtypes else 43.5 - if dtype_x in float_dtypes and dtype_z == 'int1': - x0 = 0.5 - if dtype_x.startswith('bfloat'): - x_tri = torch.tensor([x0], dtype=getattr(torch, dtype_x), device=device) - else: - x = np.array([x0], dtype=getattr(np, dtype_x)) - x_tri = to_triton(x) - - # triton kernel - @triton.jit - def kernel(X, Z, BITCAST: tl.constexpr): - x_ptr = X + tl.arange(0, 1) - z_ptr = Z + tl.arange(0, 1) - x = tl.load(x_ptr) - z = x.to(Z.dtype.element_ty, bitcast=BITCAST) - tl.store(z_ptr, z) - - dtype_z_np = dtype_z if dtype_z != 'int1' else 'bool_' - # triton result - if dtype_z.startswith('bfloat'): - z_tri = torch.empty((1,), dtype=getattr(torch, dtype_z), device=device) - else: - z_tri = to_triton(np.empty((1, ), dtype=getattr(np, dtype_z_np)), device=device) - kernel[(1, )](x_tri, z_tri, BITCAST=bitcast) - # torch result - if dtype_z.startswith('bfloat') or dtype_x.startswith('bfloat'): - assert bitcast is False - z_ref = x_tri.to(z_tri.dtype) - assert z_tri == z_ref - else: - if bitcast: - z_ref = x.view(getattr(np, dtype_z_np)) - else: - z_ref = x.astype(getattr(np, dtype_z_np)) - assert to_numpy(z_tri) == z_ref - - -@pytest.mark.parametrize("dtype_str", list(torch_dtypes)) -def test_store_constant(dtype_str): - check_type_supported(dtype_str) - - """Tests that boolean True is stored as 1""" - @triton.jit - def kernel(output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - output = GENERATE_TEST_HERE - tl.store(output_ptr + offsets, output, mask=mask) - - triton_dtype_str = 'uint8' if dtype_str == 'bool' else dtype_str - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.zeros([BLOCK_SIZE], dtype=tl.{triton_dtype_str}) + 1'}) - block_size = 128 - ref = torch.ones([block_size], dtype=getattr(torch, dtype_str), device='cuda') - output = torch.zeros([block_size], dtype=getattr(torch, dtype_str), device='cuda') - kernel[(1,)](output, block_size, BLOCK_SIZE=block_size) - - assert torch.all(output == ref) - - -def test_load_store_same_ptr(): - @triton.jit() - def kernel(in_out_ptr): - pid = tl.program_id(axis=0) - x = tl.load(in_out_ptr + pid) - out = x * 2 - tl.store(in_out_ptr + pid, out) - - for _ in range(1000): - x = torch.ones((65536,), device="cuda", dtype=torch.float32) - kernel[(65536,)](x, num_warps=32) - assert torch.all(x == 2) - - -def convert_float_to_float32(fp: torch.tensor, dtype=None): - if not dtype: - dtype = getattr(tl, torch_dtype_name(fp.dtype)) - - fp = fp.view(getattr(torch, f"int{dtype.primitive_bitwidth}")) - exp_width = dtype.primitive_bitwidth - dtype.fp_mantissa_width - 1 - exp_bias = 2 ** (exp_width - 1) - 1 - sign = ((fp >> (dtype.primitive_bitwidth - 1)) & 0x01).int() - exp = ((fp >> dtype.fp_mantissa_width) & ((1 << exp_width) - 1)).int() - frac = (fp & ((1 << dtype.fp_mantissa_width) - 1)).int() - - output = torch.where(exp == 0, - # subnormal - ((-1.0) ** sign) * (2.0 ** (1 - exp_bias)) * (frac / (2.0 ** dtype.fp_mantissa_width)), - # normal - ((-1.0) ** sign) * (2.0 ** (exp - exp_bias)) * (1.0 + frac / (2.0 ** dtype.fp_mantissa_width))).float() - - extended_exp = ((1 << (tl.float32.primitive_bitwidth - tl.float32.fp_mantissa_width - 1)) - 1) << tl.float32.fp_mantissa_width - # special cases, exp is 0b11..1 - if dtype == tl.float8e4: - # float8e4m3 does not have infinities - output[fp == torch.tensor(0b01111111, dtype=torch.int8)] = torch.nan - output[fp == torch.tensor(0b11111111, dtype=torch.int8)] = torch.nan - else: 
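        # exp == all ones encodes inf (frac == 0) or nan (frac != 0). The branch
        # below splices the original sign and mantissa bits next to a float32
        # exponent of all ones (extended_exp) so these special values are
        # reproduced bit-exactly instead of going through the arithmetic path.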
- output = torch.where(exp == (1 << exp_width) - 1, - ((sign << (tl.float32.primitive_bitwidth - 1)) | extended_exp | (frac << (tl.float32.fp_mantissa_width - dtype.fp_mantissa_width))).view(torch.float32), - output) - return output - - -@pytest.mark.parametrize("in_dtype", [torch.float16, torch.bfloat16]) -def test_convert_float16_to_float32(in_dtype): - """Tests that check convert_float_to_float32 function""" - check_type_supported(in_dtype) - - f16_input = torch.tensor(range(-int(2 ** (16 - 1)), int(2 ** (16 - 1))), dtype=torch.int16).view(in_dtype) - f32_output = convert_float_to_float32(f16_input) - - nan = f16_input.isnan() - assert torch.all(f32_output[nan].isnan()) - inf = f16_input.isinf() - assert torch.all(f32_output[inf].isinf()) - other = torch.logical_not(torch.logical_or(nan, inf)) - assert torch.all(f16_input[other] == f32_output[other]) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16, torch.float32]) -def test_f8_xf16_roundtrip(in_dtype, out_dtype): - """Tests that converting an f8 to f16 and back to f8 doesn't change its value""" - check_type_supported(out_dtype) - - @triton.jit - def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - input = tl.load(input_ptr + offsets, mask=mask) - output = input - tl.store(output_ptr + offsets, output, mask=mask) - - f8_tensor = torch.tensor(range(-128, 128), dtype=torch.int8, device='cuda') - # f32_to_f8 doesn't handle nan, so we make sure f8_tensor doesn't contain any nan - all_exp_ones = (f8_tensor & 0b01111100) == 128 - 2**in_dtype.fp_mantissa_width - f8_tensor[all_exp_ones] = 0 - f8 = triton.reinterpret(f8_tensor, in_dtype) - n_elements = f8_tensor.numel() - xf16 = torch.empty_like(f8_tensor, dtype=out_dtype) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - copy_kernel[grid](f8, xf16, n_elements, BLOCK_SIZE=1024) - - # exponent_mask = 0b01111100 for float8e5 - # exponent_mask = 0b01111000 for float8e4 - exponent_mask = 0b01111111 ^ ((1 << in_dtype.fp_mantissa_width) - 1) - normal = torch.logical_and((f8_tensor & exponent_mask) != 0, (f8_tensor & exponent_mask) != exponent_mask) - ref16 = convert_float_to_float32(f8_tensor, in_dtype) - # WARN: currently only normal float8s are handled - assert torch.all(xf16[normal] == ref16[normal]) - - f8_output_tensor = torch.empty_like(xf16, dtype=torch.int8) - f8_output = triton.reinterpret(f8_output_tensor, in_dtype) - copy_kernel[grid](xf16, f8_output, n_elements, BLOCK_SIZE=1024) - - assert torch.all(f8_tensor == f8_output_tensor) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) -def test_f16_to_f8_rounding(in_dtype, out_dtype): - """Takes all float16s, converts them to float8 and back to float16. Checks that the absolute - error is the minimum over all float8. 
- Or the same explanation a bit mathier: - for all f16 |f16 - fromf8(tof8(f16))| == min over all f8 |f16 - fromf8(f8)|""" - @triton.jit - def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - input = tl.load(input_ptr + offsets, mask=mask) - output = input - tl.store(output_ptr + offsets, output, mask=mask) - - i16_input = torch.tensor(range(-int(2 ** (16 - 1)), int(2 ** (16 - 1))), dtype=torch.int16, device='cuda') - f16_input = i16_input.view(out_dtype) - n_elements = f16_input.numel() - f8_output_tensor = torch.empty_like(f16_input, dtype=torch.int8) - f8_output = triton.reinterpret(f8_output_tensor, in_dtype) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - copy_kernel[grid](f16_input, f8_output, n_elements, BLOCK_SIZE=1024) - - f16_output = torch.empty_like(f16_input, dtype=out_dtype) - copy_kernel[grid](f8_output, f16_output, n_elements, BLOCK_SIZE=1024) - - abs_error = torch.abs(f16_input - f16_output) - - all_f8_vals_tensor = torch.tensor(range(2 ** 8), dtype=torch.uint8, device='cuda') - all_f8_vals = triton.reinterpret(all_f8_vals_tensor, in_dtype) - all_f8_vals_in_f16 = torch.empty_like(all_f8_vals_tensor, dtype=out_dtype) - copy_kernel[grid](all_f8_vals, all_f8_vals_in_f16, n_elements=256, BLOCK_SIZE=1024) - - all_finite_f8_vals_in_f16 = all_f8_vals_in_f16[ - torch.isfinite(all_f8_vals_in_f16) - ] - - min_error = torch.min( - torch.abs( - f16_input.reshape((-1, 1)) - - all_finite_f8_vals_in_f16.reshape((1, -1)) - ), - dim=1, - )[0] - - # WARN: only normalized numbers are handled - f8_normal_min = 1 << in_dtype.fp_mantissa_width # 0b00001000 for float8e4 - f8_normal_max = 0b01111110 if in_dtype == tl.float8e4 else 0b01111011 - f16_min, f16_max, f16_max_minus_1 = convert_float_to_float32(torch.tensor([f8_normal_min, f8_normal_max, f8_normal_max - 1], dtype=torch.int8), in_dtype) - assert torch.all(torch.isfinite(f16_min)) - assert torch.all(torch.isfinite(f16_max)) - thres_error = f16_max - f16_max_minus_1 - mismatch = torch.logical_and( - torch.logical_or(abs_error != min_error, abs_error > thres_error), torch.logical_and(torch.isfinite(f16_input), torch.logical_and(torch.abs(f16_input) <= f16_max, torch.abs(f16_input) >= f16_min)) - ) - assert torch.all( - torch.logical_not(mismatch) - ), f"f16_input[mismatch]={f16_input[mismatch]} f16_output[mismatch]={f16_output[mismatch]} abs_error[mismatch]={abs_error[mismatch]} min_error[mismatch]={min_error[mismatch]}" - - -# --------------- -# test reduce -# --------------- - - -def get_reduced_dtype(dtype_str, op): - if op in ('argmin', 'argmax'): - return 'int32' - if dtype_str in ['int8', 'uint8', 'int16', 'uint16']: - return 'int32' - if dtype_str == 'bfloat16': - return 'float32' - return dtype_str - - -@pytest.mark.parametrize("op, dtype_str, shape", - [(op, dtype, shape) - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for dtype in dtypes_with_bfloat16 - for shape in [32, 64, 128, 512]]) -def test_reduce1d(op, dtype_str, shape, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, Z, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.store(Z, GENERATE_TEST_HERE) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'}) - # input - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random((shape,), 
dtype_str=dtype_str, rs=rs) - x_tri = to_triton(x, device=device) - numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, - 'argmin': np.argmin, 'argmax': np.argmax}[op] - # numpy result - z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str - z_tri_dtype_str = z_dtype_str - if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': - z_dtype_str = 'float32' - z_ref = numpy_op(x).astype(getattr(np, z_dtype_str)) - # trunc mantissa for a fair comparison of accuracy - z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') - z_tri_dtype_str = 'bfloat16' - else: - z_ref = numpy_op(x).astype(getattr(np, z_dtype_str)) - # triton result - z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs), - device=device, dst_type=z_tri_dtype_str) - kernel[(1,)](x_tri, z_tri, BLOCK=shape) - z_tri = to_numpy(z_tri) - # compare - if op == 'sum': - np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) - else: - if op in ('argmin', 'argmax'): - # argmin and argmax can have multiple valid indices. - # so instead we compare the values pointed by indices - np.testing.assert_equal(x[z_ref], x[z_tri]) - else: - np.testing.assert_equal(z_ref, z_tri) - - -# TODO: [Qingyi] Fix argmin / argmax -reduce_configs1 = [ - (op, dtype, (1, 1024), axis) for dtype in dtypes_with_bfloat16 - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for axis in [1] -] - - -# shape (128, 256) and (32, 1024) are not enabled on sm86 because the required shared memory -# exceeds the limit of 99KB -reduce2d_shapes = [(2, 32), (4, 32), (4, 128)] -# TODO: fix and uncomment -# , (32, 64), (64, 128)] -if 'V100' in torch.cuda.get_device_name(0): - reduce2d_shapes += [(128, 256) and (32, 1024)] - - -reduce_configs2 = [ - (op, 'float32', shape, axis) - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for shape in reduce2d_shapes - for axis in [0, 1] -] - - -@pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2) -def test_reduce2d(op, dtype_str, shape, axis, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr): - range_m = tl.arange(0, BLOCK_M) - range_n = tl.arange(0, BLOCK_N) - x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :]) - z = GENERATE_TEST_HERE - if AXIS == 1: - tl.store(Z + range_m, z) - else: - tl.store(Z + range_n, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'}) - # input - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random(shape, dtype_str=dtype_str, rs=rs) - x_tri = to_triton(x) - numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, - 'argmin': np.argmin, 'argmax': np.argmax}[op] - z_dtype_str = get_reduced_dtype(dtype_str, op) - z_tri_dtype_str = z_dtype_str - # numpy result - if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': - z_dtype_str = 'float32' - z_tri_dtype_str = 'bfloat16' - z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) - # trunc mantissa for a fair comparison of accuracy - z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') - else: - z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) - # triton result - z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs), - device=device, dst_type=z_tri_dtype_str) - kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis) - z_tri = 
to_numpy(z_tri) - # compare - if op == 'sum': - np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) - else: - if op in ('argmin', 'argmax'): - # argmin and argmax can have multiple valid indices. - # so instead we compare the values pointed by indices - z_ref_index = np.expand_dims(z_ref, axis=axis) - z_tri_index = np.expand_dims(z_tri, axis=axis) - z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis) - z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis) - np.testing.assert_equal(z_ref_value, z_tri_value) - else: - np.testing.assert_equal(z_ref, z_tri) - - -layouts = [ - BlockedLayout([1, 4], [8, 4], [4, 1], [1, 0]), - BlockedLayout([1, 4], [8, 4], [4, 1], [0, 1]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M, N", [[128, 16], [128, 128], [32, 128]]) -@pytest.mark.parametrize("src_layout", layouts) -@pytest.mark.parametrize("axis", [0, 1]) -def test_reduce_layouts(M, N, src_layout, axis, device='cuda'): - rdims_2d = f"1x{N}" if axis == 0 else f"{M}x1" - rdims_1d = f"{N}" if axis == 0 else f"{M}" - store_range = "%7" if axis == 0 else "%1" - ir = f""" - #blocked = #triton_gpu.blocked<{{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}}> - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel_0d1d2c3d4c(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: i32 {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>> - %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>>) -> tensor<{M}x1xi32, #blocked> - %2 = tt.splat %arg1 : (i32) -> tensor<{M}x1xi32, #blocked> - %3 = arith.muli %1, %2 : tensor<{M}x1xi32, #blocked> - %4 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x1x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<{M}x1x!tt.ptr, #blocked>, tensor<{M}x1xi32, #blocked> - %6 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>> - %7 = tt.expand_dims %6 {{axis = 0 : i32}} : (tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>>) -> tensor<1x{N}xi32, #blocked> - %8 = tt.broadcast %5 : (tensor<{M}x1x!tt.ptr, #blocked>) -> tensor<{M}x{N}x!tt.ptr, #blocked> - %9 = tt.broadcast %7 : (tensor<1x{N}xi32, #blocked>) -> tensor<{M}x{N}xi32, #blocked> - %10 = tt.addptr %8, %9 : tensor<{M}x{N}x!tt.ptr, #blocked>, tensor<{M}x{N}xi32, #blocked> - %11 = tt.splat %arg2 : (!tt.ptr) -> tensor<{rdims_2d}x!tt.ptr, #blocked> - %12 = tt.addptr %11, {store_range} : tensor<{rdims_2d}x!tt.ptr, #blocked>, tensor<{rdims_2d}xi32, #blocked> - %13 = tt.load %10 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xf32, #blocked> - %14 = triton_gpu.convert_layout %13 : (tensor<{M}x{N}xf32, #blocked>) -> tensor<{M}x{N}xf32, #src> - %15 = "tt.reduce"(%14) ({{ - ^bb0(%arg3: f32, %arg4: f32): - %16 = "triton_gpu.cmpf"(%arg3, %arg4) {{predicate = 2 : i64}} : (f32, f32) -> i1 - %17 = arith.select %16, %arg3, %arg4 : f32 - tt.reduce.return %17 : f32 - }}) {{axis = {axis} : i32}} : (tensor<{M}x{N}xf32, #src>) -> tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #src}}>> - %18 = triton_gpu.convert_layout %15 : (tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #src}}>>) -> tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #blocked}}>> - %19 
= tt.expand_dims %18 {{axis = {axis} : i32}} : (tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #blocked}}>>) -> tensor<{rdims_2d}xf32, #blocked> - tt.store %12, %19 {{cache = 1 : i32, evict = 1 : i32}} : tensor<{rdims_2d}xf32, #blocked> - tt.return - }} - }} - """ - - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, N)).astype('float32') - x = (x.view('uint32') & np.uint32(0xffffe000)).view('float32') - - if axis == 0: - z = np.zeros((1, N)).astype('float32') - else: - z = np.zeros((M, 1)).astype('float32') - - x_tri = torch.tensor(x, device=device) - z_tri = torch.tensor(z, device=device) - - pgm = kernel[(1, 1, 4)](x_tri, x_tri.stride(0), z_tri) - - z_ref = np.max(x, axis=axis, keepdims=True) - - np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M", [32, 64, 128, 256]) -@pytest.mark.parametrize("src_layout", layouts) -def test_store_op(M, src_layout, device='cuda'): - ir = f""" - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %1 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %2 = tt.addptr %1, %0 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = 1, parent = #src}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %3 = tt.load %2 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}xf32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %4 = tt.expand_dims %3 {{axis = 1 : i32}} : (tensor<{M}xf32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xf32, #src> - %5 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %6 = tt.expand_dims %5 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src> - %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<{M}x1x!tt.ptr, #src> - %8 = tt.addptr %7, %6 : tensor<{M}x1x!tt.ptr, #src>, tensor<{M}x1xi32, #src> - tt.store %8, %4 : tensor<{M}x1xf32, #src> - tt.return - }} - }} - """ - - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - store_kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, 1)).astype('float32') - y = np.zeros((M, 1), dtype='float32') - x_tri = torch.tensor(x, device=device) - y_tri = torch.tensor(y, device=device) - - pgm = store_kernel[(1, 1, 1)](x_tri, y_tri) - y_ref = x - np.testing.assert_allclose(y_ref, y_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M", [64, 128, 256]) -@pytest.mark.parametrize("src_layout", layouts) -@pytest.mark.parametrize("dst_layout", layouts) -@pytest.mark.parametrize("src_dim", [0, 1]) -@pytest.mark.parametrize("dst_dim", [0, 1]) -def test_convert1d(M, 
src_layout, dst_layout, src_dim, dst_dim, device='cuda'): - ir = f""" - #dst = {dst_layout} - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %2 = tt.addptr %0, %1 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %3 = tt.load %2 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %4 = tt.splat %arg1 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %5 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %6 = tt.addptr %4, %5 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %7 = triton_gpu.convert_layout %3 : (tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - tt.store %6, %7 : tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - tt.return - }} - }} - """ - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, )).astype('int32') - y = np.zeros((M, ), dtype='int32') - x_tri = torch.tensor(x, device=device) - y_tri = torch.tensor(y, device=device) - pgm = kernel[(1, 1, 1)](x_tri, y_tri) - y_ref = x - np.testing.assert_allclose(y_ref, y_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -@triton.jit -def _welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): - delta = mean_2 - mean_1 - new_weight = weight_1 + weight_2 - w2_over_w = weight_2 / new_weight - return ( - mean_1 + delta * w2_over_w, - m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, - new_weight, - ) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - BlockedLayout([1, 4], [1, 32], [1, 4], [1, 0]), - BlockedLayout([1, 4], [8, 4], [2, 2], [0, 1]) -] - - -@pytest.mark.parametrize("M, N", [[128, 128], [256, 128], [256, 256], [128, 256]]) -@pytest.mark.parametrize("src_layout", layouts) -def test_chain_reduce(M, N, src_layout, device='cuda'): - ir = f""" - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %cst = arith.constant dense<{N}> : tensor<{M}x1xi32, #src> - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src> - %2 = arith.muli %1, %cst : tensor<{M}x1xi32, #src> - %3 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> - %4 = tt.expand_dims %3 {{axis = 0 : i32}} : (tensor<{N}xi32, 
#triton_gpu.slice<{{dim = 0, parent = #src}}>>) -> tensor<1x{N}xi32, #src> - %5 = tt.broadcast %2 : (tensor<{M}x1xi32, #src>) -> tensor<{M}x{N}xi32, #src> - %6 = tt.broadcast %4 : (tensor<1x{N}xi32, #src>) -> tensor<{M}x{N}xi32, #src> - %7 = arith.addi %5, %6 : tensor<{M}x{N}xi32, #src> - %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x{N}x!tt.ptr, #src> - %9 = tt.addptr %8, %7 : tensor<{M}x{N}x!tt.ptr, #src>, tensor<{M}x{N}xi32, #src> - %10 = tt.load %9 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xi32, #src> - %11 = "tt.reduce"(%10) ({{ - ^bb0(%arg2: i32, %arg3: i32): - %13 = arith.addi %arg2, %arg3 : i32 - tt.reduce.return %13 : i32 - }}) {{axis = 1 : i32}} : (tensor<{M}x{N}xi32, #src>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %12 = "tt.reduce"(%11) ({{ - ^bb0(%arg2: i32, %arg3: i32): - %13 = arith.addi %arg2, %arg3 : i32 - tt.reduce.return %13 : i32 - }}) {{axis = 0 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> i32 - tt.store %arg1, %12 {{cache = 1 : i32, evict = 1 : i32}} : i32 - tt.return - }} - }} - """ - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, N)).astype('int32') - - z = np.zeros((1,)).astype('int32') - - x_tri = torch.tensor(x, device=device) - z_tri = torch.tensor(z, device=device) - - pgm = kernel[(1, 1, 1)](x_tri, z_tri) - z_ref = np.sum(x) - - np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -def test_generic_reduction(device='cuda'): - - @triton.jit - def var_mean_kernel(X, out_mean, out_var, BLOCK: tl.constexpr): - xindex = tl.arange(0, BLOCK) - x = tl.load(X + xindex) - mean = x - m2 = tl.zeros_like(x) - weight = tl.full(x.shape, 1, x.dtype) - (mean, m2, weight) = tl.reduce((mean, m2, weight), 0, _welford_combine) - tl.store(out_mean, mean) - tl.store(out_var, m2 / weight) - - SIZE = 512 - x = torch.rand(SIZE, device=device) - out_mean = torch.empty((), device=device) - out_var = torch.empty((), device=device) - - var_mean_kernel[(1,)](x, out_mean, out_var, BLOCK=SIZE) - - expect_var, expect_mean = torch.var_mean(x, dim=0, correction=0) - torch.testing.assert_close(out_mean, expect_mean) - torch.testing.assert_close(out_var, expect_var) - - -# --------------- -# test permute -# --------------- - - -@pytest.mark.parametrize("dtype_str, shape, perm", - [(dtype, shape, perm) - # TODO: bfloat16 - for dtype in ['float16', 'float32'] - for shape in [(64, 64), (128, 128)] - for perm in [(1, 0)]]) -def test_permute(dtype_str, shape, perm, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, stride_xm, stride_xn, - Z, stride_zm, stride_zn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): - off_m = tl.arange(0, BLOCK_M) - off_n = tl.arange(0, BLOCK_N) - Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * stride_xn - Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn - tl.store(Zs, tl.load(Xs)) - # input - x = numpy_random(shape, dtype_str=dtype_str) - # triton result - z_tri = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) - z_tri_contiguous = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) - x_tri = to_triton(x, device=device, dst_type=dtype_str) - pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), - z_tri, z_tri.stride(1), z_tri.stride(0), - BLOCK_M=shape[0], 
BLOCK_N=shape[1]) - pgm_contiguous = kernel[(1, 1)](x_tri, x_tri.stride(1), x_tri.stride(0), - z_tri_contiguous, z_tri_contiguous.stride(0), z_tri_contiguous.stride(1), - BLOCK_M=shape[0], BLOCK_N=shape[1]) - # numpy result - z_ref = x.transpose(*perm) - # compare - np.testing.assert_allclose(to_numpy(z_tri), z_ref) - np.testing.assert_allclose(to_numpy(z_tri_contiguous), z_ref) - # parse ptx to make sure ld/st are vectorized - ptx = pgm.asm['ptx'] - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - ptx = pgm_contiguous.asm['ptx'] - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - -# --------------- -# test dot -# --------------- - - -@pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, in_dtype, out_dtype", - [(*shape, 4, False, False, epilogue, allow_tf32, in_dtype, out_dtype) - for shape in [(64, 64, 64), (16, 16, 16)] - for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot'] - for allow_tf32 in [True, False] - for in_dtype, out_dtype in [('float16', 'float16'), - ('float16', 'float32'), - ('float32', 'float32')] - if not (allow_tf32 and (in_dtype in ['float16']))] + - - [(*shape_nw, col_a, col_b, 'none', allow_tf32, in_dtype, out_dtype) - for shape_nw in [[128, 256, 32, 8], - [128, 16, 32, 4], - [32, 128, 64, 4], - [128, 128, 64, 4], - [64, 128, 128, 4], - [32, 128, 64, 2], - [64, 64, 32, 4], - [32, 32, 128, 16], - [128, 128, 64, 2], - [64, 128, 128, 2]] - for allow_tf32 in [True] - for col_a in [True, False] - for col_b in [True, False] - for in_dtype, out_dtype in [('int8', 'int8'), - ('float16', 'float16'), - ('float16', 'float32'), - ('float32', 'float32')]]) -def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, in_dtype, out_dtype, device='cuda'): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - if capability[0] < 8: - if in_dtype == 'int8': - pytest.skip("Only test int8 on devices with sm >= 80") - elif in_dtype == 'float32' and allow_tf32: - pytest.skip("Only test tf32 on devices with sm >= 80") - if capability[0] == 7: - if (M, N, K, num_warps) == (128, 256, 32, 8): - pytest.skip("shared memory out of resource") - if out_dtype == 'float16': - # TODO: support out_dtype=float16 for tl.dot on V100 - pytest.skip("Only test out_dtype=float16 on devices with sm >=80") - - torch.backends.cuda.matmul.allow_tf32 = allow_tf32 - - # triton kernel - @triton.jit - def kernel(X, stride_xm, stride_xk, - Y, stride_yk, stride_yn, - W, stride_wn, stride_wl, - Z, stride_zm, stride_zn, - out_dtype: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - ADD_MATRIX: tl.constexpr, ADD_ROWS: tl.constexpr, ADD_COLS: tl.constexpr, - ALLOW_TF32: tl.constexpr, - DO_SOFTMAX: tl.constexpr, CHAIN_DOT: tl.constexpr, - COL_A: tl.constexpr, COL_B: tl.constexpr): - off_m = tl.arange(0, BLOCK_M) - off_n = tl.arange(0, BLOCK_N) - off_l = tl.arange(0, BLOCK_N) - off_k = tl.arange(0, BLOCK_K) - Xs = X + off_m[:, None] * stride_xm + off_k[None, :] * stride_xk - Ys = Y + off_k[:, None] * stride_yk + off_n[None, :] * stride_yn - Ws = W + off_n[:, None] * stride_wn + off_l[None, :] * stride_wl - Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn - x = tl.load(Xs) - y = tl.load(Ys) - z = tl.dot(x, y, allow_tf32=ALLOW_TF32, out_dtype=out_dtype) - if ADD_MATRIX: - z += tl.load(Zs) - if ADD_ROWS: - ZRs = Z + off_m * stride_zm - z += tl.load(ZRs)[:, None] - if ADD_COLS: - ZCs = Z + 
off_n * stride_zn - z += tl.load(ZCs)[None, :] - if DO_SOFTMAX: - max = tl.max(z, 1) - z = z - max[:, None] - num = tl.exp(z.to(tl.float32)).to(max.dtype) - den = tl.sum(num, 1) - z = num / den[:, None] - if CHAIN_DOT: - w = tl.load(Ws) - z = tl.dot(z.to(w.dtype), w, out_dtype=out_dtype) - tl.store(Zs, z) - # input - rs = RandomState(17) - if col_a: - x = numpy_random((K, M), dtype_str=in_dtype, rs=rs).T - else: - x = numpy_random((M, K), dtype_str=in_dtype, rs=rs) - if col_b: - y = numpy_random((N, K), dtype_str=in_dtype, rs=rs).T - else: - y = numpy_random((K, N), dtype_str=in_dtype, rs=rs) - w = numpy_random((N, N), dtype_str=in_dtype, rs=rs) - if 'int' not in in_dtype: - x *= .1 - y *= .1 - if in_dtype == 'float32' and allow_tf32: - x = (x.view('uint32') & np.uint32(0xffffe000)).view('float32') - y = (y.view('uint32') & np.uint32(0xffffe000)).view('float32') - w = (w.view('uint32') & np.uint32(0xffffe000)).view('float32') - x_tri = to_triton(x, device=device) - y_tri = to_triton(y, device=device) - w_tri = to_triton(w, device=device) - # triton result - if out_dtype == 'int8': - z = 1 + numpy_random((M, N), dtype_str='int32', rs=rs) - else: - z = 1 + numpy_random((M, N), dtype_str=in_dtype, rs=rs) * .1 - - z_tri = to_triton(z, device=device) - if epilogue == 'trans': - z_tri = torch.as_strided(z_tri, (M, N), z_tri.stride()[::-1]) - - if out_dtype == 'int8': - out_dtype = tl.int8 - elif out_dtype == 'float16' and epilogue != 'softmax': - # TODO: for out_dtype == 'float16' and epilogue == 'softmax', it will - # fail with the following error: 'llvm.fmul' op requires the same type - # for all operands and results - out_dtype = tl.float16 - else: - out_dtype = tl.float32 - - pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), - y_tri, y_tri.stride(0), y_tri.stride(1), - w_tri, w_tri.stride(0), w_tri.stride(1), - z_tri, z_tri.stride(0), z_tri.stride(1), - out_dtype, - COL_A=col_a, COL_B=col_b, - BLOCK_M=M, BLOCK_K=K, BLOCK_N=N, - ADD_MATRIX=epilogue == 'add-matrix', - ADD_ROWS=epilogue == 'add-rows', - ADD_COLS=epilogue == 'add-cols', - DO_SOFTMAX=epilogue == 'softmax', - CHAIN_DOT=epilogue == 'chain-dot', - ALLOW_TF32=allow_tf32, - num_warps=num_warps) - # torch result - if in_dtype == 'int8': - z_ref = np.matmul(x.astype(np.float32), - y.astype(np.float32())).astype(np.int32) - else: - z_ref = np.matmul(x, y) - - if epilogue == 'add-matrix': - z_ref += z - if epilogue == 'add-rows': - z_ref += z[:, 0][:, None] - if epilogue == 'add-cols': - z_ref += z[0, :][None, :] - if epilogue == 'softmax': - num = np.exp(z_ref - np.max(z_ref, axis=-1, keepdims=True)) - denom = np.sum(num, axis=-1, keepdims=True) - z_ref = num / denom - if epilogue == 'chain-dot': - z_ref = np.matmul(z_ref, w) - # compare - # print(z_ref[:,0], z_tri[:,0]) - if in_dtype == 'float32': - # XXX: Somehow there's a larger difference when we use float32 - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3) - elif out_dtype == tl.float16: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3) - else: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - # make sure ld/st are vectorized - ptx = pgm.asm['ptx'] - if (K > 16 or N > 16 or M > 16) and (M * N // (num_warps * 32) >= 4): - # XXX: skip small sizes because they are not vectorized - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - if in_dtype == 'float32' and allow_tf32: - assert 'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' in ptx - elif in_dtype == 'float32' and allow_tf32: - assert 
'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' not in ptx - elif in_dtype == 'int8': - assert 'mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32' in ptx - elif out_dtype == tl.float16: - assert 'mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16' in ptx - - -@pytest.mark.parametrize("dtype_str", int_dtypes + float_dtypes + ['bfloat16']) -def test_full(dtype_str): - dtype = getattr(torch, dtype_str) - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - @triton.jit - def kernel_static(out): - a = GENERATE_TEST_HERE - out_ptr = out + tl.arange(0, 128)[:] - tl.store(out_ptr, a) - - @triton.jit - def kernel_dynamic(out, val, dtype: tl.constexpr): - a = tl.full((128,), val, dtype) - out_ptr = out + tl.arange(0, 128)[:] - tl.store(out_ptr, a) - - kernel_static_patched = patch_kernel(kernel_static, {'GENERATE_TEST_HERE': f"tl.full((128,), 2, tl.{dtype_str})"}) - out_static = torch.zeros((128), dtype=dtype, device="cuda") - kernel_static_patched[(1,)](out_static) - out_dynamic = torch.zeros((128), dtype=dtype, device="cuda") - kernel_dynamic[(1,)](out_dynamic, 2, getattr(triton.language, dtype_str)) - assert torch.all(out_static == 2) - assert torch.all(out_dynamic == 2) - - -@pytest.mark.parametrize("literal, dtype_str", - [(1e+50, "f64"), (1e+10, "f32"), (1.0, "f32"), - ('float("inf")', "f32"), ('float("-inf")', "f32"), - ('float("nan")', "f32"), ('float("-nan")', "f32"), - (0., "f32"), - (5, "i32"), (2**40, "i64"),]) -def test_constexpr(literal, dtype_str): - @triton.jit - def kernel(out_ptr): - val = GENERATE_TEST_HERE - tl.store(out_ptr.to(tl.pointer_type(val.dtype)), val) - - kernel_patched = patch_kernel(kernel, {'GENERATE_TEST_HERE': f"{literal}"}) - out = torch.zeros((1,), dtype=torch.float32, device="cuda") - h = kernel_patched[(1,)](out) - assert re.search(r"arith.constant .* : " + dtype_str, h.asm["ttir"]) is not None - -# TODO: uncomment once DotOperandEncoding::getElemsPerThread is implemented -# @pytest.mark.parametrize("dtype_str", ['float32', 'float16']) -# def test_dot_without_load(dtype_str): -# @triton.jit -# def _kernel(out): -# a = GENERATE_TEST_HERE -# b = GENERATE_TEST_HERE -# c = tl.dot(a, b) -# out_ptr = out + tl.arange(0, 32)[:, None] * 32 + tl.arange(0, 32)[None, :] -# tl.store(out_ptr, c) - -# kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.full((32, 32), 1.0, tl.{dtype_str})"}) -# a = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# b = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# out_ref = torch.matmul(a, b) -# out = torch.zeros((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# kernel[(1,)](out) -# assert torch.all(out == out_ref) - -# --------------- -# test arange -# --------------- - - -@pytest.mark.parametrize("start", [0, 1, 7, 16]) -def test_arange(start, device='cuda'): - BLOCK = 128 - z_tri = torch.empty(BLOCK, dtype=torch.int32, device=device) - - @triton.jit - def _kernel(z, BLOCK: tl.constexpr, - START: tl.constexpr, END: tl.constexpr): - off = tl.arange(0, BLOCK) - val = tl.arange(START, END) - tl.store(z + off, val) - _kernel[(1,)](z_tri, START=start, END=start + BLOCK, BLOCK=BLOCK) - z_ref = torch.arange(start, BLOCK + start, dtype=torch.int32, device=device) - np.testing.assert_allclose(to_numpy(z_tri), to_numpy(z_ref)) - -# --------------- -# test load -# --------------- - - -@pytest.mark.parametrize("dtype_str, size, size_diff", [(dtype_str, size, size_diff) for dtype_str in torch_dtypes for size in [128, 512] for size_diff in [0, 1, 2, 3, 4]]) 
-def test_masked_load(dtype_str, size, size_diff, device='cuda'): - dtype = getattr(torch, dtype_str) - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - input_size = size - size_diff - output_size = size - if dtype_str == 'bool': - input = torch.randint(0, 2, (input_size,), dtype=dtype, device=device) - elif dtype_str in int_dtypes or dtype_str in uint_dtypes: - input = torch.randint(0, 127, (input_size,), dtype=dtype, device=device) - else: - input = torch.rand(input_size, dtype=dtype, device=device) - output = torch.zeros((output_size,), dtype=dtype, device=device) - - @triton.jit - def _kernel(in_ptr, out_ptr, in_size: tl.constexpr, out_size: tl.constexpr): - in_offsets = tl.arange(0, out_size) - # Load inputs. - x = GENERATE_TEST_HERE - # Store output - output_offsets = tl.arange(0, out_size) - tl.store(out_ptr + output_offsets, x) - - mask_str = "mask=in_offsets < in_size, other=1" if size_diff > 0 else "None" - kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.load(in_ptr + in_offsets, {mask_str})"}) - kernel[(1,)](input, output, input_size, output_size) - - reference_out = torch.cat((input, torch.ones((size_diff,), dtype=dtype, device=device))) - # print((output - reference_out).nonzero()) - torch.testing.assert_allclose(output, reference_out) - -# Testing masked loads with an intermate copy to shared memory run. - - -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) -def test_masked_load_shared_memory(dtype, device='cuda'): - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - M = 32 - N = 32 - K = 16 - - in1 = torch.rand((M, K), dtype=dtype, device=device) - in2 = torch.rand((K, N), dtype=dtype, device=device) - out = torch.zeros((M, N), dtype=dtype, device=device) - - @triton.jit - def _kernel(in1_ptr, in2_ptr, output_ptr, - in_stride, in2_stride, out_stride, - in_numel, in2_numel, out_numel, - M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - - M_offsets = tl.arange(0, M) - N_offsets = tl.arange(0, N) - K_offsets = tl.arange(0, K) - - in_offsets = M_offsets[:, None] * in_stride + K_offsets[None, :] - in2_offsets = K_offsets[:, None] * in2_stride + N_offsets[None, :] - - # Load inputs. - x = tl.load(in1_ptr + in_offsets, mask=in_offsets < M * K) - w = tl.load(in2_ptr + in2_offsets, mask=in2_offsets < K * N) - - # Without a dot product the memory doesn't get promoted to shared. 
- o = tl.dot(x, w, out_dtype=tl.float32) - - # Store output - output_offsets = M_offsets[:, None] * out_stride + N_offsets[None, :] - tl.store(output_ptr + output_offsets, o, mask=output_offsets < M * N) - - pgm = _kernel[(1,)](in1, in2, out, - in1.stride()[0], - in2.stride()[0], - out.stride()[0], - in1.numel(), - in2.numel(), - out.numel(), - M=M, N=N, K=K) - - reference_out = torch.matmul(in1, in2) - torch.testing.assert_allclose(out, reference_out, atol=1e-2, rtol=0) - - -@pytest.mark.parametrize("cache", ["", ".ca", ".cg"]) -def test_load_cache_modifier(cache): - src = torch.empty(128, device='cuda') - dst = torch.empty(128, device='cuda') - - @triton.jit - def _kernel(dst, src, CACHE: tl.constexpr): - offsets = tl.arange(0, 128) - x = tl.load(src + offsets, cache_modifier=CACHE) - tl.store(dst + offsets, x) - - pgm = _kernel[(1,)](dst, src, CACHE=cache) - ptx = pgm.asm['ptx'] - if cache == '': - assert 'ld.global.ca' not in ptx - assert 'ld.global.cg' not in ptx - if cache == '.cg': - assert 'ld.global.cg' in ptx - assert 'ld.global.ca' not in ptx - if cache == '.ca': - assert 'ld.global.ca' in ptx - assert 'ld.global.cg' not in ptx - - -@pytest.mark.parametrize("N", [16, 10, 11, 1024]) -def test_vectorization(N): - src = torch.empty(1024, device='cuda') - dst = torch.empty(1024, device='cuda') - - @triton.jit - def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - pgm = _kernel[(1,)](dst, src, N=N, BLOCK_SIZE=src.shape[0]) - ptx = pgm.asm["ptx"] - if N % 16 == 0: - assert "ld.global.v4.b32" in ptx - else: - assert "ld.global.b32" in ptx - # np.testing.assert_allclose(dst, src[:N]) - - -@pytest.mark.parametrize("has_hints", [False, True]) -def test_vectorization_hints(has_hints): - src = torch.empty(1024, device='cuda') - dst = torch.empty(1024, device='cuda') - off = torch.zeros(1, device='cuda', dtype=torch.int32) - - @triton.jit - def _kernel(dst, src, off, N, BLOCK_SIZE: tl.constexpr, HINT: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - offsets = offsets + tl.load(off) - if HINT: - tl.max_contiguous(tl.multiple_of(offsets, 1024), 1024) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - pgm = _kernel[(1,)](dst, src, off, N=1024, BLOCK_SIZE=src.shape[0], HINT=has_hints) - ptx = pgm.asm["ptx"] - if has_hints: - assert "ld.global.v4.b32" in ptx - else: - assert "ld.global.v4.b32" not in ptx - -# --------------- -# test store -# --------------- - -# --------------- -# test if -# --------------- - -# --------------- -# test for -# --------------- - -# --------------- -# test while -# --------------- - -# --------------- -# test default -# --------------- -# TODO: can't be local to test_default - - -@triton.jit -def _impl(value=10): - return value - - -def test_default(): - value = 5 - ret0 = torch.zeros(1, dtype=torch.int32, device='cuda') - ret1 = torch.zeros(1, dtype=torch.int32, device='cuda') - - @triton.jit - def _kernel(ret0, ret1, value): - tl.store(ret0, _impl()) - tl.store(ret1, _impl(value)) - - _kernel[(1,)](ret0, ret1, value) - assert ret0.item() == 10 - assert ret1.item() == value - -# --------------- -# test noop -# ---------------- - - -def test_noop(device='cuda'): - @triton.jit - def kernel(x): - pass - x = to_triton(numpy_random((1,), dtype_str='int32'), device=device) - kernel[(1, )](x) - - 
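For reference while reviewing this deletion: the test_vectorization_hints case above checks that the tl.max_contiguous / tl.multiple_of hints alone are enough for the backend to emit vectorized ld.global.v4.b32 loads. Below is a minimal standalone sketch of that same hint pattern, assuming a simple 1-D copy kernel; the kernel name, tensor sizes, and launch grid are illustrative assumptions and are not taken from the deleted file.

import torch
import triton
import triton.language as tl

@triton.jit
def copy_kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Divisibility / contiguity hints, mirroring the deleted test: with these
    # in place the test expects 'ld.global.v4.b32' in the generated PTX.
    tl.max_contiguous(tl.multiple_of(offsets, 1024), 1024)
    x = tl.load(src + offsets, mask=offsets < N)
    tl.store(dst + offsets, x, mask=offsets < N)

src = torch.randn(1024, device='cuda')
dst = torch.empty_like(src)
copy_kernel[(1,)](dst, src, 1024, BLOCK_SIZE=1024)

Without the hint line, the deleted test asserts that the vectorized form does not appear, since the compiler cannot prove the offsets are aligned and contiguous on its own.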
-@pytest.mark.parametrize("device", ['cuda', 'cpu', 'cpu_pinned']) -def test_pointer_arguments(device): - @triton.jit - def kernel(x): - pass - pin_memory = 'pinned' in device - x = torch.empty(1024, device=device.split('_')[0], pin_memory=pin_memory) - if device == "cpu": - with pytest.raises(ValueError): - kernel[(1,)](x) - else: - kernel[(1, )](x) - - -@pytest.mark.parametrize("value, value_type", [ - (-1, 'i32'), (0, 'i32'), (-2**31, 'i32'), (2**31 - 1, 'i32'), - (2**31, 'i64'), (2**32 - 1, 'i64'), (2**32, 'i64'), (2**63 - 1, 'i64'), - (-2**63, 'i64'), (2**63, 'u64'), (2**64 - 1, 'u64') -]) -def test_value_specialization(value: int, value_type: str, device='cuda') -> None: - spec_type = None - - def cache_hook(*args, **kwargs): - nonlocal spec_type - spec_type = kwargs["compile"]["signature"][0] - JITFunction.cache_hook = cache_hook - - @triton.jit - def kernel(VALUE, X): - pass - - x = torch.tensor([3.14159], device='cuda') - pgm = kernel[(1, )](value, x) - - JITFunction.cache_hook = None - assert spec_type == value_type - -# -------------------- -# value specialization -# -------------------- - - -@pytest.mark.parametrize( - "value, overflow", - [(2**64 - 1, False), (2**64, True), (-2**63, False), (-2**63 - 1, True)] -) -def test_value_specialization_overflow(value: int, overflow: bool, device='cuda') -> None: - - @triton.jit - def kernel(VALUE, X): - pass - - x = torch.tensor([3.14159], device='cuda') - - if overflow: - with pytest.raises(OverflowError): - kernel[(1, )](value, x) - else: - kernel[(1, )](value, x) - - -# ---------------- -# test constexpr -# ---------------- - -@pytest.mark.parametrize("op", ['+', '-', '*', '/', '%', '<', '>', '<<', '>>', '&', '^', '|']) -@pytest.mark.parametrize("is_lhs_constexpr", [False, True]) -@pytest.mark.parametrize("is_rhs_constexpr", [True, False]) -def test_bin_op_constexpr(op, is_lhs_constexpr, is_rhs_constexpr): - - @triton.jit - def kernel(Z, X, Y): - x = tl.load(X) - y = tl.load(Y) - z = GENERATE_TEST_HERE - tl.store(Z, z) - - if op in ['<<', '>>', '&', '^', '|']: # int op - x_str = "3" if is_lhs_constexpr else "x" - y_str = "4" if is_rhs_constexpr else "y" - x = numpy_random((1,), dtype_str="int32") - y = numpy_random((1,), dtype_str="int32") - else: - x_str = "3.14" if is_lhs_constexpr else "x" - y_str = "4.13" if is_rhs_constexpr else "y" - x = numpy_random((1,), dtype_str="float32") - y = numpy_random((1,), dtype_str="float32") - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f"{x_str} {op} {y_str}"}) - z = np.array(eval(f"{x_str} {op} {y_str}")) - x_tri = to_triton(x) - y_tri = to_triton(y) - z_tri = to_triton(np.empty((1,), dtype=z.dtype)) - kernel[(1,)](z_tri, x_tri, y_tri) - np.testing.assert_allclose(z, to_numpy(z_tri)) - - -def test_constexpr_shape(): - - @triton.jit - def kernel(X): - off = tl.arange(0, 128 + 128) - tl.store(X + off, off) - - x_tri = to_triton(np.empty((256, ), dtype=np.int32)) - kernel[(1,)](x_tri) - np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256)) - - -def test_constexpr_scalar_shape(): - - @triton.jit - def kernel(X, s): - off = tl.arange(0, 256) - val = off % (256 // s) - tl.store(X + off, val) - - x_tri = to_triton(np.empty((256, ), dtype=np.int32)) - kernel[(1,)](x_tri, 32) - np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256) % 8) - -# ------------- -# test call -# ------------- - - -@triton.jit -def val_multiplier(val, i): - return val * i - - -@triton.jit(noinline=True) -def val_multiplier_noinline(val, i): - return val * i - - -@triton.jit -def vecmul_kernel(ptr, n_elements, 
rep, type: tl.constexpr): - pid = tl.program_id(axis=0) - offsets = pid * 128 + tl.arange(0, 128) - mask = offsets < n_elements - vec = tl.load(ptr + offsets, mask=mask) - for i in range(1, rep): - if type == "inline": - vec = val_multiplier(vec, i) - else: - vec = val_multiplier_noinline(vec, i) - tl.store(ptr + offsets, vec, mask=mask) - - -@pytest.mark.parametrize("type", ["inline", "noinline"]) -def test_call(type): - - @triton.jit - def kernel(ptr, n_elements, num1, num2, type: tl.constexpr): - vecmul_kernel(ptr, n_elements, num1, type) - vecmul_kernel(ptr, n_elements, num2, type) - - size = 1024 - rand_val = numpy_random((size,), dtype_str="float32") - rand_val_tri = to_triton(rand_val, device='cuda') - err_msg = "" - try: - kernel[(size // 128,)](rand_val_tri, size, 3, 5, type) - except Exception as e: - err_msg = str(e) - - if type == "noinline": - assert err_msg is not "" - else: - ans = rand_val * 1 * 2 * 1 * 2 * 3 * 4 - np.testing.assert_equal(to_numpy(rand_val_tri), ans) - -# ------------- -# test if -# ------------- - - -@pytest.mark.parametrize("if_type", ["if", "if_exp", "if_and"]) -def test_if(if_type): - - @triton.jit - def kernel(Cond, XTrue, XFalse, Ret, IfType: tl.constexpr, BoolVar: tl.constexpr): - pid = tl.program_id(0) - cond = tl.load(Cond) - if IfType == "if": - if pid % 2 == 0: - tl.store(Ret, tl.load(XTrue)) - else: - tl.store(Ret, tl.load(XFalse)) - elif IfType == "if_exp": - tl.store(Ret, tl.load(XTrue)) if pid % 2 else tl.store(Ret, tl.load(XFalse)) - elif IfType == "if_and": - if BoolVar and pid % 2 == 0: - tl.store(Ret, tl.load(XTrue)) - else: - tl.store(Ret, tl.load(XFalse)) - - cond = torch.ones(1, dtype=torch.int32, device='cuda') - x_true = torch.tensor([3.14], dtype=torch.float32, device='cuda') - x_false = torch.tensor([1.51], dtype=torch.float32, device='cuda') - ret = torch.empty(1, dtype=torch.float32, device='cuda') - kernel[(1,)](cond, x_true, x_false, ret, if_type, True) - assert torch.equal(ret, x_true) - - -def test_num_warps_pow2(): - dst = torch.empty(128, device='cuda') - - @triton.jit - def _kernel(dst): - pass - - with pytest.raises(AssertionError, match='must be a power of 2'): - _kernel[(1,)](dst=dst, num_warps=3) - _kernel[(1,)](dst=dst, num_warps=1) - _kernel[(1,)](dst=dst, num_warps=2) - _kernel[(1,)](dst=dst, num_warps=4) - -# ------------- -# test extern -# ------------- - - -@pytest.mark.parametrize("dtype_str, expr, lib_path", - [('int32', 'math.ffs', ''), - ('float32', 'math.log2', ''), - ('float32', 'math.scalbn', ''), - ('float32', 'math.pow', tl.math.libdevice_path()), - ('float64', 'math.pow_dtype', tl.math.libdevice_path()), - ('float64', 'math.norm4d', '')]) -def test_math_tensor(dtype_str, expr, lib_path): - - @triton.jit - def kernel(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - y = GENERATE_TEST_HERE - tl.store(Y + tl.arange(0, BLOCK), y) - - shape = (128, ) - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random(shape, dtype_str=dtype_str, rs=rs) - - if expr == 'math.log2': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.broadcast_to(tl.{expr}(5.0), x.shape)'}) - y_ref = np.log2(5.0) - elif expr == 'math.ffs': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x)'}) - y_ref = np.zeros(shape, dtype=x.dtype) - for i in range(shape[0]): - y_ref[i] = (int(x[i]) & int(-x[i])).bit_length() - elif expr == 'math.scalbn': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, 2)'}) - y_ref = x * pow(2, 2) - elif 
expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.math.pow(x, 0.5)'}) - y_ref = np.power(x, 0.5) - elif expr == 'math.pow': - # numpy does not allow negative factors in power, so we use abs() - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, x)'}) - y_ref = np.power(x, x) - elif expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, 0.5)'}) - y_ref = np.power(x, 0.5) - elif expr == 'math.norm4d': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, x, x, x)'}) - y_ref = np.sqrt(4 * np.power(x, 2)) - - x_tri = to_triton(x) - # triton result - y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') - kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) - # compare - if expr == 'math.ffs': - np.testing.assert_equal(y_ref, to_numpy(y_tri)) - else: - np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) - - -@pytest.mark.parametrize("dtype_str, expr, lib_path", - [('float32', 'math.pow', ''), - ('float64', 'math.pow_dtype', ''), - ('float64', 'math.pow', tl.math.libdevice_path())]) -def test_math_scalar(dtype_str, expr, lib_path): - - @triton.jit - def kernel(X, Y, BLOCK: tl.constexpr): - x = X - y = GENERATE_TEST_HERE - tl.store(Y + tl.arange(0, BLOCK), y) - - shape = (128, ) - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random((1,), dtype_str=dtype_str, rs=rs) - y_ref = np.zeros(shape, dtype=x.dtype) - - # numpy does not allow negative factors in power, so we use abs() - if expr == 'math.pow': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, x)'}) - y_ref[:] = np.power(x, x) - elif expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, 0.5)'}) - y_ref[:] = np.power(x, 0.5) - - # triton result - x_tri = to_triton(x)[0].item() - y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') - kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) - # compare - np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) - -# ----------------------- -# test control flow -# ----------------------- - - -@pytest.mark.parametrize("lo, hi, iv", [(2**35, 2**35 + 20, 1), (2**35, 2**35 + 20, 2), (2**35, 2**35 + 20, 3), - (15, -16, -1), (15, -16, -2), (15, -16, -3), - (-18, -22, -1), (22, 18, -1)]) -def test_for_iv(lo, hi, iv): - - @triton.jit - def kernel(Out, lo, hi, iv: tl.constexpr): - acc = 0 - acc = acc.to(tl.int64) - for i in range(lo, hi, iv): - acc += i - tl.store(Out, acc) - - lo = 2**35 - hi = 2**35 + 20 - out = to_triton(np.zeros((1,), dtype=np.int64), device='cuda') - kernel[(1,)](out, lo, hi, iv) - assert out[0] == sum(range(lo, hi, iv)) - - -def test_if_else(): - - @triton.jit - def kernel(Cond, TrueVal, FalseVal, Out): - if tl.load(Cond): - val = tl.load(TrueVal) - else: - val = tl.load(FalseVal) - tl.store(Out, val) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - true_val = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - false_val = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') - cond = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - # True - cond[0] = True - kernel[(1,)](cond, true_val, false_val, out) - assert to_numpy(out)[0] == true_val[0] - # False - cond[0] = False - kernel[(1,)](cond, true_val, 
false_val, out) - assert to_numpy(out)[0] == false_val[0] - - -@pytest.mark.parametrize("mode", ["dynamic", "static"]) -def test_if_return(mode): - - @triton.jit - def kernel(ExitEarly, Out, cond: tl.constexpr, mode: tl.constexpr): - if mode == "dynamic": - if tl.load(ExitEarly): - tl.store(Out, 0) - return - else: - if cond: - tl.store(Out, 0) - return - tl.store(Out, 1) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - exit_early = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - # exit early path taken - exit_early[0] = 1 - kernel[(1,)](exit_early, out, True, mode) - assert to_numpy(out)[0] == 0 - # exit early path not taken - exit_early[0] = 0 - kernel[(1,)](exit_early, out, False, mode) - assert to_numpy(out)[0] == 1 - - -@triton.jit -def add_fn(x): - return x + 1 - - -@triton.jit(noinline=True) -def add_fn_noinline(x): - return x + 1 - - -@triton.jit -def add_fn_return(x, pid): - if pid == 0: - return x + 1 - else: - return x + 2 - - -@triton.jit -def add_fn_expr(Out, x): - tl.store(Out, x) - - -@triton.jit -def add_fn_static_cond(x, cond: tl.constexpr): - if cond == "": - return x - else: - return x + 1 - - -@pytest.mark.parametrize("call_type", ["attribute", "attribute_jit", - "jit", "jit_if", "jit_ifexp", "jit_expr", - "jit_static_cond", "jit_noinline", "jit_extern"]) -def test_if_call(call_type): - @triton.jit - def kernel(Out, call_type: tl.constexpr): - pid = tl.program_id(0) - o = tl.load(Out) - if call_type == "attribute": - # call attribute - if pid == 0: - a = o - a = a.to(tl.int32).to(tl.int32) + 1 - o = a - elif call_type == "attribute_jit": - # call attribute and jit function - if pid == 0: - a = o - a = tl.load(Out + add_fn(a) - 1).to(tl.int32) + 1 - o = a - elif call_type == "jit": - if pid == 0: - # regular function call - a = o - a = add_fn(a) - o = a - elif call_type == "jit_if": - # function without end_if block - if pid == 0: - a = o - a = add_fn_return(a, pid) - o = a - elif call_type == "jit_ifexp": - # ifexp expression - if pid == 0: - a = o - a = add_fn(a) if pid == 0 else add_fn_return(a, pid) - o = a - elif call_type == "jit_expr": - # call without return - if pid == 0: - a = o + 1 - add_fn_expr(Out, a) - o = a - elif call_type == "jit_static_cond": - if pid == 0: - a = o + 1 - add_fn_static_cond(o, call_type) - o = a - elif call_type == "jit_noinline": - if pid == 0: - a = o + 1 - add_fn_noinline(a) - o = a - elif call_type == "jit_extern": - if pid == 0: - a = o + 1 - tl.cdiv(a, a) - o = a - - tl.store(Out, o) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - kernel[(1,)](out, call_type) - assert to_numpy(out)[0] == 1 - - -@pytest.mark.parametrize("_cond1", [True, False]) -@pytest.mark.parametrize("_cond2", [True, False]) -@pytest.mark.parametrize("_cond3", [True, False]) -def test_nested_if_else_return(_cond1, _cond2, _cond3): - - @triton.jit - def kernel(Cond1, Cond2, Cond3, Val1, Val2, Val3, Out): - val = 0 - if tl.load(Cond1): - if tl.load(Cond2): - val = tl.load(Val1) - else: - return - else: - if tl.load(Cond3): - val = tl.load(Val2) - else: - val = tl.load(Val3) - tl.store(Out, val) - - out = to_triton(np.full((1,), -1, dtype=np.int32), device='cuda') - cond1 = to_triton(np.full((1,), _cond1, dtype=np.int32), device='cuda') - cond2 = to_triton(np.full((1,), _cond2, dtype=np.int32), device='cuda') - cond3 = to_triton(np.full((1,), _cond3, dtype=np.int32), device='cuda') - val1 = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - val2 = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') 
- val3 = to_triton(np.full((1,), 3, dtype=np.int32), device='cuda') - kernel[(1,)](cond1, cond2, cond3, val1, val2, val3, out) - targets = { - (True, True, True): val1[0], - (True, True, False): val1[0], - (True, False, True): out[0], - (True, False, False): out[0], - (False, True, True): val2[0], - (False, True, False): val3[0], - (False, False, True): val2[0], - (False, False, False): val3[0], - } - assert out[0] == targets[(_cond1, _cond2, _cond3)] - - -def test_while(): - - @triton.jit - def kernel(InitI, Bound, CutOff, OutI, OutJ): - init_i = tl.load(InitI) - curr_i = init_i - j = 0 - while curr_i == init_i and j < tl.load(Bound): - curr_i = curr_i + (j == tl.load(CutOff)) - j += 1 - tl.store(OutI, curr_i) - tl.store(OutJ, j) - - out_i = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - out_j = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - init_i = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - bound = to_triton(np.full((1,), 10, dtype=np.int32), device='cuda') - cut_off = to_triton(np.full((1,), 5, dtype=np.int32), device='cuda') - kernel[(1,)](init_i, bound, cut_off, out_i, out_j) - assert out_i[0] == init_i[0] + 1 - assert out_j[0] == cut_off[0] + 1 - -# def test_for_if(): - -# @triton.jit -# def kernel(bound, cutoff, M, N): -# m = 0 -# n = 0 -# for i in range(bound): -# if i > cutoff: -# m = m + 1 -# else: -# n = n + 1 -# tl.store(M, m) -# tl.store(N, n) - -# m = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') -# n = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') -# kernel[(1,)](10, 7, m, n) -# print(m[0]) -# print(n[0]) - -# ----------------------- -# test extra -# ----------------------- - - -def test_globaltimer(): - - @triton.jit - def kernel(Out1, Out2): - start = tl.extra.cuda.globaltimer() - off = tl.arange(0, 128) - for i in range(100): - tl.store(Out1 + off, tl.load(Out1 + off) + 1) - end = tl.extra.cuda.globaltimer() - tl.store(Out2, end - start) - - out1 = to_triton(np.zeros((128,), dtype=np.int64), device='cuda') - out2 = to_triton(np.zeros((1,), dtype=np.int64), device='cuda') - h = kernel[(1,)](out1, out2) - assert out2[0] > 0 - # 2 inlined globaltimers + one extra in the wrapper extern function - assert h.asm["ptx"].count("%globaltimer") == 3 - - -def test_smid(): - - @triton.jit - def kernel(Out): - tl.store(Out + tl.program_id(0), tl.extra.cuda.smid()) - - out = to_triton(np.zeros((1024,), dtype=np.int32), device='cuda') - h = kernel[(out.shape[0],)](out) - assert out.sort()[0].unique().shape[0] > 0 - assert h.asm["ptx"].count("%smid") == 2 - -# ----------------------- -# test layout conversions -# ----------------------- -# TODO: backend should be tested separately - - -layouts = [ - # MmaLayout(version=1, warps_per_cta=[1, 4]), - MmaLayout(version=(2, 0), warps_per_cta=[1, 4]), - # MmaLayout(version=1, warps_per_cta=[4, 1]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]), - BlockedLayout([1, 8], [2, 16], [4, 1], [1, 0]), - BlockedLayout([1, 4], [4, 8], [2, 2], [1, 0]), - BlockedLayout([1, 1], [1, 32], [2, 2], [1, 0]), - BlockedLayout([8, 1], [16, 2], [1, 4], [0, 1]), - BlockedLayout([4, 1], [8, 4], [2, 2], [0, 1]), - BlockedLayout([1, 1], [32, 1], [2, 2], [0, 1]), - BlockedLayout([4, 4], [1, 32], [4, 1], [1, 0]) -] - -intermediate_layouts = [ - None, - SharedLayout(1, 1, 1, [1, 0]), - SharedLayout(4, 2, 4, [1, 0]), - SharedLayout(2, 2, 4, [1, 0]), -] - - -@pytest.mark.parametrize("shape", [(128, 128)]) -@pytest.mark.parametrize("dtype", ['float16']) -@pytest.mark.parametrize("src_layout", layouts) 
-@pytest.mark.parametrize("interm_layout", intermediate_layouts) -@pytest.mark.parametrize("dst_layout", layouts) -def test_convert2d(dtype, shape, src_layout, interm_layout, dst_layout, device='cuda'): - if str(src_layout) == str(dst_layout): - pytest.skip() - if 'mma' in str(src_layout) and 'mma' in str(dst_layout): - pytest.skip() - - layouts = f""" - #src = {src_layout} - #dst = {dst_layout} - """ if interm_layout is None else f""" - #src = {src_layout} - #interm = {interm_layout} - #dst = {dst_layout} - """ - - conversion = f""" - %12 = triton_gpu.convert_layout %9 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #dst> - %13 = triton_gpu.convert_layout %11 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #dst> - """ if interm_layout is None else f""" - %15 = triton_gpu.convert_layout %9 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #interm> - %16 = triton_gpu.convert_layout %15 : (tensor<128x128xi32, #interm>) -> tensor<128x128xi32, #src> - %17 = triton_gpu.convert_layout %11 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #interm> - %18 = triton_gpu.convert_layout %17 : (tensor<128x128xf16, #interm>) -> tensor<128x128xf16, #src> - - %12 = triton_gpu.convert_layout %16 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #dst> - %13 = triton_gpu.convert_layout %18 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #dst> - """ - - ir = layouts + """ - module attributes {"triton_gpu.num-warps" = 4 : i32} { - tt.func public @kernel_0d1d(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { - %cst = arith.constant dense<128> : tensor<128x1xi32, #src> - %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>> - %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>> - %2 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #src> - %4 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>>) -> tensor<128x1xi32, #src> - %5 = arith.muli %4, %cst : tensor<128x1xi32, #src> - %6 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>>) -> tensor<1x128xi32, #src> - %7 = tt.broadcast %6 : (tensor<1x128xi32, #src>) -> tensor<128x128xi32, #src> - %8 = tt.broadcast %5 : (tensor<128x1xi32, #src>) -> tensor<128x128xi32, #src> - %9 = arith.addi %8, %7 : tensor<128x128xi32, #src> - %10 = tt.addptr %2, %9 : tensor<128x128x!tt.ptr, #src>, tensor<128x128xi32, #src> - %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf16, #src> - %3 = tt.splat %arg1 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #dst> - """ + conversion + """ - %14 = tt.addptr %3, %12 : tensor<128x128x!tt.ptr, #dst>, tensor<128x128xi32, #dst> - tt.store %14, %13 : tensor<128x128xf16, #dst> - tt.return - } -} -""" - - x = to_triton(numpy_random(shape, dtype_str=dtype)) - z = torch.empty_like(x) - - # write the IR to a temporary file using mkstemp - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - kernel[(1, 1, 1)](x.data_ptr(), z.data_ptr()) - - assert torch.equal(z, x) - - -def test_load_scalar_with_mask(): - @triton.jit - def kernel(Input, Index, Out, N: int): - index = tl.load(Index) - scalar = tl.load(Input + index, mask=index < N, other=0) - tl.store(Out, scalar, mask=index < N) - Index = torch.tensor([0], dtype=torch.int32, device='cuda') - 
Input = torch.tensor([0], dtype=torch.int32, device='cuda') - Out = torch.empty_like(Index, device='cuda') - kernel[(1,)](Input, Index, Out, Index.numel()) - assert Out.data[0] == 0 - - -# This test is used to test our own PTX codegen for float16 and int16 conversions -# maybe delete it later after ptxas has been fixed -@pytest.mark.parametrize("dtype_str", ['float16', 'int16']) -def test_ptx_cast(dtype_str): - @triton.jit - def kernel(in_ptr0, out_ptr2, xnumel, rnumel, dtype: tl.constexpr, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr): - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None] - xmask = xindex < xnumel - rbase = tl.arange(0, RBLOCK)[None, :] - x0 = xindex - _tmp4 = (tl.zeros([XBLOCK, RBLOCK], dtype) - 10000).to(dtype) - for roffset in range(0, rnumel, RBLOCK): - rindex = roffset + rbase - rmask = rindex < rnumel - r1 = rindex - tmp0 = tl.load(in_ptr0 + (r1 + (197 * x0)), rmask & xmask).to(dtype) - tmp1 = 2 - tmp2 = tmp0 * tmp1 - tmp3 = tmp2.to(dtype) - tmp5 = _tmp4 < tmp3 - _tmp4 = tl.where(rmask & xmask & tmp5, tmp3, _tmp4) - tl.store(out_ptr2 + (r1 + (197 * x0) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), _tmp4, rmask & xmask) - - torch.manual_seed(123) - if dtype_str == 'int16': - torch_dtype = torch.int16 - triton_dtype = tl.int32 - else: - torch_dtype = torch.float16 - triton_dtype = tl.float32 - - s0 = 4 - buf11 = -torch.ones((6 * s0, 197, 197), device='cuda', dtype=torch_dtype) - buf14 = -torch.ones((s0, 6, 197, 197), device='cuda', dtype=torch_dtype) - kernel[(4728,)](buf11, buf14, 1182 * s0, 197, triton_dtype, 1, 256, num_warps=2) - assert buf14.to(torch.float32).mean() == -2.0 diff --git a/python/test/unit/language/test_random.py b/python/test/unit/language/test_random.py deleted file mode 100644 index 39ae59e35a8d..000000000000 --- a/python/test/unit/language/test_random.py +++ /dev/null @@ -1,198 +0,0 @@ -import numpy as np -import pytest -import scipy.stats -import torch - -import triton -import triton.language as tl - -##################################### -# Reference Philox Implementation -##################################### - - -class PhiloxConfig: - def __init__(self, PHILOX_ROUND_A, PHILOX_ROUND_B, PHILOX_KEY_A, PHILOX_KEY_B, DTYPE): - self.PHILOX_ROUND_A = np.array(PHILOX_ROUND_A, dtype=DTYPE) - self.PHILOX_ROUND_B = np.array(PHILOX_ROUND_B, dtype=DTYPE) - self.PHILOX_KEY_A = np.array(PHILOX_KEY_A, dtype=DTYPE) - self.PHILOX_KEY_B = np.array(PHILOX_KEY_B, dtype=DTYPE) - self.DTYPE = DTYPE - - -# This is better for GPU -PHILOX_32 = PhiloxConfig( - PHILOX_KEY_A=0x9E3779B9, - PHILOX_KEY_B=0xBB67AE85, - PHILOX_ROUND_A=0xD2511F53, - PHILOX_ROUND_B=0xCD9E8D57, - DTYPE=np.uint32, -) - -# This is what numpy implements -PHILOX_64 = PhiloxConfig( - PHILOX_KEY_A=0x9E3779B97F4A7C15, - PHILOX_KEY_B=0xBB67AE8584CAA73B, - PHILOX_ROUND_A=0xD2E7470EE14C6C93, - PHILOX_ROUND_B=0xCA5A826395121157, - DTYPE=np.uint64, -) - - -class CustomPhilox4x: - def __init__(self, seed, config): - self._config = config - seed = self._into_pieces(seed) - self._key = np.array(seed[:2], dtype=self._dtype) - self._counter = np.array((0, 0) + seed[2:], dtype=self._dtype) - - @property - def _dtype(self): - return self._config.DTYPE - - def _into_pieces(self, n, pad=4): - res = [] - while len(res) < pad: - res.append(np.array(n, dtype=self._dtype)) - n >>= (np.dtype(self._dtype).itemsize * 8) - assert n == 0 - return tuple(res) - - def _multiply_low_high(self, a, b): - low = a * b - high = int(a) * int(b) - high = np.array(high >> (np.dtype(self._dtype).itemsize * 
8), dtype=self._dtype) - return low, high - - def _single_round(self, counter, key): - lo0, hi0 = self._multiply_low_high(self._config.PHILOX_ROUND_A, counter[0]) - lo1, hi1 = self._multiply_low_high(self._config.PHILOX_ROUND_B, counter[2]) - ret0 = hi1 ^ counter[1] ^ key[0] - ret1 = lo1 - ret2 = hi0 ^ counter[3] ^ key[1] - ret3 = lo0 - return np.array([ret0, ret1, ret2, ret3], dtype=self._dtype) - - def _raise_key(self, key): - pk = [self._config.PHILOX_KEY_A, self._config.PHILOX_KEY_B] - return key + np.array(pk, dtype=self._dtype) - - def random_raw(self): - counter = self._counter - key = self._key - for _ in range(10): - counter = self._single_round(counter, key) - key = self._raise_key(key) - self.advance(1) - return counter - - def advance(self, n_steps): - self._counter[0] += n_steps - assert self._counter[0] < 2**32, "FIXME: doesn't work for large offsets" - - -class CustomPhilox(CustomPhilox4x): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.buffer = [] - - def random_raw(self): - if len(self.buffer) == 0: - self.buffer = list(super().random_raw())[::-1] - return int(self.buffer.pop()) - - -##################################### -# Unit Tests -##################################### - -BLOCK = 1024 - -# test generation of random uint32 - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in ['10', '4,53', '10000'] - for seed in [0, 42, 124, 54, 0xffffffff, 0xdeadbeefcafeb0ba]] - ) -def test_randint(size, seed, device='cuda'): - size = list(map(int, size.split(','))) - - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.randint(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.int32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - out_tri = x.cpu().numpy().astype(np.uint32).flatten().tolist() - # reference result - gen = CustomPhilox4x(seed, config=PHILOX_32) - out_ref = [gen.random_raw()[0] for _ in out_tri] - assert out_tri == out_ref - -# test uniform PRNG - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in [1000000] - for seed in [0, 42, 124, 54]] - ) -def test_rand(size, seed, device='cuda'): - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.rand(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.float32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - assert all((x >= 0) & (x <= 1)) - assert scipy.stats.kstest(x.tolist(), 'uniform', args=(0, 1)).statistic < 0.01 - -# test normal PRNG - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in [1000000] - for seed in [0, 42, 124, 54]] - ) -def test_randn(size, seed, device='cuda'): - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.randn(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.float32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - assert abs(x.mean()) < 1e-2 - assert abs(x.std() - 1) < 1e-2 - - -# tl.rand() should never produce >=1.0 - -def test_rand_limits(): - @triton.jit - def kernel(input, output, n: tl.constexpr): - idx = tl.arange(0, n) - x = tl.load(input + idx) - y = tl.random.uint32_to_uniform_float(x) - tl.store(output + idx, y) 
- - min_max_int32 = torch.tensor([ - torch.iinfo(torch.int32).min, - torch.iinfo(torch.int32).max, - ], dtype=torch.int32, device='cuda') - output = torch.empty(2, dtype=torch.float32, device='cuda') - kernel[(1,)](min_max_int32, output, 2) - - assert output[0] == output[1] - assert 1.0 - torch.finfo(torch.float32).eps <= output[0].item() < 1.0 diff --git a/python/test/unit/language/test_subprocess.py b/python/test/unit/language/test_subprocess.py deleted file mode 100644 index 3263166d8c28..000000000000 --- a/python/test/unit/language/test_subprocess.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import subprocess -import sys - -import pytest - -dir_path = os.path.dirname(os.path.realpath(__file__)) -print_path = os.path.join(dir_path, "print_helper.py") -assert_path = os.path.join(dir_path, "assert_helper.py") - -# TODO: bfloat16 after LLVM-15 -func_types = ["device_assert", "assert", "static_assert", "no_debug"] -nested_types = [(caller, callee) for caller in ["true", "false", "none"] for callee in ["true", "false", "none"]] -torch_types = ["int8", "uint8", "int16", "int32", "long", "float16", "float32", "float64"] - - -@pytest.mark.parametrize("func_type, data_type", - [("device_print", data_type) for data_type in torch_types] + [("print", "int32"), ("static_print", "int32")]) -def test_print(func_type: str, data_type: str): - proc = subprocess.Popen([sys.executable, print_path, func_type, data_type], stdout=subprocess.PIPE, shell=False) - outs, _ = proc.communicate() - outs = outs.split() - new_lines = set() - for line in outs: - try: - value = line - if func_type != "static_print": - value = int(float(line)) - new_lines.add(value) - except Exception as e: - print(e) - if func_type != "static_print": - for i in range(128): - assert i in new_lines - assert len(new_lines) == 128 - else: - assert len(new_lines) == 1 - - -@pytest.mark.parametrize("func_type", func_types) -def test_assert(func_type: str): - os.environ["TRITON_DEBUG"] = "1" - proc = subprocess.Popen([sys.executable, assert_path, func_type], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False) - _, errs = proc.communicate() - errs = errs.splitlines() - num_errs = 0 - for err in errs: - if "x != 0" in err.decode("utf-8"): - num_errs += 1 - os.environ["TRITON_DEBUG"] = "0" - if func_type != "static_assert": - assert num_errs == 127 - else: - assert num_errs == 0 - - -@pytest.mark.parametrize("caller_type, callee_type", nested_types) -def test_assert_nested(caller_type, callee_type): - proc = subprocess.Popen([sys.executable, assert_path, caller_type, callee_type], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False) - _, errs = proc.communicate() - errs = errs.splitlines() - num_errs = 0 - for err in errs: - if "x != 0" in err.decode("utf-8"): - num_errs += 1 - if caller_type == "none": - if callee_type == "true": - assert num_errs == 127 - else: - assert num_errs == 0 - elif caller_type == "true": - if callee_type == "false": - assert num_errs == 0 - else: - assert num_errs == 127 - elif caller_type == "false": - if callee_type == "true": - assert num_errs == 127 - else: - assert num_errs == 0 diff --git a/python/test/unit/operators/test_blocksparse.py b/python/test/unit/operators/test_blocksparse.py deleted file mode 100644 index 5f94cd8b31bf..000000000000 --- a/python/test/unit/operators/test_blocksparse.py +++ /dev/null @@ -1,219 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -def sparsify_tensor(x, mask, block): - ret = torch.empty((x.size(0), mask.sum(), block, block), 
dtype=x.dtype, device=x.device) - for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))): - ret[:, idx, :, :] = x[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] - return ret - - -def make_pair(shape, device="cuda", alpha=1e-2, beta=0., trans=False, data=None, dtype=torch.float32): - if data is None: - data = torch.randn(shape, dtype=torch.float32, requires_grad=True, device=device) - ref_ret = data - ref_ret = ref_ret * alpha + beta - ref_ret = ref_ret.half().to(dtype) - if trans: - ref_ret = ref_ret.t().requires_grad_() - ref_ret = ref_ret.detach().requires_grad_() - tri_ret = ref_ret.clone().detach().requires_grad_() - return ref_ret, tri_ret - - -def mask_tensor(x, mask, block, value=0): - ret = x.clone() - for h, i, j in zip(*(mask == 0).nonzero(as_tuple=True)): - ret[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] = value - return ret - - -@pytest.mark.parametrize("MODE", ["sdd", "dds", "dsd"]) -@pytest.mark.parametrize("TRANS_A", [False, True]) -@pytest.mark.parametrize("TRANS_B", [False, True]) -@pytest.mark.parametrize("BLOCK", [16, 32, 64]) -@pytest.mark.parametrize("DTYPE", [torch.float16]) -def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=3, H=2, M=512, N=384, K=256): - seed = 0 - torch.manual_seed(seed) - is_sdd = MODE == "sdd" - is_dsd = MODE == "dsd" - is_dds = MODE == "dds" - do_sparsify = lambda x: sparsify_tensor(x, layout, BLOCK) - do_mask = lambda x: mask_tensor(x, layout, BLOCK) - # create inputs - # create op - a_shape = (Z, H, K, M) if TRANS_A else (Z, H, M, K) - b_shape = (Z, H, N, K) if TRANS_B else (Z, H, K, N) - c_shape = (Z, H, M, N) - shape = { - "sdd": (M, N), - "dsd": (a_shape[2], a_shape[3]), - "dds": (b_shape[2], b_shape[3]), - }[MODE] - layout = torch.randint(2, (H, shape[0] // BLOCK, shape[1] // BLOCK)) - layout[1, 2, :] = 0 - layout[1, :, 1] = 0 - # create data - a_ref, a_tri = make_pair(a_shape, alpha=.1, dtype=DTYPE) - b_ref, b_tri = make_pair(b_shape, alpha=.1, dtype=DTYPE) - dc_ref, dc_tri = make_pair(c_shape, dtype=DTYPE) - # compute [torch] - dc_ref = do_mask(dc_ref) if is_sdd else dc_ref - a_ref = do_mask(a_ref) if is_dsd else a_ref - b_ref = do_mask(b_ref) if is_dds else b_ref - a_ref.retain_grad() - b_ref.retain_grad() - c_ref = torch.matmul(a_ref.transpose(2, 3) if TRANS_A else a_ref, - b_ref.transpose(2, 3) if TRANS_B else b_ref) - c_ref.backward(dc_ref) - c_ref = do_sparsify(c_ref) if is_sdd else c_ref - da_ref = do_sparsify(a_ref.grad) if is_dsd else a_ref.grad - db_ref = do_sparsify(b_ref.grad) if is_dds else b_ref.grad - # triton result - dc_tri = do_sparsify(dc_tri) if is_sdd else dc_tri - a_tri = do_sparsify(a_tri) if is_dsd else a_tri - b_tri = do_sparsify(b_tri) if is_dds else b_tri - a_tri.retain_grad() - b_tri.retain_grad() - op = triton.ops.blocksparse.matmul(layout, BLOCK, MODE, trans_a=TRANS_A, trans_b=TRANS_B, device="cuda") - c_tri = op(a_tri, b_tri) - c_tri.backward(dc_tri) - da_tri = a_tri.grad - db_tri = b_tri.grad - # compare - torch.testing.assert_allclose(c_ref, c_tri) - torch.testing.assert_allclose(da_ref, da_tri) - torch.testing.assert_allclose(db_ref, db_tri) - - -configs = [ - (16, 256), - (32, 576), - (64, 1871), - (128, 2511), -] - - -@pytest.mark.parametrize("is_dense", [False, True]) -@pytest.mark.parametrize("BLOCK, WIDTH", configs) -def test_softmax(BLOCK, WIDTH, is_dense, Z=2, H=2, is_causal=True, scale=0.4): - # set seed - torch.random.manual_seed(0) - Z, H, M, N = 2, 3, WIDTH, WIDTH - # initialize layout - # make sure each row has at least one non-zero element - 
layout = torch.randint(2, (H, M // BLOCK, N // BLOCK)) - if is_dense: - layout[:] = 1 - else: - layout[1, 2, :] = 0 - layout[1, :, 1] = 0 - # initialize data - a_shape = (Z, H, M, N) - a_ref, a_tri = make_pair(a_shape) - dout_ref, dout_tri = make_pair(a_shape) - # compute [torch] - a_ref = mask_tensor(a_ref, layout, BLOCK, value=float("-inf")) - a_ref.retain_grad() - at_mask = torch.ones((M, N), device="cuda") - if is_causal: - at_mask = torch.tril(at_mask) - M = at_mask[None, None, :, :] + torch.zeros_like(a_ref) - a_ref[M == 0] = float("-inf") - out_ref = torch.softmax(a_ref * scale, -1) - out_ref.backward(dout_ref) - out_ref = sparsify_tensor(out_ref, layout, BLOCK) - da_ref = sparsify_tensor(a_ref.grad, layout, BLOCK) - # compute [triton] - a_tri = sparsify_tensor(a_tri, layout, BLOCK) - a_tri.retain_grad() - dout_tri = sparsify_tensor(dout_tri, layout, BLOCK) - op = triton.ops.blocksparse.softmax(layout, BLOCK, device="cuda", is_dense=is_dense) - out_tri = op(a_tri, scale=scale, is_causal=is_causal) - out_tri.backward(dout_tri) - da_tri = a_tri.grad - # compare - torch.testing.assert_allclose(out_tri, out_ref) - torch.testing.assert_allclose(da_tri, da_ref) - - -@pytest.mark.parametrize("block", [16, 32, 64]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) -def test_attention_fwd_bwd( - block, - dtype, - input_scale=1.0, - scale=1 / 8.0, - n_ctx=256, - batch_size=2, - n_heads=2, -): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - - # inputs - qkv_shape = (batch_size, n_heads, n_ctx, 64) - qkvs = [ - torch.nn.Parameter(input_scale * torch.randn(qkv_shape), requires_grad=True).to(dtype).cuda() for _ in range(3) - ] - - # Triton: - n_blocks = n_ctx // block - layout = torch.tril(torch.ones([n_heads, n_blocks, n_blocks], dtype=torch.long)) - query, key, value = [x.clone() for x in qkvs] - query.retain_grad() - key.retain_grad() - value.retain_grad() - attn_out = triton_attention(layout, block, query=query, key=key, value=value, scale=scale) - # ad hoc loss - loss = (attn_out ** 2).mean() - loss.backward() - grads = [query.grad, key.grad, value.grad] - - # Torch version: - torch_q, torch_k, torch_v = [x.clone() for x in qkvs] - attn_mask = torch.ones([n_ctx, n_ctx], device="cuda", dtype=dtype) - attn_mask = torch.tril(attn_mask, diagonal=0) - attn_mask = 1e6 * (-1 + (attn_mask.reshape((1, 1, n_ctx, n_ctx)).cuda())) - torch_q.retain_grad() - torch_k.retain_grad() - torch_v.retain_grad() - scores = scale * torch.einsum("bhsd,bhtd->bhst", torch_q, torch_k) - scores = scores + attn_mask - probs = torch.softmax(scores, dim=-1) - torch_attn_out = torch.einsum("bhst,bhtd->bhsd", probs, torch_v) - # ad hoc loss - torch_loss = (torch_attn_out ** 2).mean() - torch_loss.backward() - torch_grads = [torch_q.grad, torch_k.grad, torch_v.grad] - - # comparison - # print(f"Triton loss {loss} and torch loss {torch_loss}. 
Also checking grads...") - torch.testing.assert_allclose(loss, torch_loss, atol=1e-3, rtol=0) - for g1, g2 in zip(grads, torch_grads): - torch.testing.assert_allclose(g1, g2) - - -@pytest.mark.parametrize("block", [16, 32, 64]) -def triton_attention( - layout, - block: int, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, -): - sparse_dot_sdd_nt = triton.ops.blocksparse.matmul(layout, block, "sdd", trans_a=False, trans_b=True, device=value.device) - sparse_dot_dsd_nn = triton.ops.blocksparse.matmul(layout, block, "dsd", trans_a=False, trans_b=False, device=value.device) - sparse_softmax = triton.ops.blocksparse.softmax(layout, block, device=value.device) - - w = sparse_dot_sdd_nt(query, key) - w = sparse_softmax(w, scale=scale, is_causal=True) - a = sparse_dot_dsd_nn(w, value) - return a diff --git a/python/test/unit/operators/test_cross_entropy.py b/python/test/unit/operators/test_cross_entropy.py deleted file mode 100644 index 20d57ece57a9..000000000000 --- a/python/test/unit/operators/test_cross_entropy.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize("M, N, dtype, mode", - [ - (M, N, dtype, mode) for M in [1024, 821] - for N in [512, 857, 1871, 2089, 8573, 31000] - for dtype in ['float16', 'float32'] - for mode in ['forward', 'backward'] - ] - ) -def test_op(M, N, dtype, mode): - capability = torch.cuda.get_device_capability() - if capability[0] < 8 and dtype == "bfloat16": - pytest.skip("Only test bfloat16 on devices with sm >= 80") - dtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[dtype] - # create inputs - x = torch.randn(M, N, dtype=dtype, device='cuda', requires_grad=True) - idx = 4 + torch.ones(M, dtype=torch.int64, device='cuda') - # forward pass - tt_y = triton.ops.cross_entropy(x, idx) - th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx) - if mode == 'forward': - torch.testing.assert_allclose(th_y, tt_y) - # backward pass - elif mode == 'backward': - dy = torch.randn_like(tt_y) - # triton backward - tt_y.backward(dy) - tt_dx = x.grad.clone() - # torch backward - x.grad.zero_() - th_y.backward(dy) - th_dx = x.grad.clone() - torch.testing.assert_allclose(th_dx, tt_dx) diff --git a/python/test/unit/operators/test_flash_attention.py b/python/test/unit/operators/test_flash_attention.py deleted file mode 100644 index c9d8babe342f..000000000000 --- a/python/test/unit/operators/test_flash_attention.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) -@pytest.mark.parametrize('dtype', [torch.float16, torch.bfloat16]) -def test_op(Z, H, N_CTX, D_HEAD, dtype): - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - pytest.skip("Flash attention only supported for compute capability < 80") - torch.manual_seed(20) - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - dout = torch.randn_like(q) - # reference implementation - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) - p = torch.matmul(q, k.transpose(2, 3)) * sm_scale - for z in range(Z): - for h in range(H): - p[:, :, M == 0] = 
float("-inf") - p = torch.softmax(p.float(), dim=-1).to(dtype) - # p = torch.exp(p) - ref_out = torch.matmul(p, v) - ref_out.backward(dout) - ref_dv, v.grad = v.grad.clone(), None - ref_dk, k.grad = k.grad.clone(), None - ref_dq, q.grad = q.grad.clone(), None - # # triton implementation - tri_out = triton.ops.attention(q, k, v, sm_scale) - # print(ref_out) - # print(tri_out) - tri_out.backward(dout) - tri_dv, v.grad = v.grad.clone(), None - tri_dk, k.grad = k.grad.clone(), None - tri_dq, q.grad = q.grad.clone(), None - # compare - atol = 1e-1 if dtype == torch.bfloat16 else 1e-2 - torch.testing.assert_allclose(ref_out, tri_out, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dv, tri_dv, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dk, tri_dk, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dq, tri_dq, atol=atol, rtol=0) diff --git a/python/test/unit/operators/test_inductor.py b/python/test/unit/operators/test_inductor.py deleted file mode 100644 index f7e2ce2aa7e0..000000000000 --- a/python/test/unit/operators/test_inductor.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def test_normalization_with_remat(): - - @triton.jit - def triton_(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, rnumel, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr): - xnumel = 512 - rnumel = 4096 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None] - xmask = xindex < xnumel - rbase = tl.arange(0, RBLOCK)[None, :] - x3 = xindex - x0 = xindex % 64 - tmp1 = tl.load(in_ptr0 + (x0), xmask) - tmp3 = tl.load(in_ptr1 + (x0), xmask) - tmp11 = tl.load(in_ptr2 + (x0), xmask) - tmp13 = tl.load(in_ptr3 + (x0), xmask) - _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 - for roffset in range(0, rnumel, RBLOCK): - rindex = roffset + rbase - rmask = rindex < rnumel - r2 = rindex - tmp0 = tl.load(in_out_ptr0 + (r2 + (4096 * x3)), rmask & xmask, eviction_policy='evict_last', other=0) - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = tl.sqrt(tmp5) - tmp7 = 1 / tmp6 - tmp8 = 1.0 - tmp9 = tmp7 * tmp8 - tmp10 = tmp2 * tmp9 - tmp12 = tmp10 * tmp11 - tmp14 = tmp12 + tmp13 - _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp14, _tmp17) - tl.store(in_out_ptr0 + (r2 + (4096 * x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp14, rmask & xmask) - tmp17 = tl.sum(_tmp17, 1)[:, None] - tmp18 = 4096.0 - tmp19 = tmp17 / tmp18 - tl.store(in_out_ptr1 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp19, xmask) - - torch.manual_seed(123) - - buf14 = torch.rand(8, 64, 64, 64, device="cuda") - buf16 = torch.rand(8, 1, 64, device="cuda") - arg114_1 = torch.rand(64, device="cuda") - arg115_1 = torch.rand(64, device="cuda") - arg8_1 = torch.rand(64, device="cuda") - arg9_1 = torch.rand(64, device="cuda") - triton_[(512,)](buf14, buf16, arg114_1, arg115_1, arg8_1, arg9_1, 512, 4096, 1, 2048) - torch.testing.assert_allclose(buf16.mean().item(), buf14.mean().item(), atol=1e-7, rtol=0) - - -def test_avg_pool_bw(): - - @triton.jit - def triton_(in_ptr0, out_ptr0, XBLOCK: tl.constexpr): - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - x1 = (xindex // 8) % 8 - x0 = xindex % 8 - x2 = (xindex // 64) - x5 = xindex - tmp0 = (-1) + x1 - tmp1 = (-1) + x0 - tmp2 = 2 + x1 - tmp3 = 2 + x0 - tmp4 = 0 - tmp5 = tl.where(tmp0 != tmp0, tmp0, tl.where(tmp0 > tmp4, tmp0, tmp4)) - tmp6 = tl.where(tmp1 != tmp1, tmp1, tl.where(tmp1 > tmp4, tmp1, tmp4)) - tmp7 = 8 - tmp8 = tl.where(tmp2 != tmp2, tmp2, 
tl.where(tmp2 < tmp7, tmp2, tmp7)) - tmp9 = tl.where(tmp3 != tmp3, tmp3, tl.where(tmp3 < tmp7, tmp3, tmp7)) - tmp10 = tmp5 + tmp4 - tmp11 = tmp6 + tmp4 - tmp12 = 1 - tmp13 = tmp8 - tmp12 - tmp14 = tl.where(tmp10 != tmp10, tmp10, tl.where(tmp10 < tmp13, tmp10, tmp13)) - tmp15 = tmp9 - tmp12 - tmp16 = tl.where(tmp11 != tmp11, tmp11, tl.where(tmp11 < tmp15, tmp11, tmp15)) - tmp17 = tl.load(in_ptr0 + (tmp16 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp18 = tmp17 / 9 - tmp19 = tmp10 < tmp8 - tmp20 = tmp11 < tmp9 - tmp21 = tmp19 & tmp20 - tmp22 = 0.0 - tmp23 = tl.where(tmp21, tmp18, tmp22) - tmp24 = tmp6 + tmp12 - tmp25 = tl.where(tmp24 != tmp24, tmp24, tl.where(tmp24 < tmp15, tmp24, tmp15)) - tmp26 = tl.load(in_ptr0 + (tmp25 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp27 = tmp26 / 9 - tmp28 = tmp24 < tmp9 - tmp29 = tmp19 & tmp28 - tmp30 = tmp23 + tmp27 - tmp31 = tl.where(tmp29, tmp30, tmp23) - tmp32 = 2 - tmp33 = tmp6 + tmp32 - tmp34 = tl.where(tmp33 != tmp33, tmp33, tl.where(tmp33 < tmp15, tmp33, tmp15)) - tmp35 = tl.load(in_ptr0 + (tmp34 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp36 = tmp35 / 9 - tmp37 = tmp33 < tmp9 - tmp38 = tmp19 & tmp37 - tmp39 = tmp31 + tmp36 - tmp40 = tl.where(tmp38, tmp39, tmp31) - tmp41 = tmp5 + tmp12 - tmp42 = tl.where(tmp41 != tmp41, tmp41, tl.where(tmp41 < tmp13, tmp41, tmp13)) - tmp43 = tl.load(in_ptr0 + (tmp16 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp44 = tmp43 / 9 - tmp45 = tmp41 < tmp8 - tmp46 = tmp45 & tmp20 - tmp47 = tmp40 + tmp44 - tmp48 = tl.where(tmp46, tmp47, tmp40) - tmp49 = tl.load(in_ptr0 + (tmp25 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp50 = tmp49 / 9 - tmp51 = tmp45 & tmp28 - tmp52 = tmp48 + tmp50 - tmp53 = tl.where(tmp51, tmp52, tmp48) - tmp54 = tl.load(in_ptr0 + (tmp34 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp55 = tmp54 / 9 - tmp56 = tmp45 & tmp37 - tmp57 = tmp53 + tmp55 - tmp58 = tl.where(tmp56, tmp57, tmp53) - tmp59 = tmp5 + tmp32 - tmp60 = tl.where(tmp59 != tmp59, tmp59, tl.where(tmp59 < tmp13, tmp59, tmp13)) - tmp61 = tl.load(in_ptr0 + (tmp16 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp62 = tmp61 / 9 - tmp63 = tmp59 < tmp8 - tmp64 = tmp63 & tmp20 - tmp65 = tmp58 + tmp62 - tmp66 = tl.where(tmp64, tmp65, tmp58) - tmp67 = tl.load(in_ptr0 + (tmp25 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp68 = tmp67 / 9 - tmp69 = tmp63 & tmp28 - tmp70 = tmp66 + tmp68 - tmp71 = tl.where(tmp69, tmp70, tmp66) - tmp72 = tl.load(in_ptr0 + (tmp34 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp73 = tmp72 / 9 - tmp74 = tmp63 & tmp37 - tmp75 = tmp71 + tmp73 - tmp76 = tl.where(tmp74, tmp75, tmp71) - tl.store(out_ptr0 + (x5 + tl.zeros([XBLOCK], tl.int32)), tmp76, None) - - inp = torch.ones(8, 2048, 8, 8, device="cuda", dtype=torch.half) - out = torch.ones_like(inp) * 3 - numel = inp.numel() - triton_[(numel // 1024,)](inp, out, 1024) - out_ref = torch.ones_like(inp) - out_ref[:, :, 1:7, 0::7] = 2 / 3 - out_ref[:, :, 0::7, 1:7] = 2 / 3 - out_ref[:, :, 0::7, 0::7] = 4 / 9 - torch.testing.assert_allclose(out, out_ref) diff --git a/python/test/unit/operators/test_matmul.py b/python/test/unit/operators/test_matmul.py deleted file mode 100644 index ec46445ae835..000000000000 --- a/python/test/unit/operators/test_matmul.py +++ /dev/null @@ -1,102 +0,0 @@ -import itertools - -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize( - "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE", - itertools.chain( - *[ - [ - # 1 warp - 
(16, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (32, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 32, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (32, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 32, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (64, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 64, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - # 2 warp - (64, 32, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 64, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (64, 32, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 64, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), - # 4 warp - (128, 64, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 128, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), - # 8 warp - (128, 256, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), - (256, 128, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), - (256, 128, 32, 1, 8, 2, None, None, None, AT, BT, DTYPE), - # split-k - (64, 64, 16, 2, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 64, 16, 4, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 64, 16, 8, 4, 2, None, None, None, AT, BT, DTYPE), - # variable input - (128, 128, 32, 1, 4, 2, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 384, 128, 640, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 107, 233, 256, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 107, 233, 311, AT, BT, DTYPE), - ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] - ], - # n-stage - *[ - [ - (16, 16, 16, 1, 1, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (64, 32, 64, 1, 2, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 64, 16, 1, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (256, 128, 32, 1, 8, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 128, 32, 1, 4, STAGES, 384, 128, 640, AT, BT, DTYPE), - # split-k - (64, 64, 16, 8, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (64, 64, 16, 8, 4, STAGES, 1024, 1024, 32, AT, BT, DTYPE), - ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] for STAGES in [2, 3, 4] - ] - ), -) -def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - if capability[0] < 8 and DTYPE == "bfloat16": - pytest.skip("Only test bfloat16 on devices with sm >= 80") - if DTYPE == "bfloat16" and SPLIT_K != 1: - pytest.skip("bfloat16 matmuls don't allow split_k for now") - torch.manual_seed(0) - # nuke kernel decorators -- will set meta-parameters manually - kwargs = {'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K} - pre_hook = None if SPLIT_K == 1 else lambda nargs: nargs['C'].zero_() - configs = [triton.Config(kwargs=kwargs, num_warps=NWARP, num_stages=NSTAGE, pre_hook=pre_hook)] - kernel = triton.ops._matmul.kernel - kernel.configs = configs - # kernel.run = kernel.run.run.run - - # get matrix shape - M = BLOCK_M if M is None else M - N = BLOCK_N if N is 
None else N - K = BLOCK_K * SPLIT_K if K is None else K - # allocate/transpose inputs - DTYPE = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[DTYPE] - a = .1 * torch.randn((K, M) if AT else (M, K), device="cuda", dtype=DTYPE) - b = .1 * torch.randn((N, K) if BT else (K, N), device="cuda", dtype=DTYPE) - a = a.t() if AT else a - b = b.t() if BT else b - # run test - th_c = torch.matmul(a, b) - try: - tt_c = triton.ops.matmul(a, b) - torch.testing.assert_allclose(th_c, tt_c, atol=1e-2, rtol=0) - except triton.OutOfResources as e: - pytest.skip(str(e)) diff --git a/python/test/unit/runtime/test_autotuner.py b/python/test/unit/runtime/test_autotuner.py deleted file mode 100644 index c425a36697f6..000000000000 --- a/python/test/unit/runtime/test_autotuner.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def test_kwargs(): - N = 1024 - src = torch.empty(N, device='cuda') - dst = torch.empty(N, device='cuda') - - configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})] - - @triton.autotune(configs=configs, key=['N']) - @triton.jit - def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']),) - _kernel[grid](dst, src, N) - _kernel[grid](dst=dst, src=src, N=N) diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py deleted file mode 100644 index e13921079992..000000000000 --- a/python/test/unit/runtime/test_cache.py +++ /dev/null @@ -1,208 +0,0 @@ -import os -import shutil - -import pytest -import torch - -import triton -import triton.language as tl -from triton.runtime.jit import JITFunction - -tmpdir = ".tmp" - - -@triton.jit -def function_1(i): - i = i + 1 - i = function_2(i) - return i - - -@triton.jit -def function_2(i): - i = i + 1 - return i - - -@triton.jit -def kernel(X, i, BLOCK: tl.constexpr): - i = i + 1 - i = function_1(i) - tl.store(X, i) - - -@triton.jit(do_not_specialize=["i"]) -def kernel_nospec(X, i, BLOCK: tl.constexpr): - i = i + 1 - i = function_1(i) - tl.store(X, i) - - -def apply_src_change(target, old, new): - kernel.hash = None - function_1.hash = None - function_2.hash = None - function_1.src = function_1.src.replace(old, new) - target.src = target.src.replace(old, new) - ret = target.cache_key - target.src = target.src.replace(new, old) - return ret - - -def test_nochange(): - baseline = kernel.cache_key - updated = apply_src_change(kernel, 'i + 1', 'i + 1') - assert baseline == updated - - -def test_toplevel_change(): - baseline = kernel.cache_key - updated = apply_src_change(kernel, 'i + 1', 'i + 2') - assert baseline != updated - - -def test_nested1_change(): - baseline = kernel.cache_key - updated = apply_src_change(function_1, 'i + 1', 'i + 2') - assert baseline != updated - - -def reset_tmp_dir(): - os.environ["TRITON_CACHE_DIR"] = tmpdir - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - - -def test_reuse(): - counter = 0 - - def inc_counter(*args, **kwargs): - nonlocal counter - counter += 1 - JITFunction.cache_hook = inc_counter - reset_tmp_dir() - x = torch.empty(1, dtype=torch.int32, device='cuda') - for i in range(10): - kernel[(1,)](x, 1, BLOCK=1024) - assert counter == 1 - - -@pytest.mark.parametrize('mode', ['enable', 'disable']) -def test_specialize(mode): - counter = 0 - - def 
inc_counter(*args, **kwargs): - nonlocal counter - counter += 1 - JITFunction.cache_hook = inc_counter - reset_tmp_dir() - x = torch.empty(1, dtype=torch.int32, device='cuda') - function = {'enable': kernel, 'disable': kernel_nospec}[mode] - target = {'enable': 3, 'disable': 1}[mode] - for i in [1, 2, 4, 8, 16, 32]: - function[(1,)](x, i, BLOCK=512) - assert counter == target - - -def test_constexpr_not_callable() -> None: - @triton.jit - def kernel(X, c: tl.constexpr): - tl.store(X, 2) - - x = torch.empty(1, dtype=torch.int32, device='cuda') - error = False - try: - kernel[(1, )](x, c="str") - except BaseException: - error = True - assert error is False - # try and catch - try: - kernel[(1, )](x, c=tl.abs) - except BaseException: - error = True - assert error is True - - -def test_jit_warmup_cache() -> None: - @triton.jit - def kernel_add(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, - tl.load(a + idx) + tl.load(b + idx)) - - args = [ - torch.randn(32, dtype=torch.float32, device="cuda"), - torch.randn(32, dtype=torch.float32, device="cuda"), - torch.randn(32, dtype=torch.float32, device="cuda"), - 32, - ] - assert len(kernel_add.cache) == 0 - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache) == 1 - kernel_add.warmup(*args, grid=(1,)) - assert len(kernel_add.cache) == 1 - kernel_add.warmup(*args, grid=(1,)) - assert len(kernel_add.cache) == 1 - - -def test_jit_debug() -> None: - @triton.jit - def kernel_add(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.device_assert(idx < 32, "idx < 32") - tl.store(o + idx, - tl.load(a + idx) + tl.load(b + idx)) - - device = torch.cuda.current_device() - assert len(kernel_add.cache[device]) == 0 - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 1 - kernel_add.debug = False - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 2 - kernel_add.debug = True - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 3 - bins = list(kernel_add.cache[device].values()) - assert bins[2].asm['ttir'] != bins[1].asm['ttir'] - - -@triton.jit -def add_fn(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx)) - - -def test_jit_noinline() -> None: - @triton.jit - def kernel_add_device(a, b, o, N: tl.constexpr): - add_fn(a, b, o, N) - - device = torch.cuda.current_device() - assert len(kernel_add_device.cache[device]) == 0 - kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add_device.cache[device]) == 1 - bins = list(kernel_add_device.cache[device].values()) - inline_ttir = bins[0].asm['ttir'] - add_fn.noinline = True - add_fn.hash = None - kernel_add_device.hash = None - kernel_add_device.cache[device].clear() - kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add_device.cache[device]) == 1 - bins = list(kernel_add_device.cache[device].values()) - noinline_ttir = bins[0].asm['ttir'] - assert inline_ttir != noinline_ttir - - -def test_memory_leak() -> None: - @triton.jit - def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr): - xnumel = 10 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (x0), xmask) - 
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask) diff --git a/python/test/unit/runtime/test_driver.py b/python/test/unit/runtime/test_driver.py deleted file mode 100644 index b63927d89bfa..000000000000 --- a/python/test/unit/runtime/test_driver.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys - -import triton - - -def test_is_lazy(): - from importlib import reload - reload(sys.modules["triton.runtime.driver"]) - reload(sys.modules["triton.runtime"]) - mod = sys.modules[triton.runtime.driver.__module__] - assert isinstance(triton.runtime.driver, getattr(mod, "LazyProxy")) - assert triton.runtime.driver._obj is None - utils = triton.runtime.driver.utils # noqa: F841 - assert issubclass(triton.runtime.driver._obj.__class__, getattr(mod, "DriverBase")) diff --git a/python/test/unit/runtime/test_launch.py b/python/test/unit/runtime/test_launch.py deleted file mode 100644 index 41c5431027cf..000000000000 --- a/python/test/unit/runtime/test_launch.py +++ /dev/null @@ -1,106 +0,0 @@ -import gc -# import importlib -# import os -# import sys -# import tempfile -# import textwrap -# import time -import tracemalloc - -import torch - -import triton -import triton.language as tl - -# from typing import Tuple - - -def test_memory_leak() -> None: - - @triton.jit - def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr): - xnumel = 10 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (x0), xmask) - tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask) - - tracemalloc.start() - try: - inp = torch.randn(10, device='cuda') - out = torch.randn(10, device='cuda') - kernel[(10,)](inp, out, 10, XBLOCK=16) - gc.collect() - begin, _ = tracemalloc.get_traced_memory() - for _ in range(100): - kernel[(10,)](inp, out, 10, XBLOCK=16) - gc.collect() - end, _ = tracemalloc.get_traced_memory() - assert end - begin < 1000 - finally: - tracemalloc.stop() - - -# LATENCY_THRESHOLD_US = 46 - -# def test_kernel_launch_latency() -> None: -# def define_kernel(kernel_name: str, num_tensor_args: int) -> str: -# arg_str = ",".join([f"arg{i}: torch.Tensor" for i in range(num_tensor_args)]) -# arg_str += ", n_elements: int, BLOCK_SIZE: tl.constexpr" -# func_str = f""" -# import torch - -# import triton -# import triton.language as tl - -# @triton.jit -# def {kernel_name}({arg_str}): -# pass -# """ -# with tempfile.NamedTemporaryFile(mode="w+t", suffix=".py", delete=False) as temp_file: -# temp_file.write(textwrap.dedent(func_str)) -# temp_file_path = temp_file.name - -# return temp_file_path - -# def import_kernel(file_path, kernel_name): -# directory, filename = os.path.split(file_path) -# module_name, _ = os.path.splitext(filename) -# sys.path.insert(0, directory) - -# module = importlib.import_module(module_name) -# kernel = getattr(module, kernel_name) -# return kernel - -# def empty(*kernel_args: Tuple[torch.Tensor]): -# first_arg = kernel_args[0] -# n_elements = first_arg.numel() -# grid = (triton.cdiv(n_elements, 1024),) -# device = torch.cuda.current_device() -# # Warmup -# empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device) -# torch.cuda.synchronize() -# # Measure launch overhead at steady state -# num_runs = 1000 -# start_time = time.time() -# for i in range(num_runs): -# empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device) -# end_time = time.time() -# latency_us = (end_time - start_time) / num_runs * 1e6 - -# assert latency_us < 
LATENCY_THRESHOLD_US, "Kernel launch time has increased!" - -# num_tensor_args = 40 -# kernel_name = 'empty_kernel' -# file_path = define_kernel(kernel_name, num_tensor_args) -# empty_kernel = import_kernel(file_path, kernel_name) - -# # Initialize random tensors for the empty_kernel -# torch.manual_seed(0) -# size = 1024 -# kernel_args = (torch.rand(size, device='cuda') for i in range(num_tensor_args)) - -# # Run empty, which would run empty_kernel internally -# empty(*kernel_args) diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py deleted file mode 100644 index 0e0d33c6fd21..000000000000 --- a/python/test/unit/runtime/test_subproc.py +++ /dev/null @@ -1,83 +0,0 @@ -import multiprocessing -import os -import shutil -from collections import namedtuple - -import torch - -import triton -import triton.language as tl - -tmpdir = ".tmp" - - -def reset_tmp_dir(): - os.environ["TRITON_CACHE_DIR"] = tmpdir - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - - -instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"]) - - -def compile_fn(config, cc): - @triton.jit - def kernel_sub(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, tl.load(a + idx) - tl.load(b + idx) * 777) - triton.compile( - fn=kernel_sub, - signature={0: "*fp32", 1: "*fp32", 2: "*fp32"}, - device=0, - constants={3: 32}, - configs=[config], - warm_cache_only=True, - cc=cc, - ) - - -def test_compile_in_subproc() -> None: - major, minor = torch.cuda.get_device_capability(0) - cc = major * 10 + minor - config = instance_descriptor(tuple(range(4)), ()) - - multiprocessing.set_start_method('fork') - proc = multiprocessing.Process( - target=compile_fn, - args=(config, cc)) - proc.start() - proc.join() - assert proc.exitcode == 0 - - -def compile_fn_dot(config, cc): - @triton.jit - def kernel_dot(Z): - offs = tl.arange(0, 16)[:, None] * 16 + tl.arange(0, 16)[None, :] - z = tl.load(Z + offs) - z = tl.dot(z, z) - tl.store(Z + offs, z) - - triton.compile( - fn=kernel_dot, - signature={0: "*fp32"}, - device=0, - configs=[config], - warm_cache_only=True, - cc=cc, - ) - - -def test_compile_in_forked_subproc() -> None: - reset_tmp_dir() - major, minor = torch.cuda.get_device_capability(0) - cc = major * 10 + minor - config = instance_descriptor(tuple(range(1)), ()) - - assert multiprocessing.get_start_method() == 'fork' - proc = multiprocessing.Process( - target=compile_fn_dot, - args=(config, cc)) - proc.start() - proc.join() - assert proc.exitcode == 0 diff --git a/python/triton/_C/include b/python/triton/_C/include deleted file mode 120000 index b85a409837d1..000000000000 --- a/python/triton/_C/include +++ /dev/null @@ -1 +0,0 @@ -../../../include/ \ No newline at end of file diff --git a/python/triton/__init__.py b/python/triton/__init__.py deleted file mode 100644 index 14c9d61bdcb7..000000000000 --- a/python/triton/__init__.py +++ /dev/null @@ -1,68 +0,0 @@ -"""isort:skip_file""" -__version__ = '2.1.0' - -# --------------------------------------- -# Note: import order is significant here. - -# submodules -from .runtime import ( - autotune, - Config, - heuristics, - JITFunction, - KernelInterface, - reinterpret, - TensorWrapper, - OutOfResources, - MockTensor, -) -from .runtime.jit import jit -from .compiler import compile, CompilationError -from .debugger.debugger import program_ids_from_grid - -from . import language -from . 
import testing - -__all__ = [ - "autotune", - "cdiv", - "CompilationError", - "compile", - "Config", - "heuristics", - "impl", - "jit", - "JITFunction", - "KernelInterface", - "language", - "MockTensor", - "next_power_of_2", - "ops", - "OutOfResources", - "reinterpret", - "runtime", - "TensorWrapper", - "testing", - "program_ids_from_grid", -] - - -# ------------------------------------- -# misc. utilities that don't fit well -# into any specific module -# ------------------------------------- - -def cdiv(x, y): - return (x + y - 1) // y - - -def next_power_of_2(n): - """Return the smallest power of 2 greater than or equal to n""" - n -= 1 - n |= n >> 1 - n |= n >> 2 - n |= n >> 4 - n |= n >> 8 - n |= n >> 16 - n += 1 - return n diff --git a/python/triton/common/__init__.py b/python/triton/common/__init__.py deleted file mode 100644 index cc4d1e10cb49..000000000000 --- a/python/triton/common/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .build import _build - -__all__ = ["_build"] diff --git a/python/triton/common/build.py b/python/triton/common/build.py deleted file mode 100644 index 1bf019ce6d53..000000000000 --- a/python/triton/common/build.py +++ /dev/null @@ -1,116 +0,0 @@ -import contextlib -import functools -import io -import os -import shutil -import subprocess -import sys -import sysconfig - -import setuptools - - -# TODO: is_hip shouldn't be here -def is_hip(): - import torch - return torch.version.hip is not None - - -@functools.lru_cache() -def libcuda_dirs(): - locs = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[1:] - return [os.path.dirname(loc) for loc in locs] - - -@functools.lru_cache() -def rocm_path_dir(): - return os.getenv("ROCM_PATH", default="/opt/rocm") - - -@contextlib.contextmanager -def quiet(): - old_stdout, old_stderr = sys.stdout, sys.stderr - sys.stdout, sys.stderr = io.StringIO(), io.StringIO() - try: - yield - finally: - sys.stdout, sys.stderr = old_stdout, old_stderr - - -def _build(name, src, srcdir): - if is_hip(): - hip_lib_dir = os.path.join(rocm_path_dir(), "lib") - hip_include_dir = os.path.join(rocm_path_dir(), "include") - else: - cuda_lib_dirs = libcuda_dirs() - base_dir = os.path.join(os.path.dirname(__file__), os.path.pardir) - cuda_path = os.path.join(base_dir, "third_party", "cuda") - - cu_include_dir = os.path.join(cuda_path, "include") - triton_include_dir = os.path.join(os.path.dirname(__file__), "include") - cuda_header = os.path.join(cu_include_dir, "cuda.h") - triton_cuda_header = os.path.join(triton_include_dir, "cuda.h") - if not os.path.exists(cuda_header) and os.path.exists(triton_cuda_header): - cu_include_dir = triton_include_dir - suffix = sysconfig.get_config_var('EXT_SUFFIX') - so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix)) - # try to avoid setuptools if possible - cc = os.environ.get("CC") - if cc is None: - # TODO: support more things here. - clang = shutil.which("clang") - gcc = shutil.which("gcc") - cc = gcc if gcc is not None else clang - if cc is None: - raise RuntimeError("Failed to find C compiler. Please specify via CC environment variable.") - # This function was renamed and made public in Python 3.10 - if hasattr(sysconfig, 'get_default_scheme'): - scheme = sysconfig.get_default_scheme() - else: - scheme = sysconfig._get_default_scheme() - # 'posix_local' is a custom scheme on Debian. However, starting Python 3.10, the default install - # path changes to include 'local'. This change is required to use triton with system-wide python. 
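As an aside, the Debian scheme workaround described in the comment above can be condensed into a standalone helper. The sketch below is illustrative only (the name `python_include_dir` is invented here and is not part of the patch); it uses the same public/private `sysconfig` calls that the surrounding `_build` helper relies on, so the include-path resolution can be checked outside of an extension build:

    import sysconfig

    def python_include_dir() -> str:
        # Illustrative helper, not from the deleted file.
        # get_default_scheme() is public only from Python 3.10; earlier
        # interpreters expose the private _get_default_scheme() instead.
        if hasattr(sysconfig, "get_default_scheme"):
            scheme = sysconfig.get_default_scheme()
        else:
            scheme = sysconfig._get_default_scheme()
        # Debian's patched Python defaults to 'posix_local'; fall back to the
        # stock 'posix_prefix' scheme so the CPython headers are found.
        if scheme == "posix_local":
            scheme = "posix_prefix"
        return sysconfig.get_paths(scheme=scheme)["include"]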
- if scheme == 'posix_local': - scheme = 'posix_prefix' - py_include_dir = sysconfig.get_paths(scheme=scheme)["include"] - - if is_hip(): - ret = subprocess.check_call([cc, src, f"-I{hip_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", f"-L{hip_lib_dir}", "-lamdhip64", "-o", so]) - else: - cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so] - cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs] - ret = subprocess.check_call(cc_cmd) - - if ret == 0: - return so - # fallback on setuptools - extra_compile_args = [] - library_dirs = cuda_lib_dirs - include_dirs = [srcdir, cu_include_dir] - libraries = ['cuda'] - # extra arguments - extra_link_args = [] - # create extension module - ext = setuptools.Extension( - name=name, - language='c', - sources=[src], - include_dirs=include_dirs, - extra_compile_args=extra_compile_args + ['-O3'], - extra_link_args=extra_link_args, - library_dirs=library_dirs, - libraries=libraries, - ) - # build extension module - args = ['build_ext'] - args.append('--build-temp=' + srcdir) - args.append('--build-lib=' + srcdir) - args.append('-q') - args = dict( - name=name, - ext_modules=[ext], - script_args=args, - ) - with quiet(): - setuptools.setup(**args) - return so diff --git a/python/triton/compiler/__init__.py b/python/triton/compiler/__init__.py deleted file mode 100644 index 4d62eeec4ab1..000000000000 --- a/python/triton/compiler/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .compiler import CompiledKernel, compile -from .errors import CompilationError - -__all__ = ["compile", "CompiledKernel", "CompilationError"] diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py deleted file mode 100644 index eb57de82ee96..000000000000 --- a/python/triton/compiler/code_generator.py +++ /dev/null @@ -1,1086 +0,0 @@ -import ast -import inspect -import re -import sys -import warnings -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union - -from .. 
import language -from ..language import constexpr, tensor -# ideally we wouldn't need any runtime component -from ..runtime import JITFunction -from .errors import (CompilationError, CompileTimeAssertionFailure, - UnsupportedLanguageConstruct) -from triton._C.libtriton.triton import ir - - -def mangle_ty(ty): - if ty.is_ptr(): - return 'P' + mangle_ty(ty.element_ty) - if ty.is_int(): - SIGNED = language.dtype.SIGNEDNESS.SIGNED - prefix = 'i' if ty.int_signedness == SIGNED else 'u' - return prefix + str(ty.int_bitwidth) - if ty.is_fp8(): - return 'fp8' - if ty.is_fp16(): - return 'fp16' - if ty.is_bf16(): - return 'bf16' - if ty.is_fp32(): - return 'fp32' - if ty.is_fp64(): - return 'fp64' - if ty.is_block(): - elt = mangle_ty(ty.scalar) - shape = '_'.join(map(str, ty.shape)) - return f'{elt}S{shape}S' - if ty.is_void(): - return 'V' - assert False, "Unsupported type" - - -def mangle_fn(name, arg_tys, constants): - # doesn't mangle ret type, which must be a function of arg tys - mangled_arg_names = '_'.join([mangle_ty(ty) for ty in arg_tys]) - mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)]) - mangled_constants = mangled_constants.replace('.', '_d_') - mangled_constants = mangled_constants.replace("'", '_sq_') - # [ and ] are not allowed in LLVM identifiers - mangled_constants = mangled_constants.replace('[', '_').replace(']', '_') - ret = f'{name}__{mangled_arg_names}__{mangled_constants}' - return ret - - -def _is_triton_tensor(o: Any) -> bool: - return isinstance(o, tensor) - - -def _is_constexpr(o: Any) -> bool: - return isinstance(o, constexpr) - - -def _is_triton_scalar(o: Any) -> bool: - return _is_triton_tensor(o) and (not o.type.is_block() or o.type.numel == 1) - - -def _unwrap_if_constexpr(o: Any): - return o.value if isinstance(o, constexpr) else o - - -def _check_fn_args(node, fn, args): - if fn.noinline: - for idx, arg in enumerate(args): - if not _is_constexpr(arg) and not _is_triton_scalar(arg): - raise UnsupportedLanguageConstruct(fn.src, node, f'Function {fn.__name__} is marked noinline, but was called with non-scalar argument {fn.arg_names[idx]}:{arg}') - - -_condition_types = {bool, int, type(None)} # Python types accepted for conditionals inside kernels - - -class enter_sub_region: - def __init__(self, generator): - self.generator = generator - - def __enter__(self): - # record lscope & local_defs in the parent scope - self.liveins = self.generator.lscope.copy() - self.prev_defs = self.generator.local_defs.copy() - self.generator.local_defs = {} - self.insert_block = self.generator.builder.get_insertion_block() - self.insert_point = self.generator.builder.get_insertion_point() - return self.liveins, self.insert_block - - def __exit__(self, *args, **kwargs): - self.generator.builder.restore_insertion_point(self.insert_point) - self.generator.lscope = self.liveins - self.generator.local_defs = self.prev_defs - - -# Check if the given syntax node has an "early" return -class ContainsReturnChecker(ast.NodeVisitor): - def __init__(self, gscope): - self.gscope = gscope - - def _visit_stmts(self, body) -> bool: - for s in body: - if self.visit(s): - return True - return False - - def _visit_function(self, fn) -> bool: - # Currently we only support JITFunctions defined in the global scope - if isinstance(fn, JITFunction) and not fn.noinline: - fn_node = fn.parse() - return ContainsReturnChecker(self.gscope).visit(fn_node) - return False - - def generic_visit(self, node) -> bool: - ret = False - for _, value in ast.iter_fields(node): - if 
isinstance(value, list): - for item in value: - if isinstance(item, ast.AST): - ret = ret or self.visit(item) - elif isinstance(value, ast.AST): - ret = ret or self.visit(value) - return ret - - def visit_Attribute(self, node: ast.Attribute) -> bool: - # If the left part is a name, it's possible that - # we call triton native function or a jit function from another module. - # If the left part is not a name, it must return a tensor or a constexpr - # whose methods do not contain return statements - # e.g., (tl.load(x)).to(y) - # So we only check if the expressions within value have return or not - if isinstance(node.value, ast.Name): - if node.value.id in self.gscope: - value = self.gscope[node.value.id] - fn = getattr(value, node.attr) - return self._visit_function(fn) - return False - return self.visit(node.value) - - def visit_Name(self, node: ast.Name) -> bool: - if type(node.ctx) == ast.Store: - return False - if node.id in self.gscope: - fn = self.gscope[node.id] - return self._visit_function(fn) - return False - - def visit_Return(self, node: ast.Return) -> bool: - return True - - def visit_Assign(self, node: ast.Assign) -> bool: - # There couldn't be an early return - # x = ... - return False - - def visit_AugAssign(self, node: ast.AugAssign) -> bool: - # There couldn't be an early return - # x += ... - return False - - def visit_Module(self, node: ast.Module) -> bool: - return self._visit_stmts(node.body) - - def visit_FunctionDef(self, node: ast.FunctionDef) -> bool: - return self._visit_stmts(node.body) - - def visit_If(self, node: ast.If) -> bool: - # TODO: optimize the following case in which we actually don't have - # a return when static_cond is false: - # if dynamic_cond - # if static_cond - # func_with_return - # else - # func_without_return - ret = self._visit_stmts(node.body) - if node.orelse: - ret = ret or self._visit_stmts(node.orelse) - return ret - - def visit_IfExp(self, node: ast.IfExp) -> bool: - return self.visit(node.body) or self.visit(node.orelse) - - def visit_Call(self, node: ast.Call) -> bool: - return self.visit(node.func) - - -class CodeGenerator(ast.NodeVisitor): - def __init__(self, context, prototype, gscope, attributes, constants, function_name, - module=None, is_kernel=False, function_types: Optional[Dict] = None, - debug=False, noinline=False): - self.builder = ir.builder(context) - self.module = self.builder.create_module() if module is None else module - self.function_ret_types = {} if function_types is None else function_types - self.prototype = prototype - self.gscope = gscope - self.lscope = dict() - self.attributes = attributes - self.constants = constants - self.function_name = function_name - self.is_kernel = is_kernel - self.last_node = None - self.debug = debug - self.noinline = noinline - self.scf_stack = [] - self.last_ret_type = None - # SSA-construction - # name => language.tensor - self.local_defs: Dict[str, tensor] = {} - self.global_uses: Dict[str, tensor] = {} - self.dereference_name: Callable[[str], Any] = self._define_name_lookup() - - builtin_namespace: Dict[str, Any] = {_.__name__: _ for _ in (range, float, int, isinstance, getattr)} - builtin_namespace.update(( - ('print', language.core.device_print), - ('min', language.minimum), - )) - - def _define_name_lookup(self): - def local_lookup(name: str, absent): - value = self.lscope.get(name, absent) # this needs to be re-fetched from `self` every time, because it gets switched occasionally - if value is not absent and name not in self.local_defs: - self.global_uses[name] = value 
- return value - - absent_marker = object() - - def name_lookup(name: str) -> Any: - absent = absent_marker - for lookup_function in local_lookup, self.gscope.get, self.builtin_namespace.get: - value = lookup_function(name, absent) - if value is not absent: - return value - raise NameError(f'{name} is not defined') - - return name_lookup - - def set_value(self, name: str, - value: Union[tensor, constexpr]) -> None: - ''' This function: - called by visit_Assign() & visit_FunctionDef() to store left value (lvalue) - 1. record local defined name (FIXME: should consider control flow) - 2. store tensor in self.lvalue - ''' - self.lscope[name] = value - self.local_defs[name] = value - - # - # AST visitor - # - def visit_compound_statement(self, stmts): - for stmt in stmts: - ret_type = self.visit(stmt) - if ret_type is not None and isinstance(stmt, ast.Return): - self.last_ret_type = ret_type - - def visit_Module(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_List(self, node): - ctx = self.visit(node.ctx) - assert ctx is None - elts = [self.visit(elt) for elt in node.elts] - return elts - - # By design, only non-kernel functions can return - def visit_Return(self, node): - ret_value = self.visit(node.value) - # ret_block = self.builder.create_block() - # post_ret_block = self.builder.create_block() - # self.builder.create_branch(ret_block) - # self.builder.set_insertion_point_to_end(ret_block) - if ret_value is None: - self.builder.ret([]) - ret_ty = None - elif isinstance(ret_value, tuple): - ret_values = [language.core._to_tensor(v, self.builder) for v in ret_value] - ret_types = [v.type for v in ret_values] - self.builder.ret([v.handle for v in ret_values]) - ret_ty = tuple(ret_types) - else: - ret = language.core._to_tensor(ret_value, self.builder) - self.builder.ret([ret.handle]) - ret_ty = ret.type - # self.builder.create_branch(post_ret_block) - # self.builder.set_insertion_point_to_end(post_ret_block) - return ret_ty - - def visit_FunctionDef(self, node): - arg_names, kwarg_names = self.visit(node.args) - # initialize defaults - for i, default_value in enumerate(node.args.defaults): - arg_node = node.args.args[-i - 1] - annotation = arg_node.annotation - name = arg_node.arg - st_target = ast.Name(id=name, ctx=ast.Store()) - if annotation is None: - init_node = ast.Assign(targets=[st_target], value=default_value) - else: - init_node = ast.AnnAssign(target=st_target, value=default_value, annotation=annotation) - self.visit(init_node) - # initialize function - visibility = "public" if self.is_kernel else "private" - fn = self.builder.get_or_insert_function(self.module, self.function_name, self.prototype.to_ir(self.builder), visibility, self.noinline) - self.module.push_back(fn) - entry = fn.add_entry_block() - arg_values = [] - idx = 0 - for i, arg_name in enumerate(arg_names): - if i in self.constants: - cst = self.constants[i] - if not _is_constexpr(cst): - cst = constexpr(self.constants[i]) - arg_values.append(cst) - continue - else: - if i in self.attributes: - fn.set_arg_attr(idx, "tt.divisibility", self.attributes[i][1]) - arg_values.append(tensor(fn.args(idx), self.prototype.param_types[idx])) - idx += 1 - - insert_pt = self.builder.get_insertion_block() - for arg_name, arg_value in zip(arg_names, arg_values): - self.set_value(arg_name, arg_value) - self.builder.set_insertion_point_to_start(entry) - # visit function body - self.visit_compound_statement(node.body) - # finalize function - if self.last_ret_type is None: - self.builder.ret([]) - else: - # update 
return type - if isinstance(self.last_ret_type, tuple): - self.prototype.ret_types = list(self.last_ret_type) - fn.reset_type(self.prototype.to_ir(self.builder)) - else: - self.prototype.ret_types = [self.last_ret_type] - fn.reset_type(self.prototype.to_ir(self.builder)) - if insert_pt: - self.builder.set_insertion_point_to_end(insert_pt) - # Remove dead code - fn.finalize() - - def visit_arguments(self, node): - arg_names = [] - for arg in node.args: - arg_names += [self.visit(arg)] - kwarg_names = self.visit(node.kwarg) - return arg_names, kwarg_names - - def visit_arg(self, node): - ast.NodeVisitor.generic_visit(self, node) - return node.arg - - def visit_AnnAssign(self, node): - # extract attributes - annotation = self.visit(node.annotation) - target = self.visit(node.target) - value = self.visit(node.value) - # constexpr - if annotation == constexpr: - if target in self.lscope: - raise ValueError(f'{target} is already defined.' - f' constexpr cannot be reassigned.') - if not _is_constexpr(value): - value = constexpr(value) - self.lscope[target] = value - return self.lscope[target] - # default: call visit_Assign - return self.visit_Assign(node) - - def visit_Assign(self, node): - _names = [] - for target in node.targets: - _names += [self.visit(target)] - if len(_names) > 1: - raise UnsupportedLanguageConstruct(None, node, "simultaneous multiple assignment is not supported.") - names = _names[0] - values = self.visit(node.value) - if not isinstance(names, tuple): - names = [names] - if not isinstance(values, tuple): - values = [values] - native_nontensor_types = (language.dtype, ) - for name, value in zip(names, values): - # by default, constexpr are assigned into python variable - value = _unwrap_if_constexpr(value) - if not _is_triton_tensor(value) and \ - not isinstance(value, native_nontensor_types): - value = language.core._to_tensor(value, self.builder) - self.set_value(name, value) - - def visit_AugAssign(self, node): - name = node.target.id - lhs = ast.Name(id=name, ctx=ast.Load()) - rhs = ast.BinOp(lhs, node.op, node.value) - assign = ast.Assign(targets=[node.target], value=rhs) - self.visit(assign) - return self.dereference_name(name) - - def visit_Name(self, node): - if type(node.ctx) == ast.Store: - return node.id - return self.dereference_name(node.id) - - def visit_Store(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_Load(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_Tuple(self, node): - args = [self.visit(x) for x in node.elts] - return tuple(args) - - def _apply_binary_method(self, method_name, lhs, rhs): - # TODO: raise something meaningful if getattr fails below, esp for reverse method - if _is_triton_tensor(lhs): - return getattr(lhs, method_name)(rhs, _builder=self.builder) - if _is_triton_tensor(rhs): - reverse_method_name = re.sub(r"__(.*)__", r"__r\1__", method_name) - return getattr(rhs, reverse_method_name)(lhs, _builder=self.builder) - return getattr(lhs, method_name)(rhs) - - def visit_BinOp(self, node): - lhs = self.visit(node.left) - rhs = self.visit(node.right) - method_name = self._method_name_for_bin_op.get(type(node.op)) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST binary operator '{}' is not (currently) implemented.".format(node.op.__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_bin_op: Dict[Type[ast.operator], str] = { - ast.Add: '__add__', ast.Sub: '__sub__', ast.Mult: '__mul__', ast.Div: '__truediv__', - ast.FloorDiv: 
'__floordiv__', ast.Mod: '__mod__', ast.Pow: '__pow__', - ast.LShift: '__lshift__', ast.RShift: '__rshift__', ast.BitAnd: '__and__', ast.BitOr: '__or__', ast.BitXor: '__xor__', - } - - def visit_then_else_blocks(self, node, liveins, then_block, else_block): - # then block - self.builder.set_insertion_point_to_start(then_block) - self.visit_compound_statement(node.body) - then_block = self.builder.get_insertion_block() - then_defs = self.local_defs.copy() - # else block - else_defs = {} - if node.orelse: - self.builder.set_insertion_point_to_start(else_block) - self.lscope = liveins.copy() - self.local_defs = {} - self.visit_compound_statement(node.orelse) - else_defs = self.local_defs.copy() - else_block = self.builder.get_insertion_block() - - # update block arguments - names = [] - ret_types = [] - ir_ret_types = [] - # variables in livein whose value is updated in `if` - for name in liveins: - # check type - for defs, block_name in [(then_defs, 'then'), (else_defs, 'else')]: - if name in defs: - assert defs[name].type == liveins[name].type,\ - f'initial value for `{name}` is of type {liveins[name].type}, '\ - f'but the {block_name} block redefines it as {defs[name].type}' - if name in then_defs or name in else_defs: - names.append(name) - ret_types.append(then_defs[name].type if name in then_defs else else_defs[name].type) - ir_ret_types.append(then_defs[name].handle.get_type() if name in then_defs else else_defs[name].handle.get_type()) - # variable defined in then but not in else - if name in then_defs and name not in else_defs: - else_defs[name] = liveins[name] - # variable defined in else but not in then - if name in else_defs and name not in then_defs: - then_defs[name] = liveins[name] - # variables that are both in then and else but not in liveins - # TODO: could probably be cleaned up - for name in then_defs.keys() & else_defs.keys(): - if name in names: - continue - then_ty = then_defs[name].type - else_ty = else_defs[name].type - assert then_ty == else_ty,\ - f'mismatched type for {name} between then block ({then_ty}) '\ - f'and else block ({else_ty})' - names.append(name) - ret_types.append(then_ty) - ir_ret_types.append(then_defs[name].handle.get_type()) - - return then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types - - def visit_if_top_level(self, cond, node): - has_endif_block = True - with enter_sub_region(self) as sr: - liveins, ip_block = sr - then_block = self.builder.create_block() - else_block = self.builder.create_block() - # create basic-block after conditional - endif_block = self.builder.create_block() - # create branch - self.builder.set_insertion_point_to_end(ip_block) - self.builder.create_cond_branch(cond.handle, then_block, else_block) - # visit then and else blocks - then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types = \ - self.visit_then_else_blocks(node, liveins, then_block, else_block) - # then terminator - self.builder.set_insertion_point_to_end(then_block) - if then_block.has_return() and else_block.has_return(): - has_endif_block = False - endif_block.erase() - if not then_block.has_terminator() and has_endif_block: - self.builder.create_branch(endif_block, [then_defs[n].handle for n in names]) - # else terminator - self.builder.set_insertion_point_to_end(else_block) - if not else_block.has_terminator() and has_endif_block: - self.builder.create_branch(endif_block, [else_defs[n].handle for n in names]) - if has_endif_block: - for ty in ir_ret_types: - endif_block.add_argument(ty) - if has_endif_block: 
- # change block - self.builder.set_insertion_point_to_start(endif_block) - # update value - for i, name in enumerate(names): - new_tensor = language.core.tensor(endif_block.arg(i), ret_types[i]) - self.set_value(name, new_tensor) - - # TODO: refactor - def visit_if_scf(self, cond, node): - with enter_sub_region(self) as sr: - liveins, _ = sr - ip = self.builder.get_insertion_point() - then_block = self.builder.create_block() - else_block = self.builder.create_block() if node.orelse else None - then_defs, else_defs, then_block, else_block, names, ret_types, _ = \ - self.visit_then_else_blocks(node, liveins, then_block, else_block) - # create if op - self.builder.restore_insertion_point(ip) - if_op = self.builder.create_if_op([ty.to_ir(self.builder) for ty in ret_types], cond.handle, True) - then_block.merge_block_before(if_op.get_then_block()) - self.builder.set_insertion_point_to_end(if_op.get_then_block()) - if len(names) > 0: - self.builder.create_yield_op([then_defs[n].handle for n in names]) - if not node.orelse: - else_block = if_op.get_else_block() - else: - else_block.merge_block_before(if_op.get_else_block()) - self.builder.set_insertion_point_to_end(if_op.get_else_block()) - if len(names) > 0: - self.builder.create_yield_op([else_defs[n].handle for n in names]) - # update values - for i, name in enumerate(names): - new_tensor = language.core.tensor(if_op.get_result(i), ret_types[i]) - self.set_value(name, new_tensor) - - def visit_If(self, node): - cond = self.visit(node.test) - if _is_triton_tensor(cond): - cond = cond.to(language.int1, _builder=self.builder) - contains_return = ContainsReturnChecker(self.gscope).visit(node) - if self.scf_stack and contains_return: - raise UnsupportedLanguageConstruct(None, node, - "Cannot have `return` statements inside `while` or `for` statements in triton") - elif self.scf_stack or not contains_return: - self.visit_if_scf(cond, node) - else: - self.visit_if_top_level(cond, node) - else: - cond = _unwrap_if_constexpr(cond) - if type(cond) not in _condition_types: # not isinstance - we insist the real thing, no subclasses and no ducks - raise UnsupportedLanguageConstruct( - None, node, "`if` conditionals can only accept values of type {{{}}}, not objects of type {}".format( - ', '.join(_.__name__ for _ in _condition_types), type(cond).__name__)) - if cond: - self.visit_compound_statement(node.body) - else: - self.visit_compound_statement(node.orelse) - - def visit_IfExp(self, node): - cond = self.visit(node.test) - if _is_triton_tensor(cond): - cond = cond.to(language.int1, _builder=self.builder) - if _unwrap_if_constexpr(cond): - return self.visit(node.body) - else: - return self.visit(node.orelse) - - def visit_Pass(self, node): - pass - - def visit_Compare(self, node): - if not (len(node.comparators) == 1 and len(node.ops) == 1): - raise UnsupportedLanguageConstruct(None, node, "simultaneous multiple comparison is not supported") - lhs = _unwrap_if_constexpr(self.visit(node.left)) - rhs = _unwrap_if_constexpr(self.visit(node.comparators[0])) - if type(node.ops[0]) == ast.Is: - return constexpr(lhs is rhs) - if type(node.ops[0]) == ast.IsNot: - return constexpr(lhs is not rhs) - method_name = self._method_name_for_comp_op.get(type(node.ops[0])) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST comparison operator '{}' is not (currently) implemented.".format(node.ops[0].__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_comp_op: Dict[Type[ast.cmpop], str] = { - ast.Eq: 
'__eq__', ast.NotEq: '__ne__', ast.Lt: '__lt__', ast.LtE: '__le__', ast.Gt: '__gt__', ast.GtE: '__ge__' - } - - def visit_UnaryOp(self, node): - op = self.visit(node.operand) - fn = self._method_name_for_unary_op.get(type(node.op)) - if fn is None: - raise UnsupportedLanguageConstruct(None, node, "AST unary operator '{}' is not (currently) implemented.".format(node.op.__name__)) - if _is_triton_tensor(op): - return getattr(op, fn)(_builder=self.builder) - return getattr(op, fn)() - _method_name_for_unary_op: Dict[Type[ast.unaryop], str] = {ast.USub: '__neg__', ast.UAdd: '__pos__', ast.Not: '__not__', ast.Invert: '__invert__'} - - def visit_While(self, node): - with enter_sub_region(self) as sr: - liveins, insert_block = sr - - # loop body (the after region) - # loop_block = self.builder.create_block() - dummy = self.builder.create_block() - self.builder.set_insertion_point_to_start(dummy) - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - loop_defs = self.local_defs - - # collect loop-carried values - names = [] - ret_types = [] - init_args = [] - for name in loop_defs: - if name in liveins: - # We should not def new constexpr - assert _is_triton_tensor(loop_defs[name]) - assert _is_triton_tensor(liveins[name]) - assert loop_defs[name].type == liveins[name].type - # these are loop-carried values - names.append(name) - ret_types.append(loop_defs[name].type) - init_args.append(liveins[name]) - - self.builder.set_insertion_point_to_end(insert_block) - while_op = self.builder.create_while_op([ty.to_ir(self.builder) for ty in ret_types], - [arg.handle for arg in init_args]) - # merge the condition region - before_block = self.builder.create_block_with_parent(while_op.get_before(), - [ty.to_ir(self.builder) for ty in ret_types]) - self.builder.set_insertion_point_to_start(before_block) - for i, name in enumerate(names): - self.lscope[name] = language.core.tensor(before_block.arg(i), ret_types[i]) - self.local_defs[name] = self.lscope[name] - cond = self.visit(node.test) - self.builder.set_insertion_point_to_end(before_block) - # create ConditionOp: e.g., scf.condition(%cond) %arg0, %arg1, ... 
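For orientation, the kind of kernel this `visit_While` path lowers looks like the sketch below: `acc` is live-in to the loop and re-assigned inside it, so it becomes a loop-carried block argument of the `scf.while` op being built here, and (as the asserts in this method require) its type must be identical before and after each iteration. This is an untested illustration written against the `triton.jit`/`tl` API visible elsewhere in this patch, not code taken from it:

    import triton
    import triton.language as tl

    @triton.jit
    def _count_up(out_ptr, limit):
        # `acc` starts as a Python int, is promoted to an i32 tensor on
        # assignment, and is carried through the while loop unchanged in type,
        # which is what lets it map onto a typed scf.while block argument.
        acc = 0
        while acc < limit:
            acc += 1
        tl.store(out_ptr, acc)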
- self.builder.create_condition_op(cond.handle, [before_block.arg(i) for i in range(len(init_args))]) - # merge the loop body - after_block = self.builder.create_block_with_parent(while_op.get_after(), - [ty.to_ir(self.builder) for ty in ret_types]) - - # generate loop body - self.builder.set_insertion_point_to_start(after_block) - for i, name in enumerate(names): - self.lscope[name] = language.core.tensor(after_block.arg(i), ret_types[i]) - self.local_defs[name] = self.lscope[name] - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - loop_defs = self.local_defs - yields = [] - for name in loop_defs: - if name in liveins: - yields.append(loop_defs[name]) - self.builder.create_yield_op([y.handle for y in yields]) - - # update global uses in while_op - for i, name in enumerate(names): - after_block.replace_use_in_block_with(init_args[i].handle, after_block.arg(i)) - - # WhileOp defines new values, update the symbol table (lscope, local_defs) - for i, name in enumerate(names): - new_def = language.core.tensor(while_op.get_result(i), ret_types[i]) - self.lscope[name] = new_def - self.local_defs[name] = new_def - - for stmt in node.orelse: - assert False, "Not implemented" - ast.NodeVisitor.generic_visit(self, stmt) - - def visit_Subscript(self, node): - assert node.ctx.__class__.__name__ == "Load" - lhs = self.visit(node.value) - slices = self.visit(node.slice) - if _is_triton_tensor(lhs): - return lhs.__getitem__(slices, _builder=self.builder) - return lhs[slices] - - def visit_ExtSlice(self, node): - return [self.visit(dim) for dim in node.dims] - - def visit_For(self, node): - IteratorClass = self.visit(node.iter.func) - iter_args = [self.visit(arg) for arg in node.iter.args] - if IteratorClass == language.static_range: - iterator = IteratorClass(*iter_args) - static_range = range(iterator.start.value, - iterator.end.value, - iterator.step.value) - for i in static_range: - self.lscope[node.target.id] = constexpr(i) - self.visit_compound_statement(node.body) - for stmt in node.orelse: - ast.NodeVisitor.generic_visit(self, stmt) - return - - if IteratorClass is not range: - raise RuntimeError('Only `range` and `static_range` iterators are currently supported') - - # visit iterator arguments - # note: only `range` iterator is supported now - # collect lower bound (lb), upper bound (ub), and step - lb = iter_args[0] if len(iter_args) > 1 else self.visit(ast.Num(0)) - ub = iter_args[1] if len(iter_args) > 1 else self.visit(node.iter.args[0]) - step = iter_args[2] if len(iter_args) > 2 else self.visit(ast.Num(1)) - # handle negative constant step (not supported by scf.for in MLIR) - negative_step = False - if _is_constexpr(step) and step.value < 0: - step = constexpr(-step.value) - negative_step = True - lb, ub = ub, lb - lb = language.core._to_tensor(lb, self.builder) - ub = language.core._to_tensor(ub, self.builder) - step = language.core._to_tensor(step, self.builder) - # induction variable type - if not lb.dtype.is_int() or not ub.dtype.is_int() or not step.dtype.is_int(): - raise TypeError(f"For loop bounds and step must all be ints, are ({lb.dtype}, {ub.dtype}, {step.dtype})") - iv_type = language.semantic.integer_promote_impl(lb.dtype, ub.dtype) - iv_type = language.semantic.integer_promote_impl(iv_type, step.dtype) - iv_ir_type = iv_type.to_ir(self.builder) - iv_is_signed = iv_type.int_signedness == language.core.dtype.SIGNEDNESS.SIGNED - # lb/ub/step might be constexpr, we need to cast them to tensor - lb = lb.handle - ub = ub.handle - step = 
step.handle - # ForOp can only accept IndexType as lb/ub/step. Cast integer to Index - lb = self.builder.create_int_cast(lb, iv_ir_type, iv_is_signed) - ub = self.builder.create_int_cast(ub, iv_ir_type, iv_is_signed) - step = self.builder.create_int_cast(step, iv_ir_type, iv_is_signed) - # Create placeholder for the loop induction variable - iv = self.builder.create_undef(iv_ir_type) - self.set_value(node.target.id, language.core.tensor(iv, iv_type)) - - with enter_sub_region(self) as sr: - liveins, insert_block = sr - ip = self.builder.get_insertion_point() - - # create loop body block - block = self.builder.create_block() - self.builder.set_insertion_point_to_start(block) - # dry visit loop body - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - block.erase() - - # If a variable (name) is defined in both its parent & itself, then it's - # a loop-carried variable. (They must be of the same type) - init_args = [] - yields = [] - names = [] - for name in self.local_defs: - if name in liveins: - assert _is_triton_tensor(self.local_defs[name]), f'{name} is not tensor' - assert _is_triton_tensor(liveins[name]) - assert self.local_defs[name].type == liveins[name].type,\ - f'Loop-carried variable {name} has initial type {liveins[name].type} '\ - f'but is re-assigned to {self.local_defs[name].type} in loop! '\ - f'Please make sure that the type stays consistent.' - - names.append(name) - init_args.append(language.core._to_tensor(liveins[name], self.builder)) - yields.append(language.core._to_tensor(self.local_defs[name], self.builder)) - - # create ForOp - self.builder.restore_insertion_point(ip) - for_op = self.builder.create_for_op(lb, ub, step, [arg.handle for arg in init_args]) - - self.scf_stack.append(node) - self.builder.set_insertion_point_to_start(for_op.get_body(0)) - for i, name in enumerate(names): - self.set_value(name, language.core.tensor(for_op.get_body(0).arg(i + 1), yields[i].type)) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - yields = [] - for name in self.local_defs: - if name in liveins: - yields.append(language.core._to_tensor(self.local_defs[name], self.builder)) - - # create YieldOp - if len(yields) > 0: - self.builder.create_yield_op([y.handle for y in yields]) - for_op_region = for_op.get_body(0).get_parent() - assert for_op_region.size() == 1, "We use SCF, so the loop body should only have one block" - - # update induction variable with actual value, and replace all uses - self.builder.set_insertion_point_to_start(for_op.get_body(0)) - iv = for_op.get_induction_var() - if negative_step: - iv = self.builder.create_sub(ub, iv) - iv = self.builder.create_add(iv, lb) - self.lscope[node.target.id].handle.replace_all_uses_with(iv) - self.set_value(node.target.id, language.core.tensor(iv, iv_type)) - - # update lscope & local_defs (ForOp defines new values) - for i, name in enumerate(names): - self.set_value(name, language.core.tensor(for_op.get_result(i), yields[i].type)) - - for stmt in node.orelse: - assert False, "Don't know what to do with else after for" - ast.NodeVisitor.generic_visit(self, stmt) - - def visit_Slice(self, node): - lower = self.visit(node.lower) - upper = self.visit(node.upper) - step = self.visit(node.step) - return slice(lower, upper, step) - - def visit_Index(self, node): - return self.visit(node.value) - - def visit_keyword(self, node) -> Tuple[str, Any]: - return node.arg, self.visit(node.value) - - def visit_Assert(self, node) -> Any: - if not self.debug: - return - test = 
self.visit(node.test) - msg = self.visit(node.msg) - # Convert assert to triton's device_assert which happens on the device - return language.core.device_assert(test, msg, _builder=self.builder) - - def call_JitFunction(self, fn: JITFunction, args, kwargs): - args = inspect.getcallargs(fn.fn, *args, **kwargs) - args = [args[name] for name in fn.arg_names] - args = [arg if _is_triton_tensor(arg) - else constexpr(arg) for arg in args] - # generate function def - attributes = dict() - constexprs = [i for i, arg in enumerate(args) if _is_constexpr(arg)] - constants = {i: args[i] for i in constexprs} - # generate call - args = [None if i in constexprs else arg for i, arg in enumerate(args)] - arg_vals = [arg.handle for arg in args if arg is not None] - arg_types = [arg.type for arg in args if arg is not None] - fn_name = mangle_fn(fn.__name__, arg_types, constants) - # generate function def if necessary - if not self.module.has_function(fn_name): - prototype = language.function_type([], arg_types) - gscope = sys.modules[fn.fn.__module__].__dict__ - # If the callee is not set, we use the same debug setting as the caller - debug = self.debug if fn.debug is None else fn.debug - generator = CodeGenerator(self.builder.context, prototype, gscope, attributes, constants, module=self.module, function_name=fn_name, function_types=self.function_ret_types, debug=debug, noinline=fn.noinline) - generator.visit(fn.parse()) - callee_ret_type = generator.last_ret_type - self.function_ret_types[fn_name] = callee_ret_type - else: - callee_ret_type = self.function_ret_types[fn_name] - symbol = self.module.get_function(fn_name) - call_op = self.builder.call(symbol, arg_vals) - if call_op.get_num_results() == 0 or callee_ret_type is None: - return None - elif call_op.get_num_results() == 1: - return tensor(call_op.get_result(0), callee_ret_type) - else: - # should return a tuple of tl.tensor - results = [] - for i in range(call_op.get_num_results()): - results.append(tensor(call_op.get_result(i), callee_ret_type[i])) - return tuple(results) - - def visit_Call(self, node): - fn = _unwrap_if_constexpr(self.visit(node.func)) - - static_implementation = self.statically_implemented_functions.get(fn) - if static_implementation is not None: - return static_implementation(self, node) - - kws = dict(self.visit(keyword) for keyword in node.keywords) - args = [self.visit(arg) for arg in node.args] - if fn is language.core.device_assert: # TODO: this should not be so hardcoded - if not self.debug: - return - if isinstance(fn, JITFunction): - _check_fn_args(node, fn, args) - return self.call_JitFunction(fn, args, kws) - if (hasattr(fn, '__self__') and _is_triton_tensor(fn.__self__)) or language.core.is_builtin(fn): - extra_kwargs = dict(_builder=self.builder) - sig = inspect.signature(fn) - if '_generator' in sig.parameters: - extra_kwargs['_generator'] = self - return fn(*args, **extra_kwargs, **kws) - if fn in self.builtin_namespace.values(): - args = map(_unwrap_if_constexpr, args) - return fn(*args, **kws) - - def visit_Constant(self, node): - return constexpr(node.value) - - def visit_BoolOp(self, node: ast.BoolOp): - if len(node.values) != 2: - raise UnsupportedLanguageConstruct(None, node, "chained boolean operators (A or B or C) are not supported; use parentheses to split the chain.") - lhs = self.visit(node.values[0]) - rhs = self.visit(node.values[1]) - method_name = self._method_name_for_bool_op.get(type(node.op)) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST boolean operator '{}' is 
not (currently) implemented.".format(node.op.__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_bool_op: Dict[Type[ast.boolop], str] = {ast.And: 'logical_and', ast.Or: 'logical_or'} - - if sys.version_info < (3, 8): - def visit_NameConstant(self, node): - return constexpr(node.value) - - def visit_Num(self, node): - return constexpr(node.n) - - def visit_Str(self, node): - return constexpr(ast.literal_eval(node)) - - def visit_Attribute(self, node): - lhs = self.visit(node.value) - if _is_triton_tensor(lhs): - if node.attr == "T": - return language.semantic.trans(lhs, builder=self.builder) - return getattr(lhs, node.attr) - - def visit_Expr(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_NoneType(self, node): - return None - - def visit_JoinedStr(self, node): - values = list(node.values) - for i, value in enumerate(values): - if isinstance(value, ast.Constant): - values[i] = str(value.value) - elif isinstance(value, ast.FormattedValue): - conversion_code = value.conversion - evaluated = self.visit(value.value) - if not _is_constexpr(evaluated): - raise UnsupportedLanguageConstruct( - None, node, "Cannot evaluate f-string containing non-constexpr conversion values, found conversion of type " + str(type(evaluated))) - values[i] = ("{}" if conversion_code < 0 else "{!" + chr(conversion_code) + "}").format(evaluated.value) - else: - raise AssertionError("encountered unexpected node of type {} in a JoinedStr node".format(type(value))) - return ''.join(values) - - def visit(self, node): - if node is not None: - self.last_node = node - with warnings.catch_warnings(): - # The ast library added visit_Constant and deprecated some other - # methods but we can't move to that without breaking Python 3.6 and 3.7. - warnings.simplefilter("ignore", DeprecationWarning) # python 3.9 - warnings.simplefilter("ignore", PendingDeprecationWarning) # python 3.8 - return super().visit(node) - - def generic_visit(self, node): - raise UnsupportedLanguageConstruct(None, node, "unsupported AST node type: {}".format(type(node).__name__)) - - def execute_static_print(self, node: ast.Call) -> None: - # TODO: too simplistic? Perhaps do something else with non-constexpr - - kws = {name: _unwrap_if_constexpr(value) for name, value in (self.visit(keyword) for keyword in node.keywords)} - args = [_unwrap_if_constexpr(self.visit(arg)) for arg in node.args] - print(*args, **kws) - - def execute_static_assert(self, node: ast.Call) -> None: - arg_count = len(node.args) - if not (0 < arg_count <= 2) or len(node.keywords): - raise TypeError("`static_assert` requires one or two positional arguments only") - - passed = self.visit(node.args[0]) - if not isinstance(passed, bool): - raise NotImplementedError("Assertion condition could not be determined at compile-time. 
Make sure that it depends only on `constexpr` values") - if not passed: - if arg_count == 1: - message = "" - else: - try: - message = self.visit(node.args[1]) - except Exception as e: - message = "" - - raise CompileTimeAssertionFailure(None, node, _unwrap_if_constexpr(message)) - return None - - statically_implemented_functions: Dict[object, Callable[[ast.Call], Any]] = { - language.core.static_assert: execute_static_assert, - language.core.static_print: execute_static_print, - } - - -def str_to_ty(name): - if name[0] == "*": - ty = str_to_ty(name[1:]) - return language.pointer_type(ty) - tys = { - "fp8e5": language.float8e5, - "fp8e4": language.float8e4, - "fp16": language.float16, - "bf16": language.bfloat16, - "fp32": language.float32, - "fp64": language.float64, - "i1": language.int1, - "i8": language.int8, - "i16": language.int16, - "i32": language.int32, - "i64": language.int64, - "u8": language.uint8, - "u16": language.uint16, - "u32": language.uint32, - "u64": language.uint64, - "B": language.int1, - } - return tys[name] - - -def kernel_suffix(signature, specialization): - # suffix format: - # <'c' if equal to 1><'d' if divisible by 16> - suffix = '' - for i, _ in enumerate(signature): - suffix += str(i) - if i in specialization.equal_to_1: - suffix += 'c' - if i in specialization.divisible_by_16: - suffix += 'd' - return suffix - - -def ast_to_ttir(fn, signature, specialization, constants, debug): - # canonicalize signature - if isinstance(signature, str): - signature = {k: v.strip() for k, v in enumerate(signature.split(","))} - context = ir.context() - context.load_triton() - # create kernel prototype - cst_key = lambda i: fn.arg_names.index(i) if isinstance(i, str) else i - constants = {cst_key(key): value for key, value in constants.items()} - # visit kernel AST - gscope = fn.__globals__.copy() - function_name = '_'.join([fn.__name__, kernel_suffix(signature.values(), specialization)]) - tys = list(signature.values()) - new_constants = {k: True if k in tys and tys[k] == "i1" else 1 for k in specialization.equal_to_1} - new_attrs = {k: ("multiple_of", 16) for k in specialization.divisible_by_16} - all_constants = constants.copy() - all_constants.update(new_constants) - arg_types = [str_to_ty(v) for k, v in signature.items() if k not in constants] - - prototype = language.function_type([], arg_types) - generator = CodeGenerator(context, prototype, gscope=gscope, constants=all_constants, - function_name=function_name, attributes=new_attrs, - is_kernel=True, debug=debug) - try: - generator.visit(fn.parse()) - except CompilationError as e: - if e.src is None: - e.set_source_code(fn.src) - raise - except Exception as e: - node = generator.last_node - if node is None: - raise - raise CompilationError(fn.src, node, repr(e)) from e - ret = generator.module - # module takes ownership of the context - ret.context = context - return ret diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py deleted file mode 100644 index 8c1dacbe12bd..000000000000 --- a/python/triton/compiler/compiler.py +++ /dev/null @@ -1,577 +0,0 @@ -from __future__ import annotations - -import functools -import hashlib -import json -import os -import re -import subprocess -import tempfile -from collections import namedtuple -from pathlib import Path -from typing import Any, Tuple - -import triton -import triton._C.libtriton.triton as _triton -from ..runtime import driver -# TODO: runtime.errors -from ..runtime.autotuner import OutOfResources -from ..runtime.cache import 
get_cache_manager -from ..tools.disasm import extract -from .code_generator import ast_to_ttir -from .make_launcher import make_stub - - -def inline_triton_ir(mod): - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_inliner_pass() - pm.run(mod) - return mod - - -def ttir_compute_capability_rewrite(mod, arch): - # For hardware without support, we must rewrite all load/store - # with block (tensor) pointers into tensors of pointers - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - if _is_cuda(arch): - pm.add_rewrite_tensor_pointer_pass(arch) - pm.run(mod) - return mod - - -def optimize_ttir(mod, arch): - mod = inline_triton_ir(mod) - mod = ttir_compute_capability_rewrite(mod, arch) - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_inliner_pass() - pm.add_triton_combine_pass() - pm.add_canonicalizer_pass() - pm.add_cse_pass() - pm.add_licm_pass() - pm.add_symbol_dce_pass() - pm.run(mod) - return mod - - -def ttir_to_ttgir(mod, num_warps): - pm = _triton.ir.pass_manager(mod.context) - pm.add_convert_triton_to_tritongpu_pass(num_warps) - pm.run(mod) - return mod - - -def optimize_ttgir(mod, num_stages, arch): - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_tritongpu_coalesce_pass() - pm.add_tritongpu_remove_layout_conversions_pass() - if isinstance(arch, int): - pm.add_tritongpu_accelerate_matmul_pass(arch) - pm.add_tritongpu_remove_layout_conversions_pass() - pm.add_tritongpu_optimize_dot_operands_pass() - pm.add_tritongpu_pipeline_pass(num_stages) - pm.add_tritongpu_prefetch_pass() - pm.add_tritongpu_optimize_dot_operands_pass() - pm.add_tritongpu_remove_layout_conversions_pass() - pm.add_tritongpu_decompose_conversions_pass() - pm.add_tritongpu_reorder_instructions_pass() - pm.add_cse_pass() - pm.add_symbol_dce_pass() - pm.run(mod) - return mod - - -def _add_external_libs(mod, libs): - for name, path in libs.items(): - if len(name) == 0 or len(path) == 0: - return - _triton.add_external_libs(mod, list(libs.keys()), list(libs.values())) - - -def ttgir_to_llir(mod, extern_libs, arch): - if extern_libs: - _add_external_libs(mod, extern_libs) - # TODO: separate tritongpu_to_llvmir for different backends - if _is_cuda(arch): - return _triton.translate_triton_gpu_to_llvmir(mod, arch, False) - else: - return _triton.translate_triton_gpu_to_llvmir(mod, 0, True) - - -# PTX translation - -@functools.lru_cache() -def ptx_get_version(cuda_version) -> int: - ''' - Get the highest PTX version supported by the current CUDA driver. 
- ''' - assert isinstance(cuda_version, str) - major, minor = map(int, cuda_version.split('.')) - if major == 12: - return 80 + minor - if major == 11: - return 70 + minor - if major == 10: - return 63 + minor - raise RuntimeError("Triton only supports CUDA 10.0 or higher") - - -@functools.lru_cache() -def path_to_ptxas(): - base_dir = os.path.join(os.path.dirname(__file__), os.pardir) - paths = [ - os.environ.get("TRITON_PTXAS_PATH", ""), - os.path.join(base_dir, "third_party", "cuda", "bin", "ptxas") - ] - - for ptxas in paths: - if os.path.exists(ptxas) and os.path.isfile(ptxas): - result = subprocess.check_output([ptxas, "--version"], stderr=subprocess.STDOUT) - if result is not None: - version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) - if version is not None: - return ptxas, version.group(1) - raise RuntimeError("Cannot find ptxas") - - -def llir_to_ptx(mod: Any, arch: int, ptx_version: int = None) -> str: - ''' - Translate TritonGPU module to PTX code. - :param mod: a TritonGPU dialect module - :return: PTX code - ''' - if ptx_version is None: - _, cuda_version = path_to_ptxas() - ptx_version = ptx_get_version(cuda_version) - return _triton.translate_llvmir_to_ptx(mod, arch, ptx_version) - - -def ptx_to_cubin(ptx: str, arch: int): - ''' - Compile TritonGPU module to cubin. - :param ptx: ptx code - :param arch: compute capability - :return: str - ''' - ptxas, _ = path_to_ptxas() - return _triton.compile_ptx_to_cubin(ptx, ptxas, arch) - - -# AMDGCN translation - -def get_amdgcn_bitcode_paths(arch): - gpu_arch_agnostic_bitcode_libraries = ["opencl.bc", - "ocml.bc", - "ockl.bc", - "oclc_finite_only_off.bc", - "oclc_daz_opt_off.bc", - "oclc_correctly_rounded_sqrt_on.bc", - "oclc_unsafe_math_off.bc", - "oclc_wavefrontsize64_on.bc"] - - gfx_arch = arch[1] - gfx_arch_id = re.search('gfx(\\w+)', gfx_arch).group(1).strip() - - gpu_arch_specific_bitcode_library = 'oclc_isa_version_' + gfx_arch_id + ".bc" - bitcode_path_dir = os.path.join(Path(__file__).parent.resolve(), "third_party/rocm/lib/bitcode/") - - amdgcn_bitcode_paths = {} - i = 1 - for bc_lib in gpu_arch_agnostic_bitcode_libraries: - bc_path = bitcode_path_dir + bc_lib - if os.path.exists(bc_path): - amdgcn_bitcode_paths['library_' + str(i)] = bc_path - i += 1 - bc_gfx_path = bitcode_path_dir + gpu_arch_specific_bitcode_library - if os.path.exists(bc_gfx_path): - amdgcn_bitcode_paths['library_' + str(i)] = bc_gfx_path - - return amdgcn_bitcode_paths - - -def get_amdgpu_arch_fulldetails(): - """ - get the amdgpu full ISA details for compiling: - i.e., arch_triple: amdgcn-amd-amdhsa; arch_name: gfx906; arch_features: sramecc+:xnack- - """ - try: - # TODO: package rocm.cc with Triton - rocm_path_dir = os.getenv("ROCM_PATH", default="/opt/rocm") - rocminfo = subprocess.check_output(rocm_path_dir + '/bin/rocminfo').decode() - gfx_arch_details = re.search('amd.*', rocminfo).group(0).strip().split('--') - arch_triple = gfx_arch_details[0] - arch_name_features = gfx_arch_details[1].split(':') - arch_name = arch_name_features[0] - arch_features = "" - - if (len(arch_name_features) == 3): - arch_features = "+" + re.search('\\w+', arch_name_features[1]).group(0) + ","\ - "-" + re.search('\\w+', arch_name_features[2]).group(0) - return [arch_triple, arch_name, arch_features] - except BaseException: - return None - - -def llir_to_amdgcn_and_hsaco(mod: Any, gfx_arch: str, gfx_triple: str, gfx_features: str) -> Tuple[str, str]: - ''' - Translate TritonGPU module to HSACO code based on full
details of gpu architecture. - :param mod: a TritonGPU dialect module - :return: - - AMDGCN code - - Path to HSACO object - ''' - return _triton.translate_llvmir_to_hsaco(mod, gfx_arch, gfx_triple, gfx_features) - - -# ------------------------------------------------------------------------------ -# compiler -# ------------------------------------------------------------------------------ -def get_kernel_name(src: str, pattern: str) -> str: - ''' - Get kernel name from PTX code. - This Kernel name is required when launching the kernel. - ''' - # There is a name mangling in PTX codegen, so the original kernel names in Triton IR are not available in PTX/cubin. - assert src - for line in src.split('\n'): - line = line.strip() - if line.startswith(pattern): - return line.split()[-1] - - -def convert_type_repr(x): - match = re.search(r'!tt\.ptr<(.*)>', x) - if match is not None: - return '*' + convert_type_repr(match.group(1)) - return x - - -def make_hash(fn, arch, **kwargs): - if isinstance(fn, triton.runtime.JITFunction): - configs = kwargs["configs"] - signature = kwargs["signature"] - constants = kwargs.get("constants", dict()) - num_warps = kwargs.get("num_warps", 4) - num_stages = kwargs.get("num_stages", 3) - debug = kwargs.get("debug", False) - # Get unique key for the compiled code - get_conf_key = lambda conf: (sorted(conf.divisible_by_16), sorted(conf.equal_to_1)) - configs_key = [get_conf_key(conf) for conf in configs] - key = f"{fn.cache_key}-{''.join(signature.values())}-{configs_key}-{constants}-{num_warps}-{num_stages}-{debug}-{arch}" - return hashlib.md5(key.encode("utf-8")).hexdigest() - assert isinstance(fn, str) - return hashlib.md5((Path(fn).read_text() + triton.runtime.jit.version_key()).encode("utf-8")).hexdigest() - - -# - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func, -# and any following whitespace -# - (public\s+)? : optionally match the keyword public and any following whitespace -# - (@\w+) : match an @ symbol followed by one or more word characters -# (letters, digits, or underscores), and capture it as group 1 (the function name) -# - (\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\)) : match a pair of parentheses enclosing -# zero or more arguments separated by commas, and capture it as group 2 (the argument list) -mlir_prototype_pattern = r'^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$' -ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)" -prototype_pattern = { - "ttir": mlir_prototype_pattern, - "ttgir": mlir_prototype_pattern, - "ptx": ptx_prototype_pattern, -} - -mlir_arg_type_pattern = r'%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?' 
-ptx_arg_type_pattern = r"\.param\s+\.(\w+)" -arg_type_pattern = { - "ttir": mlir_arg_type_pattern, - "ttgir": mlir_arg_type_pattern, - "ptx": ptx_arg_type_pattern, -} - -ttgir_num_warps_pattern = r'"triton_gpu.num-warps"\s?=\s?(\d+)\s?:' - - -def _get_jsonable_constants(constants): - def _is_jsonable(x): - try: - json.dumps(x) - return True - except (TypeError, OverflowError): - return False - serialized_constants = {} - for constant in constants: - if _is_jsonable(constants[constant]): - serialized_constants[constant] = constants[constant] - return serialized_constants - - -def parse_mlir_module(path, context): - module = _triton.ir.parse_mlir_module(path, context) - # module takes ownership of the context - module.context = context - return module - - -instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"], defaults=[set(), set()]) - - -# TODO: architecture descriptor class -def _is_cuda(arch): - return isinstance(arch, int) - - -def get_architecture_descriptor(capability): - try: - import torch - except ImportError: - raise ImportError("Triton requires PyTorch to be installed") - if capability is None: - if torch.version.hip is None: - device = triton.runtime.jit.get_current_device() - capability = triton.runtime.jit.get_device_capability(device) - capability = capability[0] * 10 + capability[1] - else: - capability = get_amdgpu_arch_fulldetails() - return capability - - -def add_rocm_stages(arch, extern_libs, stages): - extern_libs.update(get_amdgcn_bitcode_paths(arch)) - - for key in list(extern_libs): - if extern_libs[key] == '' or extern_libs[key] is None: - extern_libs.pop(key) - - gfx_arch_full_details = arch - gfx_arch = os.environ.get('MI_GPU_ARCH', gfx_arch_full_details[1]) - if gfx_arch is None: - raise RuntimeError('gfx_arch is None (not specified)') - stages["amdgcn"] = (lambda path: Path(path).read_text(), - lambda src: llir_to_amdgcn_and_hsaco(src, gfx_arch, - gfx_arch_full_details[0], - gfx_arch_full_details[2])) - - -def add_cuda_stages(arch, extern_libs, stages): - - stages["ptx"] = (lambda path: Path(path).read_text(), - lambda src: llir_to_ptx(src, arch)) - stages["cubin"] = (lambda path: Path(path).read_bytes(), - lambda src: ptx_to_cubin(src, arch)) - - -def compile(fn, **kwargs): - arch = get_architecture_descriptor(kwargs.get("cc", None)) - is_cuda = _is_cuda(arch) - context = _triton.ir.context() - asm = dict() - constants = kwargs.get("constants", dict()) - num_warps = kwargs.get("num_warps", 4) - num_stages = kwargs.get("num_stages", 3 if is_cuda and arch >= 75 else 2) - extern_libs = kwargs.get("extern_libs", dict()) - if extern_libs is None: - extern_libs = dict() - debug = kwargs.get("debug", False) - # build compilation stages - stages = dict() - stages["ast"] = (lambda path: fn, None) - stages["ttir"] = (lambda path: parse_mlir_module(path, context), - lambda src: optimize_ttir(ast_to_ttir(src, signature, configs[0], constants, debug=debug), arch)) - stages["ttgir"] = (lambda path: parse_mlir_module(path, context), - lambda src: optimize_ttgir(ttir_to_ttgir(src, num_warps), num_stages, arch)) - stages["llir"] = (lambda path: Path(path).read_text(), - lambda src: ttgir_to_llir(src, extern_libs, arch)) - if is_cuda: - add_cuda_stages(arch, extern_libs, stages) - else: - add_rocm_stages(arch, extern_libs, stages) - - # find out the signature of the function - if isinstance(fn, triton.runtime.JITFunction): - configs = kwargs.get("configs", None) - signature = kwargs["signature"] - if configs is None: - configs = 
[instance_descriptor()] - assert len(configs) == 1 - kwargs["configs"] = configs - name = fn.__name__ - first_stage = 0 - if isinstance(signature, str): - signature = {k: v.strip() for k, v in enumerate(signature.split(","))} - kwargs["signature"] = signature - else: - assert isinstance(fn, str) - _, ir = os.path.basename(fn).split(".") - src = Path(fn).read_text() - import re - match = re.search(prototype_pattern[ir], src, re.MULTILINE) - name, signature = match.group(1), match.group(2) - types = re.findall(arg_type_pattern[ir], signature) - if ir == 'ttgir': - num_warps_matches = re.findall(ttgir_num_warps_pattern, src) - assert len(num_warps_matches) == 1, "Expected exactly one match for num_warps" - assert "num_warps" not in kwargs or int(num_warps_matches[0]) == num_warps, "num_warps in ttgir does not match num_warps in compile" - num_warps = int(num_warps_matches[0]) - param_tys = [convert_type_repr(ty) for ty in types] - signature = {k: v for k, v in enumerate(param_tys)} - first_stage = list(stages.keys()).index(ir) - - # cache manager - so_path = make_stub(name, signature, constants) - # create cache manager - fn_cache_manager = get_cache_manager(make_hash(fn, arch, **kwargs)) - # determine name and extension type of provided function - if isinstance(fn, triton.runtime.JITFunction): - name, ext = fn.__name__, "ast" - else: - name, ext = os.path.basename(fn).split(".") - - # load metadata if any - metadata = None - metadata_filename = f"{name}.json" - - # The group is addressed by the metadata - metadata_group = fn_cache_manager.get_group( - metadata_filename - ) or {} - - metadata_path = metadata_group.get(metadata_filename) - - if metadata_path is not None: - with open(metadata_path) as f: - metadata = json.load(f) - else: - metadata = {"num_warps": num_warps, - "num_stages": num_stages, - "constants": _get_jsonable_constants(constants), - "debug": debug} - if ext == "ptx": - assert "shared" in kwargs, "ptx compilation must provide shared memory size" - metadata["shared"] = kwargs["shared"] - - first_stage = list(stages.keys()).index(ext) - asm = dict() - module = fn - # run compilation pipeline and populate metadata - for ir, (parse, compile_kernel) in list(stages.items())[first_stage:]: - ir_filename = f"{name}.{ir}" - - if ir == ext: - next_module = parse(fn) - else: - path = metadata_group.get(ir_filename) - if path is None: - next_module = compile_kernel(module) - if ir == "amdgcn": - extra_file_name = f"{name}.hsaco_path" - metadata_group[ir_filename] = fn_cache_manager.put(next_module[0], ir_filename) - metadata_group[extra_file_name] = fn_cache_manager.put(next_module[1], extra_file_name) - else: - metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename) - fn_cache_manager.put(next_module, ir_filename) - else: - if ir == "amdgcn": - extra_file_name = f"{name}.hsaco_path" - hasco_path = metadata_group.get(extra_file_name) - assert hasco_path is not None, "Expected to have hsaco_path in metadata when we have the amdgcn" - next_module = (parse(path), parse(hasco_path)) - else: - next_module = parse(path) - - if ir == "cubin": - asm[ir] = next_module - elif ir == "amdgcn": - asm[ir] = str(next_module[0]) - else: - asm[ir] = str(next_module) - if ir == "llir" and "shared" not in metadata: - metadata["shared"] = _triton.get_shared_memory_size(module) - if ir == "ptx": - metadata["name"] = get_kernel_name(next_module, pattern='// .globl') - if ir == "amdgcn": - metadata["name"] = get_kernel_name(next_module[0], pattern='.globl') - asm["hsaco_path"] = 
next_module[1] - module = next_module - # write-back metadata, if it didn't come from the cache - if metadata_path is None: - metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata), metadata_filename, binary=False) - fn_cache_manager.put_group(metadata_filename, metadata_group) - - # return handle to compiled kernel - return CompiledKernel(fn, so_path, metadata, asm) - - -class CompiledKernel: - - # Hooks for external tools to monitor the execution of triton kernels - launch_enter_hook = None - launch_exit_hook = None - - def __init__(self, fn, so_path, metadata, asm): - # initialize launcher - import importlib.util - spec = importlib.util.spec_from_file_location("__triton_launcher", so_path) - mod = importlib.util.module_from_spec(spec) - self.fn = fn - spec.loader.exec_module(mod) - self.c_wrapper = getattr(mod, "launch") - # initialize metadata - self.shared = metadata["shared"] - self.num_warps = metadata["num_warps"] - self.num_stages = metadata["num_stages"] - self.constants = metadata["constants"] - # initialize asm dict - self.asm = asm - # binaries are lazily initialized - # because it involves doing runtime things - # (e.g., checking amount of shared memory on current device) - self.metadata = metadata - self.cu_module = None - self.cu_function = None - - def _init_handles(self): - if self.cu_module is not None: - return - device = triton.runtime.jit.get_current_device() - bin_path = { - driver.HIP: "hsaco_path", - driver.CUDA: "cubin" - }[driver.backend] - max_shared = driver.utils.get_device_properties(device)["max_shared_mem"] - if self.shared > max_shared: - raise OutOfResources(self.shared, max_shared, "shared memory") - mod, func, n_regs, n_spills = driver.utils.load_binary(self.metadata["name"], self.asm[bin_path], self.shared, device) - - self.n_spills = n_spills - self.n_regs = n_regs - self.cu_module = mod - self.cu_function = func - - def __getattribute__(self, name): - if name == 'c_wrapper': - self._init_handles() - return super().__getattribute__(name) - - def __getitem__(self, grid): - self._init_handles() - - def runner(*args, stream=None): - if stream is None: - stream = triton.runtime.jit.get_cuda_stream() - self.c_wrapper(grid[0], grid[1], grid[2], self.num_warps, self.shared, stream, self.cu_function, - CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, self, *args) - return runner - - def get_sass(self, fun=None): - if 'sass' in self.asm: - return self.asm['sass'] - fd, path = tempfile.mkstemp() - try: - with open(fd, 'wb') as cubin: - cubin.write(self.asm['cubin']) - self.sass = extract(path, fun) - finally: - os.remove(path) - self.asm['sass'] = self.sass - return self.sass diff --git a/python/triton/compiler/errors.py b/python/triton/compiler/errors.py deleted file mode 100644 index 2930117b5170..000000000000 --- a/python/triton/compiler/errors.py +++ /dev/null @@ -1,52 +0,0 @@ -import ast -from typing import Optional, Union - - -class CompilationError(Exception): - source_line_count_max_in_message = 12 - - def _format_message(self) -> str: - node = self.node - if self.src is None: - source_excerpt = " " - else: - source_excerpt = self.src.split('\n')[:node.lineno][-self.source_line_count_max_in_message:] - if source_excerpt: - source_excerpt.append(' ' * node.col_offset + '^') - source_excerpt = '\n'.join(source_excerpt) - else: - source_excerpt = " " - - message = "at {}:{}:{}".format(node.lineno, node.col_offset, source_excerpt) - if self.error_message: - message += '\n' + self.error_message - return message - - 
def __init__(self, src: Optional[str], node: ast.AST, error_message: Union[str, None]): - self.src = src - self.node = node - self.error_message = error_message - self.message = self._format_message() - - def set_source_code(self, src: Optional[str]): - self.src = src - self.message = self._format_message() - - def __str__(self): - return self.message - - def __repr__(self): - return "{}({!r})".format(type(self).__name__, self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return type(self), (self.src, self.node, self.error_message) - - -class CompileTimeAssertionFailure(CompilationError): - """Specific exception for failed tests in `static_assert` invocations""" - pass - - -class UnsupportedLanguageConstruct(CompilationError): - pass diff --git a/python/triton/compiler/make_launcher.py b/python/triton/compiler/make_launcher.py deleted file mode 100644 index 3da8ddccf5c5..000000000000 --- a/python/triton/compiler/make_launcher.py +++ /dev/null @@ -1,373 +0,0 @@ -import hashlib -import os -import tempfile - -from ..common import _build -from ..runtime.cache import get_cache_manager -from ..runtime.jit import version_key - - -def is_hip(): - import torch - return torch.version.hip is not None - - -# ----- stub -------- - - -def make_so_cache_key(version_hash, signature, constants): - # Get unique key for the compiled code - signature = {k: 'ptr' if v[0] == '*' else v for k, v in signature.items()} - key = f"{version_hash}-{''.join(signature.values())}{constants}" - key = hashlib.md5(key.encode("utf-8")).hexdigest() - return key - - -def make_stub(name, signature, constants): - # name of files that are cached - so_cache_key = make_so_cache_key(version_key(), signature, constants) - so_cache_manager = get_cache_manager(so_cache_key) - so_name = f"{name}.so" - # retrieve stub from cache if it exists - cache_path = so_cache_manager.get_file(so_name) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src = generate_launcher(constants, signature) - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build(name, src_path, tmpdir) - with open(so, "rb") as f: - return so_cache_manager.put(f.read(), so_name, binary=True) - else: - return cache_path - -# ----- source code generation -------- - - -def ty_to_cpp(ty): - if ty[0] == '*': - return "hipDeviceptr_t" if is_hip() else "CUdeviceptr" - return { - "i1": "int32_t", - "i8": "int8_t", - "i16": "int16_t", - "i32": "int32_t", - "i64": "int64_t", - "u32": "uint32_t", - "u64": "uint64_t", - "fp16": "float", - "bf16": "float", - "fp32": "float", - "f32": "float", - "fp64": "double", - }[ty] - - -def generate_launcher(constants, signature): - arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items()) - - def _extracted_type(ty): - if ty[0] == '*': - return "PyObject*" - return { - 'i1': 'int32_t', - 'i32': 'int32_t', - 'i64': 'int64_t', - 'u32': 'uint32_t', - 'u64': 'uint64_t', - 'fp16': 'float', - 'bf16': 'float', - 'fp32': 'float', - 'f32': 'float', - 'fp64': 'double', - }[ty] - - def format_of(ty): - return { - "PyObject*": "O", - "float": "f", - "double": "d", - "long": "l", - "uint32_t": "I", - "int32_t": "i", - "uint64_t": "K", - "int64_t": "L", - }[ty] - - format = "iiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()]) - - # generate glue code - if is_hip(): - src = f""" - #define __HIP_PLATFORM_AMD__ - #include - #include - #include - - static inline void gpuAssert(hipError_t code, 
const char *file, int line) - {{ - if (code != HIP_SUCCESS) - {{ - const char* prefix = "Triton Error [HIP]: "; - const char* str = hipGetErrorString(code); - char err[1024] = {{0}}; - snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str ); - PyErr_SetString(PyExc_RuntimeError, err); - }} - }} - - #define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} - - static void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, hipStream_t stream, hipFunction_t function, {arg_decls}) {{ - void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; - if (gridX*gridY*gridZ > 0) {{ - HIP_CHECK(hipModuleLaunchKernel(function, gridX, gridY, gridZ, 64*num_warps, 1, 1, shared_memory, stream, params, 0)); - }} - }} - - typedef struct _DevicePtrInfo {{ - hipDeviceptr_t dev_ptr; - bool valid; - }} DevicePtrInfo; - - static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ - DevicePtrInfo ptr_info; - ptr_info.dev_ptr = 0; - ptr_info.valid = true; - - if (PyLong_Check(obj)) {{ - ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj); - return ptr_info; - }} - - if (obj == Py_None) {{ - // valid nullptr - return ptr_info; - }} - - PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); - - if (ptr) {{ - PyObject *empty_tuple = PyTuple_New(0); - PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); - Py_DECREF(empty_tuple); - Py_DECREF(ptr); - - if (!PyLong_Check(ret)) {{ - PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); - ptr_info.valid = false; - return ptr_info; - }} - - ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret); - - if (!ptr_info.dev_ptr) - return ptr_info; - - uint64_t dev_ptr; - hipError_t status = hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); - if (status == hipErrorInvalidValue) {{ - PyErr_Format(PyExc_ValueError, - "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); - ptr_info.valid = false; - }} - - ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr; - return ptr_info; - }} - - PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); - return ptr_info; - }} - - static PyObject* launch(PyObject* self, PyObject* args) {{ - - int gridX, gridY, gridZ; - uint64_t _stream; - uint64_t _function; - int num_warps; - int shared_memory; - PyObject *launch_enter_hook = NULL; - PyObject *launch_exit_hook = NULL; - PyObject *compiled_kernel = NULL; - - {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} - if (!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel, {', '.join(f"&_arg{i}" for i, ty in signature.items())})) {{ - return NULL; - }} - - if (launch_enter_hook != Py_None) {{ - PyObject_CallObject(launch_enter_hook, args); - }} - - // raise exception asap - {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; - _launch(gridX, gridY, gridZ, num_warps, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}" for i, ty in signature.items())}); - if (launch_exit_hook != Py_None) {{ - PyObject_CallObject(launch_exit_hook, args); - }} - if (PyErr_Occurred()) {{ - return NULL; - }} - - // return None - Py_INCREF(Py_None); 
- return Py_None; - }} - - static PyMethodDef ModuleMethods[] = {{ - {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, - {{NULL, NULL, 0, NULL}} // sentinel - }}; - - static struct PyModuleDef ModuleDef = {{ - PyModuleDef_HEAD_INIT, - \"__triton_launcher\", - NULL, //documentation - -1, //size - ModuleMethods - }}; - - PyMODINIT_FUNC PyInit___triton_launcher(void) {{ - PyObject *m = PyModule_Create(&ModuleDef); - if(m == NULL) {{ - return NULL; - }} - PyModule_AddFunctions(m, ModuleMethods); - return m; - }} - """ - else: - src = f""" -#include \"cuda.h\" -#include <stdbool.h> -#include <Python.h> - -static inline void gpuAssert(CUresult code, const char *file, int line) -{{ - if (code != CUDA_SUCCESS) - {{ - const char* prefix = "Triton Error [CUDA]: "; - const char* str; - cuGetErrorString(code, &str); - char err[1024] = {{0}}; - strcat(err, prefix); - strcat(err, str); - PyErr_SetString(PyExc_RuntimeError, err); - }} -}} - -#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} - -static void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{ - void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; - if(gridX*gridY*gridZ > 0){{ - CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0)); - }} -}} - -typedef struct _DevicePtrInfo {{ - CUdeviceptr dev_ptr; - bool valid; -}} DevicePtrInfo; - -static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ - DevicePtrInfo ptr_info; - ptr_info.dev_ptr = 0; - ptr_info.valid = true; - if (PyLong_Check(obj)) {{ - ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj); - return ptr_info; - }} - if (obj == Py_None) {{ - // valid nullptr - return ptr_info; - }} - PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); - if(ptr){{ - PyObject *empty_tuple = PyTuple_New(0); - PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); - Py_DECREF(empty_tuple); - Py_DECREF(ptr); - if (!PyLong_Check(ret)) {{ - PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); - ptr_info.valid = false; - return ptr_info; - }} - ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret); - if(!ptr_info.dev_ptr) - return ptr_info; - uint64_t dev_ptr; - int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); - if (status == CUDA_ERROR_INVALID_VALUE) {{ - PyErr_Format(PyExc_ValueError, - "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); - ptr_info.valid = false; - }} - ptr_info.dev_ptr = dev_ptr; - Py_DECREF(ret); // Thanks ChatGPT!
- return ptr_info; - }} - PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); - return ptr_info; -}} - -static PyObject* launch(PyObject* self, PyObject* args) {{ - int gridX, gridY, gridZ; - uint64_t _stream; - uint64_t _function; - int num_warps; - int shared_memory; - PyObject *launch_enter_hook = NULL; - PyObject *launch_exit_hook = NULL; - PyObject *compiled_kernel = NULL; - {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} - if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel, {', '.join(f"&_arg{i}" for i, ty in signature.items())})) {{ - return NULL; - }} - - if (launch_enter_hook != Py_None) {{ - PyObject_CallObject(launch_enter_hook, args); - }} - - - // raise exception asap - {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; - _launch(gridX, gridY, gridZ, num_warps, shared_memory, (CUstream)_stream, (CUfunction)_function, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())}); - - if (launch_exit_hook != Py_None) {{ - PyObject_CallObject(launch_exit_hook, args); - }} - - if(PyErr_Occurred()) {{ - return NULL; - }} - // return None - Py_INCREF(Py_None); - return Py_None; -}} - -static PyMethodDef ModuleMethods[] = {{ - {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, - {{NULL, NULL, 0, NULL}} // sentinel -}}; - -static struct PyModuleDef ModuleDef = {{ - PyModuleDef_HEAD_INIT, - \"__triton_launcher\", - NULL, //documentation - -1, //size - ModuleMethods -}}; - -PyMODINIT_FUNC PyInit___triton_launcher(void) {{ - PyObject *m = PyModule_Create(&ModuleDef); - if(m == NULL) {{ - return NULL; - }} - PyModule_AddFunctions(m, ModuleMethods); - return m; -}} -""" - return src diff --git a/python/triton/debugger/__init__.py b/python/triton/debugger/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/triton/debugger/core.py b/python/triton/debugger/core.py deleted file mode 100644 index 82f3f43a25a0..000000000000 --- a/python/triton/debugger/core.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Tuple - -import dataclasses - - -@dataclasses.dataclass -class ExecutionContext: - program_id: Tuple[int] - program_size: Tuple[int] diff --git a/python/triton/debugger/debugger.py b/python/triton/debugger/debugger.py deleted file mode 100644 index 5c5b97292fac..000000000000 --- a/python/triton/debugger/debugger.py +++ /dev/null @@ -1,170 +0,0 @@ -import itertools -import random -from typing import Tuple - -import triton -import triton.language as tl -from .core import ExecutionContext -from .memory_map import MemoryMap -from .tl_lang import (TritonLangProxy, WrappedTensor, _primitive_to_tensor, - debugger_constexpr) -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch -tl_method_backup = {} - - -def get_proxy_method(proxy, name): - method = getattr(proxy, name) - - def fun(*args, **kwarg): - return method(*args, **kwarg) - - return fun - - -def attach_triton(module, proxy): - method_list = [func for func in dir(TritonLangProxy) if func[0] != "_"] - for name in method_list: - if hasattr(module, name): - attr = getattr(module, name) - tl_method_backup[name] = attr - if callable(attr): - setattr(module, name, get_proxy_method(proxy, name)) - else: - 
setattr(module, name, getattr(proxy, name)) - - -def detach_triton(module): - for name, method in tl_method_backup.items(): - setattr(module, name, method) - - -def program_ids_from_grid(grid: Tuple[int, ...]) -> Tuple[int, ...]: - # reverse the grid dimensions and generate the range for each dimension - reversed_grid = reversed(grid) - ranges_for_each_dimension = [range(dim) for dim in reversed_grid] - - # gen all combinations - index_combinations = list(itertools.product(*ranges_for_each_dimension)) - random.shuffle(index_combinations) - - for index_combination in index_combinations: - yield index_combination - - -class DebuggerFunction: - def __init__(self, func, grid=(1,)): - self.func = func - self.grid = grid - - def _is_constexpr(self, name): - return name in self.func.__annotations__ and self.func.__annotations__[name] is triton.language.core.constexpr - - def _get_constexpr(self): - result = [] - for name, annotation in self.func.__annotations__.items(): - if annotation is triton.language.core.constexpr: - result.append(name) - return result - - def _assert_constexpr(self, **kwargs): - constexp = self._get_constexpr() - missing = [i for i in constexp if i not in kwargs.keys()] - assert len(missing) == 0, f"You must specify constexpr {missing}" - - def _get_grid(self, **kwargs): - if callable(self.grid): - return self.grid(kwargs) - else: - return self.grid - - def __call__(self, *args, **kwargs): - self._assert_constexpr(**kwargs) - - memory = MemoryMap() - - def convert_arg(v): - name, arg = v - if torch.is_tensor(arg): - ptr = memory.add_tensor(arg) - return WrappedTensor(torch.tensor([ptr], dtype=torch.int64, device="cuda")) - if self._is_constexpr(name): - return debugger_constexpr(arg) - return WrappedTensor(_primitive_to_tensor(arg)) - - new_args = tuple(map(convert_arg, zip(self.func.__code__.co_varnames, args))) - new_kwargs = {k: convert_arg((k, v)) for (k, v) in kwargs.items() if k not in ["num_warps", "num_stages"]} - - grid = self._get_grid(**kwargs) - for program_id in program_ids_from_grid(grid): - proxy = TritonLangProxy(memory, ExecutionContext(program_id, grid)) - attach_triton(tl, proxy) - self.func(*new_args, **new_kwargs) - detach_triton(tl) - - -class GridSelector: - """ - Entry point of the debugger - """ - - def __init__(self, func): - version = torch.__version__ - assert version[0] == "2", f"Triton Debugger only supports torch >= 2.0, using {version}" - self.func = func - - def __getitem__(self, grid): - return DebuggerFunction(self.func, grid) - - def __call__(self, *args, **kwargs): - return DebuggerFunction(self.func)(*args, **kwargs) - - -class AutotuneGridSelector: - def __init__(self, func, autotune_params): - self.func = func - self.autotune_params = autotune_params - - def __getitem__(self, grid): - return AutotuneRunner(self.func, self.autotune_params, grid) - - def __call__(self, *args, **kwargs): - return AutotuneRunner(self.func, self.autotune_params)(*args, **kwargs) - - -class AutotuneRunner: - def __init__(self, func, autotune_params, grid=None): - self.func = func - self.autotune_params = autotune_params - self.grid = grid - - def __call__(self, *args, **kwargs): - assert len(self.autotune_params["configs"]) >= 1 - - for config in self.autotune_params["configs"][1:]: - - def convert_arg(v): - if torch.is_tensor(v): - return torch.clone(v) - return v - - new_args = tuple(map(convert_arg, args)) - new_kwargs = {k: convert_arg(v) for k, v in kwargs.items()} - if self.grid: - self.func[self.grid](*new_args, **new_kwargs, **config.kwargs) - else: 
- self.func(*new_args, **new_kwargs, **config.kwargs) - - main_config = self.autotune_params["configs"][0] - if self.grid: - self.func[self.grid](*args, **kwargs, **main_config.kwargs) - else: - self.func(*args, **kwargs, **main_config.kwargs) - - -def triton_debug_autotune(**kwars): - def wrapper(func): - return AutotuneGridSelector(func, kwars) - - return wrapper diff --git a/python/triton/debugger/memory_map.py b/python/triton/debugger/memory_map.py deleted file mode 100644 index edf4c3f77922..000000000000 --- a/python/triton/debugger/memory_map.py +++ /dev/null @@ -1,100 +0,0 @@ -import dataclasses - -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch - - -@dataclasses.dataclass -class RegisteredStorage: - storage: torch.Storage - dtype: torch.dtype - size: int - ptr: int - - @property - def end_ptr(self) -> int: - return self.ptr + self.size - - @property - def access_tensor(self) -> torch.Tensor: - return torch.tensor(self.storage, dtype=self.dtype, device=self.storage.device) - - def ensure_immutable(self): - assert self.storage.data_ptr() == self.ptr and self.storage.size() == self.size - - -class MemoryMap: - storages: [RegisteredStorage] - - def __init__(self): - self.storages = [] - - def _get_registered_storage(self, pointer: torch.Tensor): - max_pointer = torch.max(pointer).item() - min_pointer = torch.min(pointer).item() - - registered_storage = next( - filter( - lambda registered: min_pointer >= registered.ptr and max_pointer < registered.end_ptr, self.storages - ), - None, - ) - if registered_storage is None: - raise Exception("Storage not found or pointers spanning multiple tensors") - registered_storage.ensure_immutable() - return registered_storage - - def add_tensor(self, t: torch.Tensor): - storage = t.untyped_storage() - self.storages.append(RegisteredStorage(storage, t.dtype, storage.size(), storage.data_ptr())) - return t.data_ptr() - - def load( - self, - pointer: torch.Tensor, - mask: torch.Tensor = None, - other=0.0, - ): - assert pointer.is_cuda - assert 0 < pointer.dim() < 3 - assert pointer.dtype == torch.int64 - - if mask is None: - mask = torch.ones_like(pointer).bool() - assert mask.is_cuda - assert 0 < mask.dim() < 3 - assert mask.dtype == torch.bool - mask = mask.expand(pointer.size()) - - if torch.all(~mask): - # Todo: The type is wrong here, we can't determine the correct type - return torch.full_like(pointer, fill_value=other, dtype=torch.float16, device="cuda") - - registered_storage = self._get_registered_storage(pointer[mask]) - access_tensor = registered_storage.access_tensor - - index_tensor = pointer - registered_storage.ptr - - block = torch.full_like(pointer, fill_value=other, dtype=access_tensor.dtype, device="cuda") - block[mask] = access_tensor[index_tensor[mask]] - return block - - def store(self, pointer: torch.Tensor, value: torch.Tensor, mask=None): - assert 0 < pointer.dim() < 3 - assert pointer.dtype == torch.int64 - - if mask is None: - mask = torch.ones_like(pointer).bool() - assert 0 < mask.dim() < 3 - assert mask.dtype == torch.bool - mask = mask.expand(pointer.size()) - - if torch.all(~mask): - return - - registered_storage = self._get_registered_storage(pointer[mask]) - access_tensor = registered_storage.access_tensor - - index_tensor = pointer - registered_storage.ptr - access_tensor[index_tensor[mask]] = value[mask].to(access_tensor.dtype) diff --git a/python/triton/debugger/tl_lang.py b/python/triton/debugger/tl_lang.py deleted file mode 100644 index 6364b77a3803..000000000000 --- 
a/python/triton/debugger/tl_lang.py +++ /dev/null @@ -1,621 +0,0 @@ -import triton -from .core import ExecutionContext -from .memory_map import MemoryMap -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch - - -def _primitive_to_tensor(x): - """ - Converts various Python primitive data types to PyTorch tensor. - """ - tensor_args = {"device": "cuda"} - if isinstance(x, bool): - return torch.tensor([x], dtype=torch.bool, **tensor_args) - elif isinstance(x, int): - if -(2**31) <= x < 2**31: - return torch.tensor([x], dtype=torch.int32, **tensor_args) - elif -(2**63) <= x < 2**63: - return torch.tensor([x], dtype=torch.int64, **tensor_args) - else: - raise RuntimeError(f"Nonrepresentable integer {x}.") - elif isinstance(x, float): - return torch.tensor([x], dtype=torch.float32, **tensor_args) - elif torch.is_tensor(x): - return x - elif isinstance(x, WrappedTensor): - return x - elif isinstance(x, debugger_constexpr): - if x.value is None: - return None - return _primitive_to_tensor(x.value) - elif x is None: - return None - assert False, f"cannot convert {x} of type {type(x)} to tensor" - - -def _infer_tensor(func): - """ - A decorator function to harmonize function args: - - converts primitives to PyTorch tensors - - wraps PyTorch tensors with WrappedTensors - """ - def wrapper(*args): - new_args = tuple(map(lambda v: _primitive_to_tensor(v), args)) - new_args = tuple(map(lambda v: WrappedTensor(v) if torch.is_tensor(v) else v, new_args)) - - return func(*new_args) - - return wrapper - - -def _tensor_operation(func): - """ - A decorator function to unwrap WrappedTensors and debugger_constexpr before calling the function. - Can be combined with _infer_tensor decorator to harmonize args (everything to torch tensor). - """ - def wrapper(*args, **kwargs): - for arg in args: - assert not torch.is_tensor(arg), "unexpected tensor argument" - - def unwrap_tensor(v): - if isinstance(v, WrappedTensor): - return v.tensor - if isinstance(v, debugger_constexpr): - return v.value - return v - - new_args = tuple(map(unwrap_tensor, args)) - new_kwargs = {k: unwrap_tensor(v) for k, v in kwargs.items()} - - result = func(args[0], *new_args[1:], **new_kwargs) - return WrappedTensor(result) if torch.is_tensor(result) else result - - return wrapper - - -class debugger_constexpr: - def __init__(self, value): - if isinstance(value, debugger_constexpr): - self.value = value.value - else: - self.value = value - - def __str__(self) -> str: - return "debugger_constexpr(" + str(self.value) + ")" - - def __index__(self) -> int: - return self.value - - def __bool__(self): - return bool(self.value) - - def __ge__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value >= other - - def __gt__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value > other - - def __le__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value <= other - - def __lt__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value < other - - def __eq__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value == other - - def __or__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value | other - - def __ror__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value | 
other - - def __and__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value & other - - def __rand__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value & other - - def to(self, dtype, bitcast=False, _builder=None): - if dtype in [torch.int64]: - ret_ty = int - elif dtype == torch.bool: - ret_ty = bool - elif dtype in [torch.float64]: - ret_ty = float - else: - raise ValueError("dtype not supported in debugger") - return debugger_constexpr(ret_ty(self.value)) - - -class WrappedTensor: - def __init__(self, tensor): - self.tensor = tensor - - def __index__(self) -> int: - return self.tensor.item() - - def __str__(self) -> str: - return "wrapped_" + str(self.tensor) - - def __bool__(self) -> bool: - return torch.all(self.tensor == True).item() # noqa: E712 - - @property - def dtype(self): - return self.tensor.dtype - - @_infer_tensor - @_tensor_operation - def __add__(self, other): - return torch.add(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __radd__(self, other): - return self.__add__(other) - - @_infer_tensor - @_tensor_operation - def __sub__(self, other): - return torch.sub(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rsub__(self, other): - return torch.sub(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __mul__(self, other): - return torch.mul(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rmul__(self, other): - return self.__mul__(other) - - @_infer_tensor - @_tensor_operation - def __truediv__(self, other): - return torch.div(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rtruediv__(self, other): - return torch.div(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __floordiv__(self, other): - return torch.floor_divide(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rfloordiv__(self, other): - return torch.floor_divide(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __mod__(self, other): - return torch.remainder(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rmod__(self, other): - return torch.remainder(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __neg__(self): - return -self.tensor - - @_infer_tensor - @_tensor_operation - def __invert__(self): - return ~self.tensor - - @_infer_tensor - @_tensor_operation - def __and__(self, other): - return torch.bitwise_and(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __or__(self, other): - return torch.bitwise_or(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __xor__(self, other): - return torch.bitwise_xor(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __lshift__(self, other): - return torch.bitwise_left_shift(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rshift__(self, other): - return torch.bitwise_right_shift(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __gt__(self, other): - return self.tensor > other - - @_infer_tensor - @_tensor_operation - def __rgt__(self, other): - return other > self.tensor - - @_infer_tensor - @_tensor_operation - def __ge__(self, other): - return self.tensor >= other - - @_infer_tensor - @_tensor_operation - def __rge__(self, other): - return other >= self.tensor - - @_infer_tensor - @_tensor_operation - def __lt__(self, other): - return self.tensor < other - - @_infer_tensor - @_tensor_operation - def 
__rlt__(self, other): - return other < self.tensor - - @_infer_tensor - @_tensor_operation - def __le__(self, other): - return self.tensor <= other - - @_infer_tensor - @_tensor_operation - def __rle__(self, other): - return other <= self.tensor - - @_infer_tensor - @_tensor_operation - def __eq__(self, other): - return torch.equal(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __ne__(self, other): - return not torch.equal(self.tensor, other) - - @_tensor_operation - def __getitem__(self, slices): - return self.tensor.__getitem__(slices) - # if isinstance(slices, slice): - # slices = [slices] - # src_shape = self.shape - # dst_shape = [] - # curr = 0 - # for sl in slices: - # if isinstance(sl, constexpr) and sl.value is None: - # dst_shape.append(1) - # elif sl == slice(None, None, None): - # dst_shape.append(src_shape[curr].value) - # curr += 1 - # ret = torch.reshape(self.tensor, dst_shape, ) - # return ret - - @_tensor_operation - def to(self, dtype, bitcast=False): - return self.tensor.to(dtype) - # if isinstance(bitcast, constexpr): - # bitcast = bitcast.value - # if bitcast: - # return semantic.bitcast(self, dtype, ) - # return semantic.cast(self, dtype, ) - - -def _constexpr_to_value(v): - if isinstance(v, debugger_constexpr): - return v.value - return v - - -class TritonLangProxy: - _memory_map: MemoryMap - _context: ExecutionContext - - def __init__(self, memory_map: MemoryMap, context: ExecutionContext): - self._memory_map = memory_map - self._context = context - - # Types - # Removed void, int1, float8, uint16, uint32, uint64, pi32_t - - # constexpr = debugger_constexpr - - # Program functions - - @_tensor_operation - def load( - self, - pointer: torch.Tensor, - mask: torch.Tensor = None, - other=0.0, - cache_modifier="", - eviction_policy="", - volatile=False, - ): - return self._memory_map.load(pointer, mask, other) - - @_tensor_operation - def store(self, pointer: torch.Tensor, value: torch.Tensor, mask=None): - return self._memory_map.store(pointer, value, mask) - - @_tensor_operation - def program_id(self, axis): - assert axis < len(self._context.program_id) - return torch.tensor([self._context.program_id[axis]], dtype=torch.int32, device="cuda") - - @_tensor_operation - def num_programs(self, axis): - assert axis < len(self._context.program_size) - return torch.tensor([self._context.program_size[axis]], dtype=torch.int32, device="cuda") - - @_tensor_operation - def arange(self, start, end): - return torch.arange(start=start, end=end, dtype=torch.int32, device="cuda") - - @_tensor_operation - def zeros(self, shape, dtype): - for i, d in enumerate(shape): - if not isinstance(d, debugger_constexpr): - raise TypeError(f"Shape element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - shape = [x.value for x in shape] - if isinstance(dtype, triton.language.core.dtype): - if dtype.is_fp32(): - dtype = torch.float32 - elif dtype.is_fp16(): - dtype = torch.float16 - elif dtype.is_bf16(): - dtype = torch.bfloat16 - elif dtype.is_int32(): - dtype = torch.int32 - elif dtype.is_int16(): - dtype = torch.int16 - elif dtype.is_int8(): - dtype = torch.int8 - else: - raise TypeError(f"Unsupported dtype {dtype}") - return torch.zeros(size=shape, dtype=dtype, device="cuda") - - @_tensor_operation - def dequantize(self, input, scale, shift, nbit, dst_ty=torch.float16): - raise NotImplementedError() - - @_tensor_operation - def broadcast(self, input, 
other): - raise NotImplementedError() - - @_tensor_operation - def broadcast_to(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def cat(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def reshape(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def dot(self, input, other, trans_a=False, trans_b=False, allow_tf32=True): - assert input.dtype == other.dtype - if trans_a: - input = input.T - if trans_b: - other = other.T - return torch.matmul(input=input, other=other) - - @_tensor_operation - def atomic_cas(self, pointer, cmp, val): - stored = self._memory_map.load(pointer, None, 0.0) - if not isinstance(cmp, torch.Tensor): - cmp = torch.tensor([cmp], dtype=stored.dtype, device="cuda") - if not isinstance(val, torch.Tensor): - val = torch.tensor([val], dtype=stored.dtype, device="cuda") - if stored == cmp: - self._memory_map.store(pointer, val, None) - return stored - - @_tensor_operation - def atomic_xchg(self, pointer, val, mask=None): - if isinstance(val, int): - val = torch.tensor([val], dtype=torch.int32, device="cuda") - stored = self._memory_map.load(pointer, mask, 0.0) - self._memory_map.store(pointer, val, mask) - return stored - - @_tensor_operation - def atomic_add(self, pointer, val, mask=None): - # arbitrary other value as it will masked during storing - stored = self._memory_map.load(pointer, mask, 0.0) - result = stored + val - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_max(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0.0) - result = torch.maximum(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_min(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0.0) - result = torch.minimum(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_and(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_and(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_or(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_or(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_xor(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_xor(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def where(self, condition, x, y): - condition = _primitive_to_tensor(condition) - x = _primitive_to_tensor(x) - y = _primitive_to_tensor(y) - return torch.where(condition, x, y) - - @_tensor_operation - def umulhi(self, x, y): - raise NotImplementedError() - - @_tensor_operation - def fdiv(self, x, y, ieee_rounding=False): - raise NotImplementedError() - - @_tensor_operation - def exp(self, x): - return torch.exp(x) - - @_tensor_operation - def log(self, x): - return torch.log(x) - - @_tensor_operation - def cos(self, x): - return torch.cos(x) - - @_tensor_operation - def sin(self, x): - return torch.sin(x) - - @_tensor_operation - def sqrt(self, x): - return torch.sqrt(x) - - @_tensor_operation - def globaltimer(self): - raise NotImplementedError() - - @_tensor_operation - def clock(self): - raise NotImplementedError() - - @_tensor_operation - def debug_barrier(self): - 
raise NotImplementedError() - - @_tensor_operation - def multiple_of(self, input, values): - return input - - @_tensor_operation - def max_contiguous(self, input, values): - return input - - @_tensor_operation - def abs(self, x): - return torch.abs(x) - - @_tensor_operation - def cdiv(self, x, div): - return (x + div - 1) // div - - @_tensor_operation - def minimum(self, x, y): - if isinstance(x, int): - x = torch.tensor(x, device="cuda") - if isinstance(y, int): - y = torch.tensor(y, device="cuda") - return torch.minimum(x, y) - - @_tensor_operation - def maximum(self, x, y): - return torch.maximum(x, y) - - @_tensor_operation - def sigmoid(self, x): - raise NotImplementedError() - - @_tensor_operation - def softmax(self, x, ieee_rounding=False): - raise NotImplementedError() - - @_tensor_operation - def ravel(self, x): - raise NotImplementedError() - - @_tensor_operation - def swizzle2d(self, i, j, size_i, size_j, size_g): - raise NotImplementedError() - - @_tensor_operation - def zeros_like(self, input): - raise NotImplementedError() - - @_tensor_operation - def max(self, input, axis=None): - if axis is None: - return torch.max(input) - return torch.max(input, dim=axis).values - - @_tensor_operation - def argmax(self, input, axis): - raise NotImplementedError() - - @_tensor_operation - def min(self, input, axis=None): - if axis is None: - return torch.min(input) - return torch.min(input, dim=axis).values - - @_tensor_operation - def argmin(self, input, axis): - raise NotImplementedError() - - @_tensor_operation - def sum(self, input, axis=None): - if axis is None: - return torch.sum(input) - return torch.sum(input, dim=axis) - - @_tensor_operation - def xor_sum(self, input, axis): - raise NotImplementedError() diff --git a/python/triton/debugger/torch_wrapper.py b/python/triton/debugger/torch_wrapper.py deleted file mode 100644 index 44aa17eb1355..000000000000 --- a/python/triton/debugger/torch_wrapper.py +++ /dev/null @@ -1,18 +0,0 @@ -try: - import torch as _torch -except ImportError: - _torch = None - - -class TorchWrapper: - """ - Helps in making torch an optional dependency - """ - - def __getattr__(self, name): - if _torch is None: - raise ImportError("Triton requires PyTorch to be installed") - return getattr(_torch, name) - - -torch = TorchWrapper() diff --git a/python/triton/language/__init__.py b/python/triton/language/__init__.py deleted file mode 100644 index 7485f374b9e9..000000000000 --- a/python/triton/language/__init__.py +++ /dev/null @@ -1,201 +0,0 @@ -"""isort:skip_file""" -# Import order is significant here. - -from . import math -from . 
import extra -from .standard import ( - cdiv, - sigmoid, - softmax, - ravel, - swizzle2d, - zeros, - zeros_like, -) -from .core import ( - abs, - advance, - arange, - argmin, - argmax, - atomic_add, - atomic_and, - atomic_cas, - atomic_max, - atomic_min, - atomic_or, - atomic_xchg, - atomic_xor, - bfloat16, - block_type, - broadcast, - broadcast_to, - cat, - constexpr, - cos, - debug_barrier, - device_assert, - device_print, - dot, - dtype, - exp, - expand_dims, - full, - fdiv, - float16, - float32, - float64, - float8e4, - float8e5, - function_type, - int1, - int16, - int32, - int64, - int8, - load, - log, - make_block_ptr, - max, - max_contiguous, - maximum, - min, - minimum, - multiple_of, - num_programs, - pi32_t, - pointer_type, - program_id, - reduce, - reshape, - sin, - sqrt, - static_assert, - static_print, - store, - sum, - static_range, - tensor, - trans, - triton, - uint16, - uint32, - uint64, - uint8, - umulhi, - view, - void, - where, - xor_sum, -) -from .random import ( - pair_uniform_to_normal, - philox, - philox_impl, - rand, - rand4x, - randint, - randint4x, - randn, - randn4x, - uint32_to_uniform_float, -) - - -__all__ = [ - "abs", - "advance", - "arange", - "argmin", - "argmax", - "atomic_add", - "atomic_and", - "atomic_cas", - "atomic_max", - "atomic_min", - "atomic_or", - "atomic_xchg", - "atomic_xor", - "bfloat16", - "block_type", - "broadcast", - "broadcast_to", - "builtin", - "cat", - "cdiv", - "constexpr", - "cos", - "debug_barrier", - "device_assert", - "device_print", - "dot", - "dtype", - "exp", - "expand_dims", - "extra", - "fdiv", - "float16", - "float32", - "float64", - "float8e4", - "float8e5", - "full", - "function_type", - "int1", - "int16", - "int32", - "int64", - "int8", - "ir", - "math", - "load", - "log", - "make_block_ptr", - "max", - "max_contiguous", - "maximum", - "min", - "minimum", - "multiple_of", - "num_programs", - "pair_uniform_to_normal", - "philox", - "philox_impl", - "pi32_t", - "pointer_type", - "program_id", - "rand", - "rand4x", - "randint", - "randint4x", - "randn", - "randn4x", - "ravel", - "reduce", - "reshape", - "sigmoid", - "sin", - "softmax", - "sqrt", - "static_range", - "static_assert", - "static_print", - "store", - "sum", - "swizzle2d", - "tensor", - "trans", - "triton", - "uint16", - "uint32", - "uint32_to_uniform_float", - "uint64", - "uint8", - "umulhi", - "view", - "void", - "where", - "xor_sum", - "zeros", - "zeros_like", -] diff --git a/python/triton/language/core.py b/python/triton/language/core.py deleted file mode 100644 index a3c4609961f0..000000000000 --- a/python/triton/language/core.py +++ /dev/null @@ -1,1702 +0,0 @@ -from __future__ import annotations - -from contextlib import contextmanager -from enum import Enum -from functools import wraps -from typing import Callable, List, Sequence, TypeVar - -import triton -from . import semantic -from triton._C.libtriton.triton import ir - -T = TypeVar('T') - -TRITON_MAX_TENSOR_NUMEL = 131072 - -TRITON_BUILTIN = "__triton_builtin__" - - -def builtin(fn: T) -> T: - """Mark a function as a builtin.""" - assert callable(fn) - - @wraps(fn) - def wrapper(*args, **kwargs): - if "_builder" not in kwargs or kwargs["_builder"] is None: - raise ValueError( - "Did you forget to add @triton.jit ? 
" - "(`_builder` argument must be provided outside of JIT functions.)" - ) - return fn(*args, **kwargs) - - setattr(wrapper, TRITON_BUILTIN, True) - - return wrapper - - -def is_builtin(fn) -> bool: - """Is this a registered triton builtin function?""" - return getattr(fn, TRITON_BUILTIN, False) - - -def _to_tensor(x, builder): - if isinstance(x, bool): - return tensor(builder.get_int1(x), int1) - # Note: compile-time const integers are represented by unsigned values - elif isinstance(x, int): - if -2**31 <= x < 2**31: - return tensor(builder.get_int32(x), int32) - elif 2**31 <= x < 2**32: - return tensor(builder.get_int32(x), uint32) - elif -2**63 <= x < 2**63: - return tensor(builder.get_int64(x), int64) - elif 2**63 <= x < 2**64: - return tensor(builder.get_int64(x), uint64) - else: - raise RuntimeError(f'Nonrepresentable integer {x}.') - elif isinstance(x, float): - min_float32 = 2 ** -126 - max_float32 = (2 - 2**-23) * 2**127 - abs_x = __builtins__['abs'](x) - if abs_x == float("inf") or\ - abs_x == 0.0 or \ - x != x or \ - min_float32 <= abs_x <= max_float32: - return tensor(builder.get_fp32(x), float32) - else: - return tensor(builder.get_fp64(x), float64) - - elif isinstance(x, constexpr): - return _to_tensor(x.value, builder) - elif isinstance(x, tensor): - return x - assert False, f"cannot convert {x} of type {type(x)} to tensor" - - -class dtype: - SINT_TYPES = ['int8', 'int16', 'int32', 'int64'] - UINT_TYPES = ['int1', 'uint8', 'uint16', 'uint32', 'uint64'] - FP_TYPES = ['fp8e4', 'fp8e5', 'fp16', 'bf16', 'fp32', 'fp64'] - STANDARD_FP_TYPES = ['fp16', 'bf16', 'fp32', 'fp64'] - OTHER_TYPES = ['void'] - - class SIGNEDNESS(Enum): - SIGNED = 0 - UNSIGNED = 1 - - def __init__(self, name): - self.name = name - assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name - if name in dtype.SINT_TYPES: - self.int_signedness = dtype.SIGNEDNESS.SIGNED - self.int_bitwidth = int(name.split('int')[-1]) - self.primitive_bitwidth = self.int_bitwidth - elif name in dtype.UINT_TYPES: - self.int_signedness = dtype.SIGNEDNESS.UNSIGNED - self.int_bitwidth = int(name.split('int')[-1]) - self.primitive_bitwidth = self.int_bitwidth - elif name in dtype.FP_TYPES: - if name == 'fp8e4': - self.fp_mantissa_width = 3 - self.primitive_bitwidth = 8 - elif name == 'fp8e5': - self.fp_mantissa_width = 2 - self.primitive_bitwidth = 8 - elif name == 'fp16': - self.fp_mantissa_width = 10 - self.primitive_bitwidth = 16 - elif name == 'bf16': - self.fp_mantissa_width = 7 - self.primitive_bitwidth = 16 - elif name == 'fp32': - self.fp_mantissa_width = 23 - self.primitive_bitwidth = 32 - elif name == 'fp64': - self.fp_mantissa_width = 53 - self.primitive_bitwidth = 64 - else: - raise RuntimeError(f'Unsupported floating-point type {name}') - elif name == 'void': - self.primitive_bitwidth = 0 - - def is_fp8(self): - return 'fp8' in self.name - - def is_fp16(self): - return self.name == 'fp16' - - def is_bf16(self): - return self.name == 'bf16' - - def is_fp32(self): - return self.name == 'fp32' - - def is_fp64(self): - return self.name == 'fp64' - - def is_int1(self): - return self.name == 'int1' - - def is_int8(self): - return self.name == 'int8' - - def is_int16(self): - return self.name == 'int16' - - def is_int32(self): - return self.name == 'int32' - - def is_int64(self): - return self.name == 'int64' - - def is_uint8(self): - return self.name == 'uint8' - - def is_uint16(self): - return self.name == 'uint16' - - def is_uint32(self): - return self.name == 'uint32' - - def 
is_uint64(self): - return self.name == 'uint64' - - def is_floating(self): - return self.name in dtype.FP_TYPES - - def is_standard_floating(self): - return self.name in dtype.STANDARD_FP_TYPES - - def is_int_signed(self): - return self.name in dtype.SINT_TYPES - - def is_int_unsigned(self): - return self.name in dtype.UINT_TYPES - - def is_int(self): - return self.name in dtype.SINT_TYPES + dtype.UINT_TYPES - - def is_bool(self): - return self.is_int1() - - @staticmethod - def is_void(): - raise RuntimeError("Not implemented") - - @staticmethod - def is_block(): - return False - - @staticmethod - def is_ptr(): - return False - - def __eq__(self, other: dtype): - if not isinstance(other, dtype): - return False - return self.name == other.name - - def __ne__(self, other: dtype): - return not self.__eq__(other) - - def __hash__(self): - return hash((self.name,)) - - @property - def scalar(self): - return self - - def to_ir(self, builder: ir.builder) -> ir.type: - if self.name == 'void': - return builder.get_void_ty() - elif self.name == 'int1': - return builder.get_int1_ty() - elif self.name in ('int8', 'uint8'): - return builder.get_int8_ty() - elif self.name in ('int16', 'uint16'): - return builder.get_int16_ty() - elif self.name in ('int32', 'uint32'): - return builder.get_int32_ty() - elif self.name in ('int64', 'uint64'): - return builder.get_int64_ty() - elif self.name == 'fp8e5': - return builder.get_fp8e5_ty() - elif self.name == 'fp8e4': - return builder.get_fp8e4_ty() - elif self.name == 'fp16': - return builder.get_half_ty() - elif self.name == 'bf16': - return builder.get_bf16_ty() - elif self.name == 'fp32': - return builder.get_float_ty() - elif self.name == 'fp64': - return builder.get_double_ty() - raise ValueError(f'fail to convert {self} to ir type') - - def __str__(self): - return self.name - - @property - def cache_key_part(self) -> str: - """See cache_key_part() in triton.cc.""" - return self.name - - def __repr__(self): - return f'triton.language.{self.name}' - - -class pointer_type(dtype): - def __init__(self, element_ty: dtype, address_space: int = 1): - if not isinstance(element_ty, dtype): - raise TypeError('element_ty is a {type(element_ty).__name__}.') - self.element_ty = element_ty - self.address_space = address_space - - self.name = self.__str__() - - def to_ir(self, builder: ir.builder) -> ir.pointer_type: - return builder.get_ptr_ty(self.element_ty.to_ir(builder), 1) - - def __str__(self): - return f'pointer<{self.element_ty}>' - - def __repr__(self): - return self.__str__() - - def is_ptr(self): - return True - - def __eq__(self, other: pointer_type) -> bool: - if not isinstance(other, pointer_type): - return False - return self.element_ty == other.element_ty and self.address_space == other.address_space - - def __ne__(self, other: pointer_type) -> bool: - return not self.__eq__(other) - - @property - def scalar(self): - return self - - -class block_type(dtype): - def __init__(self, element_ty: dtype, shape: List): - self.element_ty = element_ty - - # Note that block_type's shape is a list of int - # while tensor's shape is a list of constexpr. - - # shape can be empty ([]) when an input is a 0D tensor. 
- if not shape: - raise TypeError('0d block_type is forbidden') - if isinstance(shape[0], constexpr): - shape = [s.value for s in shape] - - self.shape = shape - self.numel = 1 - for s in self.shape: - self.numel *= s - if self.numel > TRITON_MAX_TENSOR_NUMEL: - raise ValueError(f"numel ({self.numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})") - - self.name = self.__str__() - - def to_ir(self, builder: ir.builder) -> ir.block_type: - return builder.get_block_ty(self.element_ty.to_ir(builder), self.shape) - - def __str__(self): - return f'<{self.shape}, {self.element_ty}>' - - def __repr__(self): - return self.__str__() - - def is_block(self): - return True - - def get_block_shapes(self) -> List[int]: - return self.shape - - def __eq__(self, other: block_type) -> bool: - if not isinstance(other, block_type): - return False - return self.element_ty == other.element_ty and self.shape == other.shape - - def __ne__(self, other: block_type) -> bool: - return not self.__eq__(other) - - @property - def scalar(self): - return self.element_ty - - -class function_type(dtype): - def __init__(self, ret_types: List[dtype], param_types: List[dtype]) -> None: - self.ret_types = ret_types - self.param_types = param_types - - def __str__(self): - return f'fn ({self.param_types}) -> {self.ret_types}' - - def to_ir(self, builder: ir.builder): - ir_param_types = [ty.to_ir(builder) for ty in self.param_types] - ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types] - return builder.get_function_ty(ir_param_types, ret_types) - - -# scalar types -void = dtype('void') -int1 = dtype('int1') -int8 = dtype('int8') -int16 = dtype('int16') -int32 = dtype('int32') -int64 = dtype('int64') -uint8 = dtype('uint8') -uint16 = dtype('uint16') -uint32 = dtype('uint32') -uint64 = dtype('uint64') -float8e5 = dtype('fp8e5') -float8e4 = dtype('fp8e4') -float16 = dtype('fp16') -bfloat16 = dtype('bf16') -float32 = dtype('fp32') -float64 = dtype('fp64') -# pointer types -pi32_t = pointer_type(int32) - -# ----------------------- -# constexpr -# ----------------------- - - -class constexpr: - """ - This class is used to store a value that is known at compile-time. 
- """ - - def __init__(self, value): - if isinstance(value, constexpr): - self.value = value.value - else: - self.value = value - - def __repr__(self) -> str: - return f"constexpr[{self.value}]" - - def __add__(self, other): - return constexpr(self.value + other.value) - - def __radd__(self, other): - return constexpr(other.value + self.value) - - def __sub__(self, other): - return constexpr(self.value - other.value) - - def __rsub__(self, other): - return constexpr(other.value - self.value) - - def __mul__(self, other): - return constexpr(self.value * other.value) - - def __mod__(self, other): - return constexpr(self.value % other.value) - - def __rmul__(self, other): - return constexpr(other.value * self.value) - - def __truediv__(self, other): - return constexpr(self.value / other.value) - - def __rtruediv__(self, other): - return constexpr(other.value / self.value) - - def __floordiv__(self, other): - return constexpr(self.value // other.value) - - def __rfloordiv__(self, other): - return constexpr(other.value // self.value) - - def __gt__(self, other): - return constexpr(self.value > other.value) - - def __rgt__(self, other): - return constexpr(other.value > self.value) - - def __ge__(self, other): - return constexpr(self.value >= other.value) - - def __rge__(self, other): - return constexpr(other.value >= self.value) - - def __lt__(self, other): - return constexpr(self.value < other.value) - - def __rlt__(self, other): - return constexpr(other.value < self.value) - - def __le__(self, other): - return constexpr(self.value <= other.value) - - def __rle__(self, other): - return constexpr(other.value <= self.value) - - def __eq__(self, other): - return constexpr(self.value == other.value) - - def __ne__(self, other): - return constexpr(self.value != other.value) - - def __bool__(self): - return bool(self.value) - - def __neg__(self): - return constexpr(-self.value) - - def __and__(self, other): - return constexpr(self.value & other.value) - - def logical_and(self, other): - return constexpr(self.value and other.value) - - def __or__(self, other): - return constexpr(self.value | other.value) - - def __xor__(self, other): - return constexpr(self.value ^ other.value) - - def logical_or(self, other): - return constexpr(self.value or other.value) - - def __pos__(self): - return constexpr(+self.value) - - def __invert__(self): - return constexpr(~self.value) - - def __pow__(self, other): - return constexpr(self.value ** other.value) - - def __rshift__(self, other): - return constexpr(self.value >> other.value) - - def __lshift__(self, other): - return constexpr(self.value << other.value) - - def __not__(self): - return constexpr(not self.value) - - def __call__(self, *args, **kwds): - return self.value(*args, **kwds) - - -class tensor: - def __init__(self, handle, type: dtype): - # IR handle - self.handle = handle - # Block shape - self.shape = (1, ) - if type.is_block(): - self.shape = type.shape - self.numel = 1 - for s in self.shape: - self.numel *= s - self.numel = constexpr(self.numel) - self.type = type # Tensor type (can be block_type) - # Following the practice in pytorch, dtype is scalar type - self.dtype = type.scalar - self.shape = [constexpr(s) for s in self.shape] - - def __str__(self) -> str: - # ex. 
"float32[3,4]" - return str(self.dtype) + '[' + ','.join(str(s) for s in self.shape) + ']' - - @builtin - def __add__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.add(self, other, _builder) - - def __radd__(self, other, _builder=None): - return self.__add__(other, _builder=_builder) - - @builtin - def __sub__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.sub(self, other, _builder) - - def __rsub__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.sub(other, self, _builder) - - @builtin - def __mul__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mul(self, other, _builder) - - def __rmul__(self, other, _builder=None): - return self.__mul__(other, _builder=_builder) - - @builtin - def __truediv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.truediv(self, other, _builder) - - def __rtruediv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.truediv(other, self, _builder) - - @builtin - def __floordiv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.floordiv(self, other, _builder) - - @builtin - def __rfloordiv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.floordiv(other, self, _builder) - - @builtin - def __mod__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mod(self, other, _builder) - - @builtin - def __rmod__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mod(other, self, _builder) - - # unary operators - @builtin - def __neg__(self, _builder=None): - return semantic.minus(self, _builder) - - @builtin - def __invert__(self, _builder=None): - return semantic.invert(self, _builder) - - # bitwise operators - - @builtin - def __and__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.and_(self, other, _builder) - - @builtin - def __rand__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.and_(other, self, _builder) - - @builtin - def __or__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.or_(self, other, _builder) - - @builtin - def __ror__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.or_(other, self, _builder) - - @builtin - def __xor__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.xor_(self, other, _builder) - - @builtin - def __rxor__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.xor_(other, self, _builder) - - @builtin - def __lshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.shl(self, other, _builder) - - @builtin - def __rlshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.shl(other, self, _builder) - - @builtin - def __rshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - if self.dtype.is_int_signed(): - return semantic.ashr(self, other, _builder) - else: - return semantic.lshr(self, other, _builder) - - @builtin - def __rrshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - if self.dtype.is_int_signed(): - return semantic.ashr(other, self, _builder) - else: - return semantic.lshr(other, self, _builder) - - # comparison operators - - # > - @builtin - def 
__gt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_than(self, other, _builder) - - @builtin - def __rgt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_than(other, self, _builder) - - # >= - @builtin - def __ge__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_equal(self, other, _builder) - - @builtin - def __rge__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_equal(other, self, _builder) - - # < - @builtin - def __lt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_than(self, other, _builder) - - @builtin - def __rlt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_than(other, self, _builder) - - # <= - @builtin - def __le__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_equal(self, other, _builder) - - @builtin - def __rle__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_equal(other, self, _builder) - - # == - @builtin - def __eq__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.equal(self, other, _builder) - - @builtin - def __ne__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.not_equal(self, other, _builder) - - @builtin - def logical_and(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.logical_and(self, other, _builder) - - @builtin - def logical_or(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.logical_or(self, other, _builder) - - # note: __not__ isn't actually a magic method in python - # but it's ok because our ASTVisitor handles it - @builtin - def __not__(self, _builder=None): - return semantic.not_(self, _builder) - - @builtin - def __getitem__(self, slices, _builder=None): - if isinstance(slices, slice): - slices = [slices] - ret = self - for dim, sl in enumerate(slices): - if isinstance(sl, constexpr) and sl.value is None: - ret = semantic.expand_dims(ret, dim, _builder) - elif sl == slice(None, None, None): - pass - else: - assert False, f"unsupported tensor index: {sl}" - return ret - - @property - def T(self): - assert False, "Transposition must be created by the AST Visitor" - - @builtin - def to(self, dtype, bitcast=False, _builder=None): - if isinstance(bitcast, constexpr): - bitcast = bitcast.value - if bitcast: - return semantic.bitcast(self, dtype, _builder) - return semantic.cast(self, dtype, _builder) - - -# ----------------------- -# SPMD Programming Model -# ----------------------- -def _constexpr_to_value(v): - if isinstance(v, constexpr): - return v.value - return v - - -@builtin -def program_id(axis, _builder=None): - """ - Returns the id of the current program instance along the given :code:`axis`. - - :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. - :type axis: int - """ - # if axis == -1: - # pid0 = program_id(0, _builder) - # pid1 = program_id(1, _builder) - # pid2 = program_id(2, _builder) - # npg0 = num_programs(0, _builder) - # npg1 = num_programs(0, _builder) - # return pid0 + pid1*npg0 + pid2*npg0*npg1 - axis = _constexpr_to_value(axis) - return semantic.program_id(axis, _builder) - - -@builtin -def num_programs(axis, _builder=None): - """ - Returns the number of program instances launched along the given :code:`axis`. 
- - :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. - :type axis: int - """ - axis = _constexpr_to_value(axis) - return semantic.num_programs(axis, _builder) - - -# ----------------------- -# Block Initialization -# ----------------------- - - -@builtin -def arange(start, end, _builder=None): - """ - Returns contiguous values within the left-closed and right-open interval [:code:`start`, :code:`end`). \ - End - Start must be less than or equal to TRITON_MAX_TENSOR_NUMEL = 131072 - - :param start: Start of the interval. Must be a power of two. - :type start: int32 - :param end: End of the interval. Must be a power of two > start. - :type end: int32 - """ - start = _constexpr_to_value(start) - end = _constexpr_to_value(end) - return semantic.arange(start, end, _builder) - - -def _shape_check_impl(shape): - shape = _constexpr_to_value(shape) - for i, d in enumerate(shape): - if not isinstance(d, constexpr): - raise TypeError(f"Shape element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - return [_constexpr_to_value(x) for x in shape] - - -@builtin -def full(shape, value, dtype, _builder=None): - """ - Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`. - - :param shape: Shape of the new array, e.g., (8, 16) or (8, ) - :value value: A scalar value to fill the array with - :type shape: tuple of ints - :param dtype: Data-type of the new array, e.g., :code:`tl.float16` - :type dtype: DType - """ - shape = _shape_check_impl(shape) - value = _constexpr_to_value(value) - dtype = _constexpr_to_value(dtype) - return semantic.full(shape, value, dtype, _builder) - - -# ----------------------- -# Shape Manipulation -# ----------------------- - - -@builtin -def broadcast(input, other, _builder=None): - """ - Tries to broadcast the two given blocks to a common compatible shape. - - :param input: The first input tensor. - :type input: Block - :param other: The second input tensor. - :type other: Block - """ - return semantic.broadcast_impl_value(input, other, _builder) - - -@builtin -def broadcast_to(input, shape, _builder=None): - """ - Tries to broadcast the given tensor to a new :code:`shape`. - - :param input: The input tensor. - :type input: Block - :param shape: The desired shape. - :type shape: Tuple[int] - """ - shape = _shape_check_impl(shape) - return semantic.broadcast_impl_shape(input, shape, _builder) - - -@builtin -def trans(input, _builder=None): - return semantic.trans(input, _builder) - - -@builtin -def cat(input, other, can_reorder=False, _builder=None): - """ - Concatenate the given blocks - - :param input: The first input tensor. - :type input: - :param other: The second input tensor. - :type other: - :param reorder: Compiler hint. If true, the compiler is - allowed to reorder elements while concatenating inputs. - Only use if the order does not matter (e.g., result is - only used in reduction ops) - """ - return semantic.cat(input, other, can_reorder, _builder) - - -@builtin -def view(input, shape, _builder=None): - """ - Returns a tensor with the same elements as `input` but a different shape. - The order of the elements may not be preserved. - - :param input: The input tensor. - :type input: - :param shape: The desired shape. 
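# A minimal sketch of how the block-initialization and broadcasting helpers above
# (`arange`, length-1 axes via `[None, :]` / `[:, None]` indexing, implicit
# broadcasting) are typically combined to address a 2D tile. Kernel, tensor and
# stride names and the sizes are illustrative only, not part of this file; assumes
# the packaged `triton`/`torch` APIs and a CUDA device. (`tl.load`/`tl.store` are
# documented further below.)
import torch
import triton
import triton.language as tl


@triton.jit
def copy2d_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)      # [BLOCK_M] row indices
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)      # [BLOCK_N] col indices
    # `[:, None]` / `[None, :]` insert length-1 axes; broadcasting then yields
    # [BLOCK_M, BLOCK_N] blocks of offsets, masks and pointers.
    offs = offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tile = tl.load(src_ptr + offs, mask=mask, other=0.0)
    tl.store(dst_ptr + offs, tile, mask=mask)


x = torch.randn(100, 70, device="cuda")
y = torch.empty_like(x)
grid = (triton.cdiv(x.shape[0], 32), triton.cdiv(x.shape[1], 32))
copy2d_kernel[grid](x, y, x.shape[0], x.shape[1], x.stride(0), x.stride(1),
                    BLOCK_M=32, BLOCK_N=32)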
- :type shape: Tuple[int] - - """ - shape = _shape_check_impl(shape) - return semantic.view(input, shape, _builder) - - -@builtin -def reshape(input, shape, _builder=None): - shape = _shape_check_impl(shape) - return semantic.reshape(input, shape, _builder) - - -def _wrap_axis(axis, ndim): - if not (-ndim <= axis < ndim): - raise ValueError(f"invalid axis {axis}. Expected {-ndim} <= axis < {ndim}") - - return axis if axis >= 0 else axis + ndim - - -@builtin -def expand_dims(input, axis, _builder=None): - """ - Expand the shape of a tensor, by inserting new length-1 dimensions. - - Axis indices are with respect to the resulting tensor, so - ``result.shape[axis]`` will be 1 for each axis. - - :param input: The input tensor. - :type input: tl.tensor - :param axis: The indices to add new axes - :type axis: int | Sequence[int] - - """ - axis = _constexpr_to_value(axis) - axes = list(axis) if isinstance(axis, Sequence) else [axis] - new_ndim = len(input.shape) + len(axes) - axes = [_wrap_axis(_constexpr_to_value(d), new_ndim) for d in axes] - - if len(set(axes)) != len(axes): - raise ValueError(f"expand_dims recieved duplicate axes, normalized axes = {axes}") - - ret = input - for a in sorted(axes): - ret = semantic.expand_dims(ret, a, _builder) - return ret - -# ----------------------- -# Linear Algebra -# ----------------------- - - -@builtin -def dot(input, other, allow_tf32=True, out_dtype=float32, _builder=None): - """ - Returns the matrix product of two blocks. - - The two blocks must be two-dimensional and have compatible inner dimensions. - - :param input: The first tensor to be multiplied. - :type input: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} - :param other: The second tensor to be multiplied. - :type other: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} - """ - allow_tf32 = _constexpr_to_value(allow_tf32) - out_dtype = _constexpr_to_value(out_dtype) - return semantic.dot(input, other, allow_tf32, out_dtype, _builder) - - -# ----------------------- -# Non-Atomic Memory Operations -# ----------------------- - - -@builtin -def load(pointer, mask=None, other=None, boundary_check=tuple(), padding_option="", cache_modifier="", - eviction_policy="", volatile=False, _builder=None): - """ - Return a tensor of data whose values are loaded from memory at location defined by `pointer`: - (1) `pointer` could be a single element pointer, then a scalar will be loaded - - `mask` and `other` must be scalar too - - `other` is implicitly typecast to `pointer.dtype.element_ty` - - `boundary_check` and `padding_option` must be empty - (2) `pointer` could be element-wise tensor of pointers, in which case: - - `mask` and `other` are implicitly broadcast to `pointer.shape` - - `other` is implicitly typecast to `pointer.dtype.element_ty` - - `boundary_check` and `padding_option` must be empty - (3) `pointer` could be a block pointer defined by `make_block_ptr`, in which case: - - `mask` and `other` must be None - - `boundary_check` and `padding_option` can be specified to control the behavior of out-of-bound access - - :param pointer: Pointer to the data to be loaded - :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType` - :param mask: if `mask[idx]` is false, do not load the data at address `pointer[idx]` - (must be `None` with block pointers) - :type mask: Block of `triton.int1`, optional - :param other: if `mask[idx]` is false, return `other[idx]` - :type other: Block, optional - :param boundary_check: tuple 
of integers, indicating the dimensions which should do the boundary check - :type boundary_check: tuple of ints, optional - :param padding_option: should be one of {"", "zero", "nan"}, do padding while out of bound - :param cache_modifier: changes cache option in NVIDIA PTX - :type cache_modifier: str, optional - :param eviction_policy: changes eviction policy in NVIDIA PTX - :type eviction_policy: str, optional - :param volatile: changes volatile option in NVIDIA PTX - :type volatile: bool, optional - """ - # `mask` and `other` can be constexpr - if _constexpr_to_value(mask) is not None: - mask = _to_tensor(mask, _builder) - if _constexpr_to_value(other) is not None: - other = _to_tensor(other, _builder) - padding_option = _constexpr_to_value(padding_option) - cache_modifier = _constexpr_to_value(cache_modifier) - eviction_policy = _constexpr_to_value(eviction_policy) - volatile = _constexpr_to_value(volatile) - return semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy, - volatile, _builder) - - -@builtin -def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="", _builder=None): - """ - Store a tensor of data into memory locations defined by `pointer`: - (1) `pointer` could be a single element pointer, then a scalar will be stored - - `mask` must be scalar too - - `boundary_check` and `padding_option` must be empty - (2) `pointer` could be element-wise tensor of pointers, in which case: - - `mask` is implicitly broadcast to `pointer.shape` - - `boundary_check` must be empty - (3) or `pointer` could be a block pointer defined by `make_block_ptr`, in which case: - - `mask` must be None - - `boundary_check` can be specified to control the behavior of out-of-bound access - `value` is implicitly broadcast to `pointer.shape` and typecast to `pointer.dtype.element_ty`. 
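# A minimal sketch of the `load`/`store` contract documented above (case (2): an
# element-wise tensor of pointers with a mask and an `other` fallback). The kernel
# and tensor names are illustrative only; assumes the packaged `triton`/`torch`
# APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)                   # this instance's slot in the 1D grid
    offsets = pid * BLOCK + tl.arange(0, BLOCK)   # element indices handled here
    mask = offsets < n_elements                   # guard the ragged last block
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)   # masked-off lanes read 0.0
    y = tl.load(y_ptr + offsets, mask=mask, other=0.0)
    tl.store(out_ptr + offsets, x + y, mask=mask)        # masked-off lanes store nothing


x = torch.randn(12_345, device="cuda")
y = torch.randn_like(x)
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)
add_kernel[grid](x, y, out, x.numel(), BLOCK=1024)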
- - :param pointer: The memory location where the elements of `value` are stored - :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType` - :param value: The tensor of elements to be stored - :type value: Block - :param mask: If `mask[idx]` is false, do not store `value[idx]` at `pointer[idx]` - :type mask: Block of triton.int1, optional - :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check - :type boundary_check: tuple of ints, optional - :param cache_modifier: changes cache option in NVIDIA PTX - :type cache_modifier: str, optional - :param eviction_policy: changes eviction policy in NVIDIA PTX - :type eviction_policy: str, optional - """ - # `value` can be constexpr - value = _to_tensor(value, _builder) - if _constexpr_to_value(mask) is not None: - mask = _to_tensor(mask, _builder) - cache_modifier = _constexpr_to_value(cache_modifier) - eviction_policy = _constexpr_to_value(eviction_policy) - return semantic.store(pointer, value, mask, boundary_check, cache_modifier, eviction_policy, _builder) - - -@builtin -def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _builder=None): - """ - Returns a pointer to a block in a parent tensor - - :param base: The base pointer to the parent tensor - :param shape: The shape of the parent tensor - :param strides: The strides of the parent tensor - :param offsets: The offsets to the block - :param block_shape: The shape of the block - :param order: The order of the original data format - """ - return semantic.make_block_ptr(base, shape, strides, offsets, block_shape, order, _builder) - - -@builtin -def advance(base: tensor, offsets, _builder=None): - """ - Advance a block pointer - - :param base: the block pointer to advance - :param offsets: the offsets to advance, a tuple by dimension - """ - return semantic.advance(base, offsets, _builder) - -# ----------------------- -# Atomic Memory Operations -# ----------------------- - - -def _add_atomic_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Performs an atomic {name} at the memory location specified by :code:`pointer`. - - Return the data stored at :code:`pointer` before the atomic operation. - - :param pointer: The memory locations to compare-and-swap. - :type pointer: Block of dtype=triton.PointerDType - :param cmp: The values expected to be found in the atomic object - :type cmp: Block of dtype=`pointer.dtype.element_ty` - :param val: The values to copy in case the expected value matches the contained value. 
- :type val: Block of dtype=`pointer.dtype.element_ty` - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@builtin -@_add_atomic_docstr("compare-and-swap") -def atomic_cas(pointer, cmp, val, _builder=None): - cmp = _to_tensor(cmp, _builder) - val = _to_tensor(val, _builder) - return semantic.atomic_cas(pointer, cmp, val, _builder) - - -@builtin -@_add_atomic_docstr("exchange") -def atomic_xchg(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_xchg(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("add") -def atomic_add(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_add(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("max") -def atomic_max(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_max(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("min") -def atomic_min(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_min(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical and") -def atomic_and(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_and(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical or") -def atomic_or(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_or(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical xor") -def atomic_xor(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_xor(pointer, val, mask, _builder) - - -# ----------------------- -# Conditioning -# ----------------------- - -@builtin -def where(condition, x, y, _builder=None): - """ - Returns a tensor of elements from either :code:`x` or :code:`y`, depending on :code:`condition`. - - Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`. - - If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead. - - The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`. - :code:`x` and :code:`y` must have the same data type. - - :param condition: When True (nonzero), yield x, otherwise yield y. - :type condition: Block of triton.bool - :param x: values selected at indices where condition is True. - :param y: values selected at indices where condition is False. - """ - condition = _to_tensor(condition, _builder) - x = _to_tensor(x, _builder) - y = _to_tensor(y, _builder) - return semantic.where(condition, x, y, _builder) - - -# ----------------------- -# Math -# ----------------------- - -@builtin -def umulhi(x, y, _builder=None): - x = _to_tensor(x, _builder) - y = _to_tensor(y, _builder) - return semantic.umulhi(x, y, _builder) - - -@builtin -def fdiv(x, y, ieee_rounding=False, _builder=None): - ieee_rounding = _constexpr_to_value(ieee_rounding) - return semantic.fdiv(x, y, ieee_rounding, _builder) - - -def _add_math_1arg_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Computes the element-wise {name} of :code:`x`. 
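# A minimal sketch of the atomic builtins above: they read-modify-write global
# memory and return the previous value, which makes them the usual way to combine
# results across program instances. Names and sizes are illustrative only; assumes
# the packaged `triton`/`torch` APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def total_sum_kernel(x_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    partial = tl.sum(x, axis=0)       # per-instance partial sum (scalar)
    tl.atomic_add(out_ptr, partial)   # combine instances; the returned old value is ignored


x = torch.randn(10_000, device="cuda")
out = torch.zeros(1, device="cuda")   # fp32 accumulator
grid = (triton.cdiv(x.numel(), 1024),)
total_sum_kernel[grid](x, out, x.numel(), BLOCK=1024)
# out[0] now approximates x.sum() (floating-point summation order differs).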
- - :param x: the input values - :type x: Block - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@builtin -@_add_math_1arg_docstr("exponential") -def exp(x, _builder=None): - return semantic.exp(x, _builder) - - -@builtin -@_add_math_1arg_docstr("natural logarithm") -def log(x, _builder=None): - return semantic.log(x, _builder) - - -@builtin -@_add_math_1arg_docstr("cosine") -def cos(x, _builder=None): - return semantic.cos(x, _builder) - - -@builtin -@_add_math_1arg_docstr("sine") -def sin(x, _builder=None): - return semantic.sin(x, _builder) - - -@builtin -@_add_math_1arg_docstr("square root") -def sqrt(x, _builder=None): - return semantic.sqrt(x, _builder) - - -@builtin -@_add_math_1arg_docstr("absolute value") -def abs(x, _builder=None): - return semantic.abs(x, _builder) - - -# ----------------------- -# Reductions -# ----------------------- - -def _add_reduction_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis` - - :param input: the input values - :param axis: the dimension along which the reduction should be done - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@contextmanager -def _insertion_guard(builder): - ip = builder.get_insertion_point() - yield - builder.restore_insertion_point(ip) - - -@builtin -def reduce(input, axis, combine_fn, _builder=None, _generator=None): - """Applies the combine_fn to all elements in :code:`input` tensors along the provided :code:`axis` - - :param input: the input tensor, or tuple of tensors - :param axis: the dimension along which the reduction should be done - :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit) - - """ - if isinstance(input, tensor): - return reduce((input,), axis, combine_fn, - _builder=_builder, _generator=_generator)[0] - - def make_combine_region(reduce_op): - in_scalar_tys = [t.type.scalar for t in input] - prototype = function_type(in_scalar_tys, in_scalar_tys * 2) - - region = reduce_op.get_region(0) - with _insertion_guard(_builder): - param_types = [ty.to_ir(_builder) for ty in prototype.param_types] - block = _builder.create_block_with_parent(region, param_types) - args = [tensor(block.arg(i), ty) - for i, ty in enumerate(prototype.param_types)] - results = _generator.call_JitFunction(combine_fn, args, kwargs={}) - if isinstance(results, tensor): - handles = [results.handle] - else: - handles = [r.handle for r in results] - _builder.create_reduce_ret(*handles) - - axis = _constexpr_to_value(axis) - return semantic.reduction(input, axis, make_combine_region, _builder) - - -@builtin -def _promote_reduction_input(t, _builder=None): - scalar_ty = t.type.scalar - # input is extended to 32-bits if necessary - # this increases numerical accuracy and can be done pretty much for free - # on GPUs - if scalar_ty.is_int() and scalar_ty.int_bitwidth < 32: - return t.to(int32, _builder=_builder) - - # hardware doesn't support FMAX, FMIN, CMP for bfloat16 - if scalar_ty is bfloat16: - return t.to(float32, _builder=_builder) - - return t - - -@builtin -def _argreduce(input, axis, combine_fn, _builder=None, _generator=None): - axis = _constexpr_to_value(axis) - n = input.shape[axis] - index = arange(0, n, _builder=_builder) - - if len(input.shape) > 1: - # Broadcast index across the non-reduced axes - axes_to_expand = [constexpr(d) for d in range(len(input.shape))] - del 
axes_to_expand[axis] - index = expand_dims(index, axes_to_expand, _builder=_builder) - index = broadcast_to(index, input.shape, _builder=_builder) - - rvalue, rindices = reduce((input, index), axis, combine_fn, - _builder=_builder, _generator=_generator) - return rindices - - -@triton.jit -def minimum(x, y): - """ - Computes the element-wise minimum of :code:`x` and :code:`y`. - - :param input: the first input tensor - :type input: Block - :param other: the second input tensor - :type other: Block - """ - return where(x < y, x, y) - - -@triton.jit -def maximum(x, y): - """ - Computes the element-wise maximum of :code:`x` and :code:`y`. - - :param input: the first input tensor - :type input: Block - :param other: the second input tensor - :type other: Block - """ - return where(x > y, x, y) - - -@triton.jit -def _max_combine(a, b): - return maximum(a, b) - - -@triton.jit -@_add_reduction_docstr("maximum") -def max(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _max_combine) - - -@triton.jit -def _argmax_combine(value1, index1, value2, index2): - gt = value1 > value2 - lt = value1 < value2 - index_min = minimum(index1, index2) - index_ret = where(gt, index1, where(lt, index2, index_min)) - value_ret = maximum(value1, value2) - return value_ret, index_ret - - -@triton.jit -@_add_reduction_docstr("maximum index") -def argmax(input, axis): - input = _promote_reduction_input(input) - return _argreduce(input, axis, _argmax_combine) - - -@triton.jit -def _min_combine(a, b): - # TODO: minimum/maximum doesn't get lowered to fmin/fmax... - return minimum(a, b) - - -@triton.jit -@_add_reduction_docstr("minimum") -def min(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _min_combine) - - -@triton.jit -def _argmin_combine(value1, index1, value2, index2): - lt = value1 < value2 - gt = value1 > value2 - index_min = minimum(index1, index2) - index_ret = where(lt, index1, where(gt, index2, index_min)) - value_ret = minimum(value1, value2) - return value_ret, index_ret - - -@triton.jit -@_add_reduction_docstr("minimum index") -def argmin(input, axis): - input = _promote_reduction_input(input) - return _argreduce(input, axis, _argmin_combine) - - -@triton.jit -def _sum_combine(a, b): - return a + b - - -@triton.jit -@_add_reduction_docstr("sum") -def sum(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _sum_combine) - - -@triton.jit -def _xor_combine(a, b): - return a ^ b - - -@builtin -@_add_reduction_docstr("xor sum") -def xor_sum(input, axis, _builder=None, _generator=None): - scalar_ty = input.type.scalar - if not scalar_ty.is_int(): - raise ValueError("xor_sum only supported for integers") - - input = _promote_reduction_input(input, _builder=_builder) - return reduce(input, axis, _xor_combine, - _builder=_builder, _generator=_generator) - - -# ----------------------- -# Compiler Hint Ops -# ----------------------- - - -@builtin -def debug_barrier(_builder=None): - ''' - Insert a barrier to synchronize all threads in a block. - ''' - return semantic.debug_barrier(_builder) - - -@builtin -def multiple_of(input, values, _builder=None): - """ - Let the compiler knows that the values in :code:`input` are all multiples of :code:`value`. 
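# A minimal sketch of the reduction helpers above (`max`, `min`, `sum`, `argmax`,
# ... are thin @triton.jit wrappers over `reduce` with a combine function): each
# program instance reduces one row of a 2D tensor. Names and sizes are illustrative
# only; assumes the packaged `triton`/`torch` APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def row_max_kernel(x_ptr, out_ptr, n_cols, stride_row, BLOCK_N: tl.constexpr):
    row = tl.program_id(axis=0)
    cols = tl.arange(0, BLOCK_N)                 # BLOCK_N: power of two >= n_cols
    mask = cols < n_cols
    x = tl.load(x_ptr + row * stride_row + cols, mask=mask, other=-float("inf"))
    tl.store(out_ptr + row, tl.max(x, axis=0))   # reduce the row to a scalar


x = torch.randn(64, 300, device="cuda")
out = torch.empty(64, device="cuda")
row_max_kernel[(x.shape[0],)](x, out, x.shape[1], x.stride(0), BLOCK_N=512)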
- """ - if isinstance(values, constexpr): - values = [values] - for i, d in enumerate(values): - if not isinstance(d, constexpr): - raise TypeError(f"values element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - values = [x.value for x in values] - return semantic.multiple_of(input, values) - - -@builtin -def max_contiguous(input, values, _builder=None): - """ - Let the compiler knows that the `value` first values in :code:`input` are contiguous. - """ - if isinstance(values, constexpr): - values = [values] - for i, d in enumerate(values): - if not isinstance(d, constexpr): - raise TypeError(f"values element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - values = [x.value for x in values] - return semantic.max_contiguous(input, values) - -# ----------------------- -# Debugging functions -# ----------------------- - - -@builtin -def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=False, _builder=None): - ''' - Print the values at compile time. The parameters are the same as the builtin :code:`print`. - ''' - pass - - -@builtin -def static_assert(cond, msg="", _builder=None): - ''' - Assert the condition at compile time. The parameters are the same as the builtin :code:`assert`. - ''' - pass - - -@builtin -def device_print(prefix, *args, _builder=None): - ''' - Print the values at runtime from the device. - - :param prefix: a prefix to print before the values. This is required to be a string literal. - :param args: the values to print. They can be any tensor or scalar. - ''' - import string - prefix = _constexpr_to_value(prefix) - assert isinstance(prefix, str), f"{prefix} is not string" - b_ascii = True - for ch in prefix: - if ch not in string.printable: - b_ascii = False - break - assert b_ascii, f"{prefix} is not an ascii string" - new_args = [] - for arg in args: - new_args.append(_to_tensor(arg, _builder)) - return semantic.device_print(prefix, new_args, _builder) - - -@builtin -def device_assert(cond, msg="", _builder=None): - ''' - Assert the condition at runtime from the device. - - :param cond: the condition to assert. This is required to be a boolean tensor. - :param msg: the message to print if the assertion fails. This is required to be a string literal. - ''' - msg = _constexpr_to_value(msg) - import inspect - frame = inspect.currentframe() - module = inspect.getmodule(frame) - # The triton function module doesn't have the name attribute. - # We use this trick to find the caller. - while hasattr(module, "__name__"): - frame = frame.f_back - module = inspect.getmodule(frame) - func_name = frame.f_code.co_name - file_name = frame.f_back.f_code.co_filename - # TODO: The line number currently indicates the line - # where the triton function is called but not where the - # device_assert is called. Need to enhance this. - lineno = frame.f_back.f_lineno - return semantic.device_assert(_to_tensor(cond, _builder), msg, file_name, func_name, lineno, _builder) - - -# ----------------------- -# Iterators -# ----------------------- - - -class static_range: - - """ - Iterator that counts upward forever. - - .. highlight:: python - .. code-block:: python - - @triton.jit - def kernel(...): - for i in tl.static_range(10): - ... 
- :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of - :code:`triton.jit` functions. In addition, it also guides the compiler to unroll the loop aggressively. - :param arg1: the start value. - :param arg2: the end value. - :param step: the step value. - """ - - def __init__(self, arg1, arg2=None, step=None): - assert isinstance(arg1, constexpr) - if step is None: - self.step = constexpr(1) - else: - assert isinstance(step, constexpr) - self.step = step - if arg2 is None: - self.start = constexpr(0) - self.end = arg1 - else: - assert isinstance(arg2, constexpr) - self.start = arg1 - self.end = arg2 - - def __iter__(self): - raise RuntimeError("static_range can only be used in @triton.jit'd functions") - - def __next__(self): - raise RuntimeError("static_range can only be used in @triton.jit'd functions") - - -# ----------------------- -# Extern functions -# ----------------------- - -def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_shape: tuple, is_pure: bool, _builder=None): - ''' - Dispatch a function to a library - :param func: the function to dispatch - :param lib_name: the name of the library - :param lib_path: the path of the library - :param args: the arguments of the function - :param arg_type_symbol_dict: the type of the arguments - :param ret_shape: the shape of the return value - :param _builder: the builder - :return: the return value of the function - ''' - if len(arg_type_symbol_dict) == 0: - raise ValueError("arg_type_symbol_dict is empty") - - num_args = len(list(arg_type_symbol_dict.keys())[0]) - if len(args) != num_args: - raise ValueError(f"length of input args does not match." - f"Expect {len(args)}, got {num_args}") - - arg_types = [] - arg_list = [] - for arg in args: - if isinstance(arg, tensor): - arg_types.append(arg.dtype) - arg_list.append(arg.handle) - else: - arg_types.append(type(arg)) - arg_list.append(arg) - arg_types = tuple(arg_types) - - if arg_types not in arg_type_symbol_dict: - raise ValueError(f"input arg type does not match." 
- f"Expect one of {arg_type_symbol_dict.keys()}, got {arg_types}") - else: - symbol = arg_type_symbol_dict[arg_types][0] - ret_type = arg_type_symbol_dict[arg_types][1] - if ret_shape: - ret_type = block_type(ret_type, ret_shape) - return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(_builder), is_pure), ret_type) - - -def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, is_pure: bool, _builder=None): - ''' - Dispatch an elementwise function to a library - :param lib_name: the name of the library - :param lib_path: the path of the library - :param args: the arguments of the function - :param arg_type_symbol_dict: the type of the arguments - :param is_pure: whether the function is pure - :param _builder: the builder - :return: the return value of the function - ''' - dispatch_args = args.copy() - all_scalar = True - ret_shape = None - arg_types = [] - for i in range(len(dispatch_args)): - dispatch_args[i] = _to_tensor(dispatch_args[i], _builder) - arg_types.append(dispatch_args[i].dtype) - if dispatch_args[i].type.is_block(): - all_scalar = False - if len(arg_types) > 0: - arg_types = tuple(arg_types) - arithmetic_check = True - # If there's a type tuple that is not supported by the library, we will do arithmetic check - if arg_types in arg_type_symbol_dict: - arithmetic_check = False - broadcast_arg = dispatch_args[0] - # Get the broadcast shape over all the arguments - for i, item in enumerate(dispatch_args): - _, broadcast_arg = semantic.binary_op_type_checking_impl( - item, broadcast_arg, _builder, arithmetic_check=arithmetic_check) - # Change the shape of each argument based on the broadcast shape - for i in range(len(dispatch_args)): - dispatch_args[i], _ = semantic.binary_op_type_checking_impl( - dispatch_args[i], broadcast_arg, _builder, arithmetic_check=arithmetic_check) - if not all_scalar: - ret_shape = broadcast_arg.shape - func = getattr(_builder, "create_extern_elementwise") - return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, is_pure, _builder) - - -def extern(fn): - """A decorator for external functions.""" - return builtin(fn) diff --git a/python/triton/language/extra/__init__.py b/python/triton/language/extra/__init__.py deleted file mode 100644 index 2fd0ff3eeee3..000000000000 --- a/python/triton/language/extra/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import cuda - -__all__ = ['cuda'] diff --git a/python/triton/language/extra/cuda.bc b/python/triton/language/extra/cuda.bc deleted file mode 100644 index 4538ac35446a..000000000000 Binary files a/python/triton/language/extra/cuda.bc and /dev/null differ diff --git a/python/triton/language/extra/cuda.py b/python/triton/language/extra/cuda.py deleted file mode 100644 index 92df37a67c77..000000000000 --- a/python/triton/language/extra/cuda.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -from .. 
import core - -__path__ = os.path.dirname(os.path.abspath(__file__)) - - -@core.extern -def globaltimer(_builder=None): - return core.extern_elementwise("cuda", os.path.join(__path__, "cuda.bc"), [], - {tuple(): ("globaltimer", core.dtype("int64")), - }, is_pure=False, _builder=_builder) - - -@core.extern -def smid(_builder=None): - return core.extern_elementwise("cuda", os.path.join(__path__, "cuda.bc"), [], - {tuple(): ("smid", core.dtype("int32")), - }, is_pure=True, _builder=_builder) diff --git a/python/triton/language/math.py b/python/triton/language/math.py deleted file mode 100644 index 56e1ac5a11a0..000000000000 --- a/python/triton/language/math.py +++ /dev/null @@ -1,1534 +0,0 @@ -import functools -import os - -from . import core - - -@functools.lru_cache() -def libdevice_path(): - import torch - third_party_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "third_party") - if torch.version.hip is None: - default = os.path.join(third_party_dir, "cuda", "lib", "libdevice.10.bc") - else: - default = '' - return os.getenv("TRITON_LIBDEVICE_PATH", default) - - -@core.extern -def clz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_clz", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_clzll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def popc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_popc", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_popcll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def byte_perm(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("int32"), core.dtype("int32"), core.dtype("int32"),): ("__nv_byte_perm", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def min(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_min", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umin", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_llmin", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmin", core.dtype("uint64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fminf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def max(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_max", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umax", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_llmax", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmax", core.dtype("uint64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaxf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmax", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mulhi(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mulhi", core.dtype("int32")), - 
(core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umulhi", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_mul64hi", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_umul64hi", core.dtype("uint64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul24(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mul24", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umul24", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def brev(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_brev", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_brevll", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sad(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("int32"), core.dtype("int32"), core.dtype("uint32"),): ("__nv_sad", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"), core.dtype("uint32"),): ("__nv_usad", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def abs(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_abs", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_llabs", core.dtype("int64")), - (core.dtype("fp32"),): ("__nv_fabsf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_fabs", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def floor(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_floorf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_floor", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp64h(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_rcp64h", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rsqrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rsqrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rsqrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ceil(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_ceil", core.dtype("fp64")), - (core.dtype("fp32"),): ("__nv_ceilf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def trunc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_trunc", core.dtype("fp64")), - (core.dtype("fp32"),): ("__nv_truncf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp2(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_exp2f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_exp2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def saturatef(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_saturatef", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rn(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rz(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rd(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_ru(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_dividef(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_fdividef", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rn", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rn", core.dtype("fp64")), - 
}, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rz", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rd", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_ru", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rn", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rz", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rd", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_ru", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sqrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sqrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rn", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rz", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rd", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - 
{(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_ru", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rn", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rz", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rd", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_ru", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def 
double2uint_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rn", 
core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hiloint2double(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hiloint2double", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2loint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2loint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2hiint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2hiint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rn(arg0, _builder=None): - return 
core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern 
-def ll2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int_as_float(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int_as_float", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float_as_int(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float_as_int", 
core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint_as_float(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint_as_float", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float_as_uint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float_as_uint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def longlong_as_double(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_longlong_as_double", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double_as_longlong(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double_as_longlong", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_sinf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_sinf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_cosf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_cosf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_log2f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_log2f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_logf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_logf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_expf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_expf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_tanf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_tanf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_exp10f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_exp10f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_log10f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_log10f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_powf(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_powf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hadd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hadd", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_uhadd", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern 
-def rhadd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_rhadd", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_urhadd", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rsqrt_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frsqrt_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ffs(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_ffs", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_ffsll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rintf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rint", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def llrint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_llrintf", core.dtype("int64")), - (core.dtype("fp64"),): ("__nv_llrint", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def nearbyint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_nearbyintf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_nearbyint", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isnan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_isnanf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_isnand", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def signbit(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_signbitf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_signbitd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def copysign(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_copysignf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_copysign", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def finitef(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_finitef", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isinf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_isinff", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_isinfd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def nextafter(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_nextafterf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_nextafter", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sin(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cos(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cosf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cos", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sinpi(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinpif", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sinpi", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cospi(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cospif", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cospi", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_tanf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tan", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log2(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log2f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_expf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_exp", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp10(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_exp10f", core.dtype("fp32")), - 
(core.dtype("fp64"),): ("__nv_exp10", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cosh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_coshf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cosh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sinh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sinh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tanh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_tanhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tanh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def atan2(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_atan2f", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_atan2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def atan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_atanf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_atan", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def asin(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_asinf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_asin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def acos(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_acosf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_acos", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_logf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log10(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log10f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log10", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log1p(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log1pf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log1p", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def acosh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_acoshf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_acosh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def asinh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_asinhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_asinh", core.dtype("fp64")), - }, is_pure=True, 
_builder=_builder) - - -@core.extern -def atanh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_atanhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_atanh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def expm1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_expm1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_expm1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hypot(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_hypotf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_hypot", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rhypot(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rhypotf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rhypot", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def norm3d(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm3df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm3d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rnorm3d(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm3df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rnorm3d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def norm4d(arg0, arg1, arg2, arg3, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, arg3, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm4df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm4d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rnorm4d(arg0, arg1, arg2, arg3, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, arg3, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm4df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rnorm4d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cbrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cbrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cbrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcbrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rcbrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rcbrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def 
j0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_j0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_j0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def j1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_j1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_j1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def y0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_y0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_y0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def y1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_y1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_y1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def yn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_ynf", core.dtype("fp32")), - (core.dtype("int32"), core.dtype("fp64"),): ("__nv_yn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def jn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_jnf", core.dtype("fp32")), - (core.dtype("int32"), core.dtype("fp64"),): ("__nv_jn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cyl_bessel_i0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cyl_bessel_i0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cyl_bessel_i0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cyl_bessel_i1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cyl_bessel_i1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cyl_bessel_i1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erff", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erf", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfinv(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfc", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfcx(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcxf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfcx", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfcinv(arg0, 
_builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfcinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def normcdfinv(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_normcdfinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_normcdfinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def normcdf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_normcdff", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_normcdf", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def lgamma(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_lgammaf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_lgamma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ldexp(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_ldexpf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_ldexp", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def scalbn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_scalbnf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_scalbn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fmod(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmodf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmod", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def remainder(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_remainderf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_remainder", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def pow(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_powif", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_powi", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_powf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_pow", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tgamma(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_tgammaf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tgamma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def round(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_roundf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_round", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def llround(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_llroundf", core.dtype("int64")), - (core.dtype("fp64"),): ("__nv_llround", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fdim(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdimf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fdim", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ilogb(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_ilogbf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_ilogb", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def logb(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_logbf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_logb", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isfinited(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_isfinited", core.dtype("int32")), - }, is_pure=True, _builder=_builder) diff --git a/python/triton/language/random.py b/python/triton/language/random.py deleted file mode 100644 index a9ddbd829f12..000000000000 --- a/python/triton/language/random.py +++ /dev/null @@ -1,178 +0,0 @@ -import triton -from . import core as tl - -PHILOX_KEY_A: tl.constexpr = 0x9E3779B9 -PHILOX_KEY_B: tl.constexpr = 0xBB67AE85 -PHILOX_ROUND_A: tl.constexpr = 0xD2511F53 -PHILOX_ROUND_B: tl.constexpr = 0xCD9E8D57 -N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox - -# ------------------- -# randint -# ------------------- - - -@triton.jit -def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Run `n_rounds` rounds of Philox for state (c0, c1, c2, c3) and key (k0, k1). 
- """ - for _ in tl.static_range(n_rounds): - # for _ in range(n_rounds): - # update random state - A = PHILOX_ROUND_A - B = PHILOX_ROUND_B - _c0, _c2 = c0, c2 - c0 = tl.umulhi(B, _c2) ^ c1 ^ k0 - c2 = tl.umulhi(A, _c0) ^ c3 ^ k1 - c1 = B * _c2 - c3 = A * _c0 - # raise key - k0 = k0 + PHILOX_KEY_A - k1 = k1 + PHILOX_KEY_B - return c0, c1, c2, c3 - - -@triton.jit -def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - seed = seed.to(tl.uint64) - seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32) - seed_lo = (seed & 0xffffffff).to(tl.uint32) - c0 = c0.to(tl.uint32, bitcast=True) - c1 = c1.to(tl.uint32, bitcast=True) - c2 = c2.to(tl.uint32, bitcast=True) - c3 = c3.to(tl.uint32, bitcast=True) - return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds) - - -@triton.jit -def randint(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, returns a single - block of random :code:`int32`. - - If you need multiple streams of random numbers, - using `randint4x` is likely to be faster than calling `randint` 4 times. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - ret, _, _, _ = randint4x(seed, offset, n_rounds) - return ret - - -@triton.jit -def randint4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, returns four - blocks of random :code:`int32`. - - This is the maximally efficient entry point - to Triton's Philox pseudo-random number generator. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - # _0 = tl.zeros(offset.shape, offset.dtype) - _0 = offset * 0 - return philox(seed, offset, _0, _0, _0, n_rounds) - - -# ------------------- -# rand -# ------------------- - -# @triton.jit -# def uint32_to_uniform_float(x): -# """ -# Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). -# """ -# two_to_the_minus_32: tl.constexpr = 2.328306e-10 -# return x * two_to_the_minus_32 - -@triton.jit -def uint32_to_uniform_float(x): - """ - Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). - """ - x = x.to(tl.int32, bitcast=True) - # maximum value such that `MAX_INT * scale < 1.0` (with float rounding) - scale = 4.6566127342e-10 - x = tl.where(x < 0, -x - 1, x) - return x * scale - - -@triton.jit -def rand(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a block of random :code:`float32` in :math:`U(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - offset = offset.to(tl.uint32, bitcast=True) - source = randint(seed, offset, n_rounds) - return uint32_to_uniform_float(source) - - -@triton.jit -def rand4x(seed, offsets, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offsets` block, - returns a 4 blocks of random :code:`float32` in :math:`U(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. 
- """ - offsets = offsets.to(tl.uint32, bitcast=True) - i1, i2, i3, i4 = randint4x(seed, offsets, n_rounds) - u1 = uint32_to_uniform_float(i1) - u2 = uint32_to_uniform_float(i2) - u3 = uint32_to_uniform_float(i3) - u4 = uint32_to_uniform_float(i4) - return u1, u2, u3, u4 - -# ------------------- -# randn -# ------------------- - - -@triton.jit -def pair_uniform_to_normal(u1, u2): - """Box-Muller transform""" - u1 = tl.maximum(1.0e-7, u1) - th = 6.283185307179586 * u2 - r = tl.sqrt(-2.0 * tl.log(u1)) - return r * tl.cos(th), r * tl.sin(th) - - -@triton.jit -def randn(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a block of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - i1, i2, _, _ = randint4x(seed, offset, n_rounds) - u1 = uint32_to_uniform_float(i1) - u2 = uint32_to_uniform_float(i2) - n1, _ = pair_uniform_to_normal(u1, u2) - return n1 - - -@triton.jit -def randn4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a 4 blocks of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - u1, u2, u3, u4 = rand4x(seed, offset, n_rounds) - n1, n2 = pair_uniform_to_normal(u1, u2) - n3, n4 = pair_uniform_to_normal(u3, u4) - return n1, n2, n3, n4 diff --git a/python/triton/language/semantic.py b/python/triton/language/semantic.py deleted file mode 100644 index 968e57f59385..000000000000 --- a/python/triton/language/semantic.py +++ /dev/null @@ -1,1440 +0,0 @@ -from __future__ import annotations # remove after python 3.11 - -from functools import wraps -from typing import List, Optional, Sequence, Tuple, TypeVar - -from . import core as tl -from triton._C.libtriton.triton import ir - -T = TypeVar('T') - -# Create custom exception that prints message "hello" - - -class IncompatibleTypeErrorImpl(Exception): - def __init__(self, type_a, type_b): - self.type_a = type_a - self.type_b = type_b - self.message = "invalid operands of type " + self.type_a.__repr__() + " and " + self.type_b.__repr__() - super(IncompatibleTypeErrorImpl, self).__init__(self.message) - - -# ===----------------------------------------------------------------------===## -# Programming Model -# ===----------------------------------------------------------------------===## - -def program_id(axis: int, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_get_program_id(axis), tl.int32) - - -def num_programs(axis: int, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_get_num_programs(axis), tl.int32) - -# ===----------------------------------------------------------------------===// -# Implicit Casting Utilities -# ===----------------------------------------------------------------------===// - - -def integer_promote_impl(a_ty: tl.dtype, b_ty: tl.dtype) -> tl.dtype: - a_rank = a_ty.int_bitwidth - b_rank = b_ty.int_bitwidth - a_sn = a_ty.int_signedness - b_sn = b_ty.int_signedness - # Rules for signedness taken from "Usual arithmetic conversions" on - # https://en.cppreference.com/w/c/language/conversion. 
- if a_sn == b_sn: - return a_ty if a_rank > b_rank else b_ty - elif a_sn == tl.dtype.SIGNEDNESS.UNSIGNED: - return a_ty if a_rank >= b_rank else b_ty - elif b_sn == tl.dtype.SIGNEDNESS.UNSIGNED: - return b_ty if b_rank >= a_rank else a_ty - assert False - - -def computation_type_impl(a_ty: tl.dtype, b_ty: tl.dtype, div_or_mod: bool) -> tl.dtype: - # 1) if one operand is double, the other is implicitly - # converted to double - if a_ty.is_fp64() or b_ty.is_fp64(): - return tl.float64 - # 2) if one operand is float, the other is implicitly - # converted to float - if a_ty.is_fp32() or b_ty.is_fp32(): - return tl.float32 - # 3 ) if one operand is half, the other is implicitly converted to half - # unless we're doing / or %, which do not exist natively in PTX for fp16. - # Supported PTX op: add, sub, mul, fma, neg, abs, min, max, tanh, ex2, setp - if a_ty.is_fp16() or b_ty.is_fp16(): - if div_or_mod: - return tl.float32 - else: - return tl.float16 - # 4) return bf16 only if both operands are of bf16 - if a_ty.is_bf16() or b_ty.is_bf16(): - if div_or_mod: - return tl.float32 - if a_ty.is_bf16() and b_ty.is_bf16(): - return tl.bfloat16 - return tl.float32 - if not a_ty.is_int() or not b_ty.is_int(): - assert False - # 5 ) both operands are integer and undergo - # integer promotion - if div_or_mod and a_ty.int_signedness != b_ty.int_signedness: - raise ValueError("Cannot use /, #, or % with " + a_ty.__repr__() + " and " + b_ty.__repr__() + " because they have different signedness;" - "this is unlikely to result in a useful answer. Cast them to the same signedness.") - return integer_promote_impl(a_ty, b_ty) - -# ===----------------------------------------------------------------------===// -# Binary Operators -# ===----------------------------------------------------------------------===// - - -def check_ptr_type_impl(type_a: tl.dtype, type_b: tl.dtype, allow_ptr_a: bool) -> None: - if type_a.is_ptr(): - if not allow_ptr_a: - raise IncompatibleTypeErrorImpl(type_a, type_b) - # T* + U* with T != U - if type_b.is_ptr() and (type_a != type_b): - raise IncompatibleTypeErrorImpl(type_a, type_b) - # T* + float - if type_b.is_floating(): - raise IncompatibleTypeErrorImpl(type_a, type_b) - - -def binary_op_type_checking_impl(lhs: tl.tensor, - rhs: tl.tensor, - builder: ir.builder, - allow_lhs_ptr=False, allow_rhs_ptr=False, - arithmetic_check=True, div_or_mod=False - ) -> Tuple[tl.tensor, tl.tensor]: - # implicit broadcasting - lhs, rhs = broadcast_impl_value(lhs, rhs, builder) - # implicit typecasting - lhs_sca_ty = lhs.type.scalar - rhs_sca_ty = rhs.type.scalar - check_ptr_type_impl(lhs_sca_ty, rhs_sca_ty, allow_lhs_ptr) - check_ptr_type_impl(rhs_sca_ty, lhs_sca_ty, allow_rhs_ptr) - if arithmetic_check and not lhs_sca_ty.is_ptr() and not rhs_sca_ty.is_ptr(): - ret_sca_ty = computation_type_impl(lhs_sca_ty, rhs_sca_ty, div_or_mod) - lhs = cast(lhs, ret_sca_ty, builder) - rhs = cast(rhs, ret_sca_ty, builder) - return lhs, rhs - - -def add(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - - # offset + ptr - # ptr + offset - if other_scalar_ty.is_ptr() and not input_scalar_ty.is_ptr(): - input, other = other, input - if input_scalar_ty.is_ptr(): - return tl.tensor(builder.create_addptr(input.handle, other.handle), input.type) - # float + float - elif input_scalar_ty.is_floating(): - return 
tl.tensor(builder.create_fadd(input.handle, other.handle), input.type) - # int + int - elif input_scalar_ty.is_int(): - return tl.tensor(builder.create_add(input.handle, other.handle), input.type) - assert False - - -def sub(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, True, False) - scalar_ty = input.type.scalar - # ptr - offset - if scalar_ty.is_ptr(): - return tl.tensor(builder.create_addptr(input.handle, minus(other, builder).handle), - input.type) - # float - float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fsub(input.handle, other.handle), input.type) - # int - int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_sub(input.handle, other.handle), input.type) - assert False - - -def mul(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float * float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fmul(input.handle, other.handle), input.type) - # * int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_mul(input.handle, other.handle), input.type) - assert False - - -def truediv(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - # float / int - if input_scalar_ty.is_floating() and other_scalar_ty.is_int(): - other = cast(other, input_scalar_ty, builder) - # int / float - elif input_scalar_ty.is_int() and other_scalar_ty.is_floating(): - input = cast(input, other_scalar_ty, builder) - # int / int (cast to tl.float32) - elif input_scalar_ty.is_int() and other_scalar_ty.is_int(): - input = cast(input, tl.float32, builder) - other = cast(other, tl.float32, builder) - # float / float (cast to highest exponent type) - elif input_scalar_ty.is_floating() and other_scalar_ty.is_floating(): - if input_scalar_ty.fp_mantissa_width > other_scalar_ty.fp_mantissa_width: - other = cast(other, input_scalar_ty, builder) - else: - input = cast(input, other_scalar_ty, builder) - # unreachable - else: - assert False - return tl.tensor(builder.create_fdiv(input.handle, other.handle), input.type) - - -def floordiv(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - if input_scalar_ty.is_int() and other_scalar_ty.is_int(): - ret_ty = integer_promote_impl(input_scalar_ty, other_scalar_ty) - input = cast(input, ret_ty, builder) - other = cast(other, ret_ty, builder) - if ret_ty.is_int_signed(): - return tl.tensor(builder.create_sdiv(input.handle, other.handle), input.type) - else: - return tl.tensor(builder.create_udiv(input.handle, other.handle), input.type) - assert False - - -def fdiv(input: tl.tensor, - other: tl.tensor, - ieee_rounding: bool, - builder: ir.builder) -> tl.tensor: - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - if not input_scalar_ty.is_floating() or not other_scalar_ty.is_floating(): - raise ValueError("both operands of fdiv must have floating scalar type") - input, other = binary_op_type_checking_impl(input, other, builder, False, False, False, True) - ret = builder.create_fdiv(input.handle, 
other.handle) - return tl.tensor(ret, input.type) - - -def mod(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - # float % float - if scalar_ty.is_floating(): - # input - input.div(other, rounding_mode="floor") * other - ret = sub(input, mul(floor(fdiv(input, other, False, builder), builder), - other, builder), - builder) - return ret - # % int - elif scalar_ty.is_int(): - if scalar_ty.int_signedness != other_scalar_ty.int_signedness: - raise ValueError("Cannot mod " + scalar_ty.__repr__() + " by " + other_scalar_ty.__repr__() + " " - "because they have different signedness;" - "this is unlikely to result in a useful answer. Cast them to the same signedness.") - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_srem(input.handle, other.handle), input.type) - else: - return tl.tensor(builder.create_urem(input.handle, other.handle), input.type) - assert False - -############## -# bitwise ops -############## - - -def bitwise_op_type_checking_impl(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> Tuple[tl.tensor, tl.tensor]: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, False) - input_sca_ty = input.type.scalar - other_sca_ty = other.type.scalar - if not input_sca_ty.is_int() or not other_sca_ty.is_int(): - raise IncompatibleTypeErrorImpl(input_sca_ty, other_sca_ty) - ret_sca_ty = integer_promote_impl(input_sca_ty, other_sca_ty) - if ret_sca_ty != input_sca_ty: - input = cast(input, ret_sca_ty, builder) - if ret_sca_ty != other_sca_ty: - other = cast(other, ret_sca_ty, builder) - return input, other - - -def and_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_and(input.handle, other.handle), input.type) - - -def or_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_or(input.handle, other.handle), input.type) - - -def xor_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_xor(input.handle, other.handle), input.type) - - -def logical_and(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - if not other.type.is_int1(): - other = bitcast(other, tl.dtype("int1"), builder) - return and_(input, other, builder) - - -def logical_or(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - if not other.type.is_int1(): - other = bitcast(other, tl.dtype("int1"), builder) - return or_(input, other, builder) - - -def not_(input: tl.tensor, builder: ir.builder): - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - return invert(input, builder) - - -def lshr(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_lshr(input.handle, other.handle), input.type) - - -def ashr(input: tl.tensor, - other: tl.tensor, - builder: 
ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_ashr(input.handle, other.handle), input.type) - - -def shl(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_shl(input.handle, other.handle), input.type) - -# ===----------------------------------------------------------------------===// -# Unary Operators -# ===----------------------------------------------------------------------===// - - -def plus(input: tl.tensor) -> tl.tensor: - return input - - -def minus(input: tl.tensor, - builder: ir.builder) -> tl.tensor: - input_sca_ty = input.type.scalar - if input_sca_ty.is_ptr(): - raise ValueError("wrong type argument to unary minus (" + input_sca_ty.__repr__() + ")") - _0 = tl.tensor(builder.get_null_value(input_sca_ty.to_ir(builder)), input_sca_ty) - return sub(_0, input, builder) - - -def invert(input: tl.tensor, - builder: tl.tensor) -> tl.tensor: - input_sca_ty = input.type.scalar - if input_sca_ty.is_ptr() or input_sca_ty.is_floating(): - raise ValueError("wrong type argument to unary invert (" + input_sca_ty.__repr__() + ")") - _1 = tl.tensor(builder.get_all_ones_value(input_sca_ty.to_ir(builder)), input_sca_ty) - return xor_(input, _1, builder) - - -# ===----------------------------------------------------------------------===// -# Comparison Operators -# ===----------------------------------------------------------------------===// -def _bool_like(v: tl.tensor) -> tl.block_type: - if not v.type.is_block(): - return tl.int1 - shape = v.type.shape - return tl.block_type(tl.int1, shape) - - -def greater_than(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float > float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOGT(input.handle, other.handle), _bool_like(input)) - # > int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSGT(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpUGT(input.handle, other.handle), _bool_like(input)) - assert False - - -def greater_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float >= float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOGE(input.handle, other.handle), _bool_like(input)) - # >= int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSGE(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpUGE(input.handle, other.handle), _bool_like(input)) - assert False - - -def less_than(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float < float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOLT(input.handle, other.handle), _bool_like(input)) - # < int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSLT(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpULT(input.handle, other.handle), _bool_like(input)) - assert False - - 
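# Note on the comparison helpers in this block: floating-point comparisons
# lower to *ordered* predicates (fcmpOGT/OGE/OLT/OLE/OEQ), so any comparison
# involving NaN evaluates to False, whereas `not_equal` uses the unordered UNE
# predicate, so NaN != x evaluates to True. Integer comparisons pick the signed
# (S*) or unsigned (U*) predicate from the operands' signedness after promotion.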
-def less_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float < float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOLE(input.handle, other.handle), _bool_like(input)) - # < int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSLE(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpULE(input.handle, other.handle), _bool_like(input)) - assert False - - -def equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float == float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOEQ(input.handle, other.handle), _bool_like(input)) - # == int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_icmpEQ(input.handle, other.handle), _bool_like(input)) - assert False - - -def not_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float == float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpUNE(input.handle, other.handle), _bool_like(input)) - # == int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_icmpNE(input.handle, other.handle), _bool_like(input)) - assert False - -# ===----------------------------------------------------------------------===// -# Block Creation -# ===----------------------------------------------------------------------===// - - -def arange(start: int, end: int, builder: ir.builder) -> tl.tensor: - if not isinstance(start, int) or not isinstance(end, int): - raise ValueError("arange's arguments must be of type tl.constexpr") - is_start_int64 = bool(start >> 32) - is_end_int64 = bool(end >> 32) - if is_start_int64 or is_end_int64: - raise ValueError("arange must fit in int32") - if end <= start: - raise ValueError("arange's end argument must be greater than the start argument") - - shape = [end - start] - ret_ty = tl.block_type(tl.int32, shape) - return tl.tensor(builder.create_make_range(start, end), ret_ty) - - -def full(shape: List[int], value, dtype: tl.dtype, builder: ir.builder) -> tl.tensor: - if isinstance(value, tl.tensor): - assert value.numel.value == 1, "only accepts size-1 tensor" - value = cast(value, dtype, builder) - ret_ty = tl.block_type(value.dtype, shape) - return tl.tensor(builder.create_splat(value.handle, shape), ret_ty) - else: - # scalar - if value == 0: - value = builder.get_null_value(dtype.to_ir(builder)) - else: - get_value_fn = getattr(builder, f"get_{dtype.name}") - value = get_value_fn(value) - if dtype is None: - raise ValueError("dtype must be specified when value is not a tensor") - ret_ty = tl.block_type(dtype, shape) - return tl.tensor(builder.create_splat(value, shape), ret_ty) - - -# ===----------------------------------------------------------------------===// -# Shape Manipulation -# ===----------------------------------------------------------------------===// - - -def view(input: tl.tensor, - dst_shape: List[int], - builder: ir.builder) -> tl.tensor: - # TODO: disable when TritonToTritonGPU handles views properly - - # assert len(input.shape) == len(dst_shape) - numel = 1 - for s in dst_shape: - numel *= s - if input.type.numel != numel: - raise ValueError("cannot view 
block of different shape") - ret_ty = tl.block_type(input.type.scalar, dst_shape) - return tl.tensor(builder.create_view(input.handle, dst_shape), ret_ty) - - -def reshape(input: tl.tensor, - dst_shape: List[int], - builder: ir.builder) -> tl.tensor: - raise ValueError("`reshape` is not supported yet. Please use `view` instead if applicable. " - "Note that view may reorder elements in an implementation- and context- dependent way.") - - -def expand_dims(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: - dst_shape = list(input.type.shape) - dst_shape.insert(axis, 1) - ret_ty = tl.block_type(input.type.scalar, dst_shape) - return tl.tensor(builder.create_expand_dims(input.handle, axis), ret_ty) - - -def cat(lhs: tl.tensor, rhs: tl.tensor, can_reorder: bool, builder: ir.builder) -> tl.tensor: - assert can_reorder, "current implementation of `cat` always may reorder elements" - assert len(lhs.shape) == 1 - ret_type = tl.block_type(lhs.type.scalar, [lhs.shape[0] + rhs.shape[0]]) - return tl.tensor(builder.create_cat(lhs.handle, rhs.handle), ret_type) - - -def trans(input: tl.tensor, builder: ir.builder) -> tl.tensor: - if len(input.shape) != 2: - raise ValueError("Only 2D tensors can be transposed") - ret_type = tl.block_type(input.type.scalar, [input.shape[1], input.shape[0]]) - return tl.tensor(builder.create_trans(input.handle), ret_type) - - -def broadcast_impl_shape(input: tl.tensor, - shape: List[int], - builder: ir.builder) -> tl.tensor: - if not input.type.is_block(): - ret_ty = tl.block_type(input.type, shape) - return tl.tensor(builder.create_splat(input.handle, shape), ret_ty) - src_shape = input.type.get_block_shapes() - if len(src_shape) != len(shape): - raise ValueError(f"Cannot broadcast, rank mismatch: {src_shape}, {shape}") - if shape == src_shape: - return input - for i, item in enumerate(src_shape): - if shape[i] != item and item != 1: - raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})" - f" must match the existing size ({item}) at non-singleton dimension" - f" {i}: {src_shape}, {shape}") - ret_ty = tl.block_type(input.type.scalar, shape) - return tl.tensor(builder.create_broadcast(input.handle, shape), ret_ty) - - -def broadcast_impl_value(lhs: tl.tensor, - rhs: tl.tensor, - builder: ir.builder) -> tl.tensor: - lhs_ty = lhs.type - rhs_ty = rhs.type - - # make_shape_compatible(block, scalar) - if lhs_ty.is_block() and not rhs_ty.is_block(): - rhs_ty = tl.block_type(rhs_ty.scalar, lhs_ty.shape) - rhs = tl.tensor(builder.create_splat(rhs.handle, lhs_ty.get_block_shapes()), rhs_ty) - # make_shape_compatible(scalar, block) - elif not lhs_ty.is_block() and rhs_ty.is_block(): - lhs_ty = tl.block_type(lhs_ty.scalar, rhs_ty.shape) - lhs = tl.tensor(builder.create_splat(lhs.handle, rhs_ty.get_block_shapes()), lhs_ty) - # make_shape_compatible(block, block) - elif lhs_ty.is_block() and rhs_ty.is_block(): - lhs_shape = lhs_ty.get_block_shapes() - rhs_shape = rhs_ty.get_block_shapes() - - if len(lhs_shape) < len(rhs_shape): - # Add new axes to lhs - for dim in range(len(lhs_shape), len(rhs_shape)): - lhs = tl.tensor(builder.create_expand_dims(lhs.handle, 0), tl.block_type(lhs_ty.scalar, [1] + lhs_shape)) - lhs_ty = lhs.type - lhs_shape = lhs_ty.get_block_shapes() - elif len(rhs_shape) < len(lhs_shape): - # Add new axes to rhs - for dim in range(len(rhs_shape), len(lhs_shape)): - rhs = tl.tensor(builder.create_expand_dims(rhs.handle, 0), tl.block_type(rhs_ty.scalar, [1] + rhs_shape)) - rhs_ty = rhs.type - rhs_shape = 
rhs_ty.get_block_shapes() - assert len(rhs_shape) == len(lhs_shape) - - ret_shape = [] - for i, left in enumerate(lhs_shape): - right = rhs_shape[i] - if left == 1: - ret_shape.append(right) - elif right == 1: - ret_shape.append(left) - elif left == right: - ret_shape.append(left) - else: - raise ValueError("Cannot make_shape_compatible: incompatible dimensions " - "at index " + str(i) + ": " + str(left) + " and " + str(right)) - if lhs_shape != ret_shape: - ret_ty = tl.block_type(lhs_ty.scalar, ret_shape) - lhs = tl.tensor(builder.create_broadcast(lhs.handle, ret_shape), ret_ty) - if rhs_shape != ret_shape: - ret_ty = tl.block_type(rhs_ty.scalar, ret_shape) - rhs = tl.tensor(builder.create_broadcast(rhs.handle, ret_shape), ret_ty) - # (scalar, scalar) => returns original blocks - return lhs, rhs - -####### -# cast -####### - - -def bitcast(input: tl.tensor, - dst_ty: tl.dtype, - builder: ir.builder) -> tl.tensor: - src_ty = input.type - if src_ty.is_block(): - dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) - if src_ty == dst_ty: - return input - src_sca_ty = src_ty.scalar - dst_sca_ty = dst_ty.scalar - if src_sca_ty.is_ptr() or dst_sca_ty.is_ptr(): - return cast(input, dst_ty, builder) - # Bitcast - src_bits = src_sca_ty.primitive_bitwidth - dst_bits = dst_sca_ty.primitive_bitwidth - if src_bits != dst_bits: - raise ValueError("Cannot bitcast data-type of size " + str(src_bits) + " to " - "data-type of size " + str(dst_bits)) - return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), - dst_ty) - - -def cast(input: tl.tensor, - dst_ty: tl.dtype, - builder: ir.builder) -> tl.tensor: - src_ty = input.type - if isinstance(dst_ty, tl.constexpr): - dst_ty = dst_ty.value - if src_ty.is_block(): - dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) - if src_ty == dst_ty: - return input - - src_sca_ty = src_ty.scalar - dst_sca_ty = dst_ty.scalar - - # Casting with customized floating types involved: fp8 <=> bf16, fp16, fp32, fp64 - if (src_sca_ty.is_fp8() and dst_sca_ty.is_floating()) or \ - (src_sca_ty.is_floating() and dst_sca_ty.is_fp8()): - return tl.tensor(builder.create_fp_to_fp(input.handle, dst_ty.to_ir(builder)), - dst_ty) - - # bf16 <=> (not fp32) - if (src_sca_ty.is_fp16() and not dst_sca_ty.is_fp32()) or \ - (src_sca_ty.is_bf16() and not dst_sca_ty.is_fp32()): - return cast(cast(input, tl.float32, builder), dst_sca_ty, builder) - - # Standard floating types' casting: truncation - # fp64 => fp32, fp16, bf16 - # fp32 => fp16, bf16 - truncate_fp = src_sca_ty.is_floating() and \ - dst_sca_ty.is_floating() and \ - src_sca_ty.primitive_bitwidth > dst_sca_ty.primitive_bitwidth - if truncate_fp: - return tl.tensor(builder.create_fp_trunc(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Standard floating types' casting: extension - # fp32 => fp64 - # fp16 => fp32, fp64 - # bf16 => fp32, fp64 - ext_fp = src_sca_ty.is_floating() and \ - dst_sca_ty.is_floating() and \ - src_sca_ty.primitive_bitwidth < dst_sca_ty.primitive_bitwidth - if ext_fp: - return tl.tensor(builder.create_fp_ext(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting between integer types - if src_sca_ty.is_int() and dst_sca_ty.is_int() and \ - (src_sca_ty.int_bitwidth != dst_sca_ty.int_bitwidth or src_sca_ty.int_signedness != dst_sca_ty.int_signedness): - sign_extend = src_sca_ty.is_int_signed() and not src_sca_ty.is_bool() - if dst_sca_ty.is_bool(): - ty = input.dtype.to_ir(builder) - _0 = tl.tensor(builder.get_null_value(ty), input.dtype) - return 
not_equal(input, _0, builder) - else: - return tl.tensor(builder.create_int_cast(input.handle, - dst_ty.to_ir(builder), sign_extend), - dst_ty) - - # Casting standard floating types to integer types - if src_sca_ty.is_standard_floating() and dst_sca_ty.is_int(): - if dst_sca_ty.is_bool(): - ty = input.dtype.to_ir(builder) - _0 = tl.tensor(builder.get_null_value(ty), input.dtype) - return not_equal(input, _0, builder) - elif dst_sca_ty.is_int_signed(): - return tl.tensor(builder.create_fp_to_si(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - else: - return tl.tensor(builder.create_fp_to_ui(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting integer types to standard floating types - if src_sca_ty.is_int() and dst_sca_ty.is_standard_floating(): - if src_sca_ty.is_bool() or not src_sca_ty.is_int_signed(): - return tl.tensor(builder.create_ui_to_fp(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - else: - return tl.tensor(builder.create_si_to_fp(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting pointer types to integer types - if src_sca_ty.is_ptr() and dst_sca_ty.is_int(): - bitwidth = dst_sca_ty.int_bitwidth - if bitwidth == 64: - return tl.tensor(builder.create_ptr_to_int(input.handle, dst_ty.to_ir(builder)), - dst_ty) - if bitwidth == 1: - return not_equal(cast(input, tl.int64, builder), - tl.tensor(builder.get_int64(0), tl.int64), - builder) - - # Casting integer types to pointer types - if src_sca_ty.is_int() and dst_sca_ty.is_ptr(): - return tl.tensor(builder.create_int_to_ptr(input.handle, dst_ty.to_ir(builder)), dst_ty) - - # Casting pointer types to pointer types - if src_sca_ty.is_ptr() and dst_sca_ty.is_ptr(): - return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) - - assert False, f'cannot cast {input} to {dst_ty}' - -# ===----------------------------------------------------------------------===// -# Memory Operators -# ===----------------------------------------------------------------------===// - - -def _str_to_cache_modifier(cache_modifier): - cache = ir.CACHE_MODIFIER.NONE # default - if cache_modifier: - if cache_modifier == ".ca": - cache = ir.CACHE_MODIFIER.CA - elif cache_modifier == ".cg": - cache = ir.CACHE_MODIFIER.CG - else: - raise ValueError(f"Cache modifier {cache_modifier} not supported") - return cache - - -def _str_to_eviction_policy(eviction_policy): - eviction = ir.EVICTION_POLICY.NORMAL # default - if eviction_policy: - if eviction_policy == "evict_last": - eviction = ir.EVICTION_POLICY.EVICT_LAST - elif eviction_policy == "evict_first": - eviction = ir.EVICTION_POLICY.EVICT_FIRST - else: - raise ValueError(f"Eviction policy {eviction_policy} not supported") - return eviction - - -def _str_to_padding_option(padding_option): - padding = None # default - if padding_option: - if padding_option == "zero": - padding = ir.PADDING_OPTION.PAD_ZERO - elif padding_option == "nan": - padding = ir.PADDING_OPTION.PAD_NAN - else: - raise ValueError(f"Padding option {padding_option} not supported") - return padding - - -def _canonicalize_boundary_check(boundary_check, block_shape): - if boundary_check: - if not hasattr(boundary_check, "__iter__"): - boundary_check = [boundary_check] - boundary_check = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in boundary_check] - for dim in boundary_check: - assert isinstance(dim, int) and 0 <= dim < len(block_shape) - assert len(boundary_check) > 0 - assert len(boundary_check) == len(set(boundary_check)), "Duplicate dimension in `boundary_check`" - 
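        # At this point `boundary_check` is a non-empty, duplicate-free list of
        # in-range ints; e.g. a single `tl.constexpr(1)` or the tuple (1, 0)
        # have both been unwrapped to plain ints and are returned sorted.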
return sorted(boundary_check) - return tuple() - - -def _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): - # Load by a block pointer: `pointer_type>` - # Block pointer can not have `mask` and `other` arguments - if mask or other: - raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") - - elt_ty = ptr.type.element_ty.element_ty - assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" - if elt_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN: - raise ValueError("Padding option `nan` is not supported for integer block pointers") - - # `dst_ty` is de-referenced type of the pointer type - dst_ty = ptr.type.element_ty - - # Check `boundary_check` argument - boundary_check = _canonicalize_boundary_check(boundary_check, dst_ty.get_block_shapes()) - - # Build IR - return tl.tensor(builder.create_tensor_pointer_load(ptr.handle, boundary_check, padding, cache, eviction, - is_volatile), dst_ty) - - -def _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): - # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - if not ptr.type.scalar.is_ptr(): - raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.load`") - - # Check `mask`, `other`, `boundary_check`, and `padding` arguments - if not mask and other: - raise ValueError("`other` cannot be provided without `mask`") - if padding or boundary_check: - raise ValueError("`padding_option` or `boundary_check` argument is not supported for loading a tensor of" - "pointers or loading a scalar. Because the compiler does not know the boundary; please " - "use block pointers (defined by `make_block_ptr`) instead") - - # For a pointer of scalar, check the type of `mask` and `other` - if not ptr.type.is_block(): - if mask and mask.type.is_block(): - raise ValueError("Mask argument cannot be block type if pointer argument is not a block") - if other and other.type.is_block(): - raise ValueError("Other argument cannot be block type if pointer argument is not a block") - - # Make `mask` and `other` into the same shape as `ptr` - if ptr.type.is_block(): - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - if other: - other = broadcast_impl_shape(other, ptr.type.get_block_shapes(), builder) - - # Get `pointer_type` and `elt_ty` - ptr_ty = ptr.type.scalar - elt_ty = ptr_ty.element_ty - - # Treat `pointer_type` as `pointer_type` - if elt_ty == tl.int1: - elt_ty = tl.int8 - ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) - ptr = cast(ptr, ptr_ty, builder) - - # Cast `other` into `ele_ty` type - if other: - other = cast(other, elt_ty, builder) - - # Create loaded result type `dst_ty` - if ptr.type.is_block(): - shape = ptr.type.get_block_shapes() - dst_ty = tl.block_type(elt_ty, shape) - else: - # Load by de-referencing the pointer of scalar - dst_ty = elt_ty - - # Build IR - if not mask: - return tl.tensor(builder.create_load(ptr.handle, cache, eviction, is_volatile), dst_ty) - else: - return tl.tensor(builder.create_masked_load(ptr.handle, mask.handle, other.handle if other else None, cache, - eviction, is_volatile), dst_ty) - - -def load(ptr: tl.tensor, - mask: Optional[tl.tensor], - other: Optional[tl.tensor], - boundary_check, - padding_option: str, - cache_modifier: str, - eviction_policy: str, - is_volatile: bool, - builder: ir.builder) -> tl.tensor: - # Cache, eviction and padding options - cache = 
_str_to_cache_modifier(cache_modifier) - eviction = _str_to_eviction_policy(eviction_policy) - padding = _str_to_padding_option(padding_option) - - if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): - # Load by a block pointer: `pointer_type>` - return _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) - else: - # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - return _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) - - -def _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder): - # Store by a block pointer: `pointer_type>` - # Block pointers can not have the `mask` argument - if mask: - raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") - - # Check same shape and element type - block_shape = ptr.type.element_ty.get_block_shapes() - if not val.type.is_block(): - val = broadcast_impl_shape(val, block_shape, builder) - assert val.type.is_block(), "Value argument must be block type or a scalar" - assert block_shape == val.type.get_block_shapes(), "Block shape and value shape mismatch" - assert ptr.type.element_ty.element_ty == val.type.element_ty, "Block element type and value element type mismatch" - - elt_ty = ptr.type.element_ty.element_ty - assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" - - # Check `boundary_check` argument - boundary_check = _canonicalize_boundary_check(boundary_check, block_shape) - - # Build IR - return tl.tensor(builder.create_tensor_pointer_store(ptr.handle, val.handle, boundary_check, cache, eviction), - tl.void) - - -def _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder): - # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - if not ptr.type.scalar.is_ptr(): - raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.store`") - - # Check `boundary_check` argument - if boundary_check: - raise ValueError("`boundary_check` argument is not supported for storing a tensor of pointers or storing a " - "scalar. 
Because the compiler does not know the boundary; please use block pointers " - "(defined by `make_block_ptr`) instead") - - # For a pointer of scalar, check the type of `val` and `mask` - if not ptr.type.is_block(): - if val.type.is_block(): - raise ValueError("Value argument cannot be block type if pointer argument is not a block") - if mask and mask.type.is_block(): - raise ValueError("Mask argument cannot be block type if pointer argument is not a block") - - # Make `mask` and `val` into the same shape as `ptr` - if ptr.type.is_block(): - val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - - ptr_ty = ptr.type.scalar - elt_ty = ptr_ty.element_ty - - # Treat `pointer_type` as `pointer_type` - if elt_ty == tl.int1: - elt_ty = tl.int8 - ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) - ptr = cast(ptr, ptr_ty, builder) - - # Cast to target data type - val = cast(val, elt_ty, builder) - - # Build IR - if not mask: - return tl.tensor(builder.create_store(ptr.handle, val.handle, cache, eviction), tl.void) - if not mask.type.scalar.is_bool(): - raise ValueError("Mask must have boolean scalar type") - return tl.tensor(builder.create_masked_store(ptr.handle, val.handle, mask.handle, cache, eviction), tl.void) - - -def store(ptr: tl.tensor, - val: tl.tensor, - mask: Optional[tl.tensor], - boundary_check, - cache_modifier: str, - eviction_policy: str, - builder: ir.builder) -> tl.tensor: - # Cache and eviction options - cache = _str_to_cache_modifier(cache_modifier) - eviction = _str_to_eviction_policy(eviction_policy) - - if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): - # Store by a block pointer: `pointer_type>` - return _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder) - else: - # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - return _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder) - - -######### -# atomic -######### - - -def atomic_cas(ptr: tl.tensor, - cmp: tl.tensor, - val: tl.tensor, - builder: ir.builder) -> tl.tensor: - element_ty = ptr.type.scalar.element_ty - if element_ty.primitive_bitwidth not in [16, 32, 64]: - raise ValueError("atomic_cas only supports elements with width {16, 32, 64}") - return tl.tensor(builder.create_atomic_cas(ptr.handle, cmp.handle, val.handle), val.type) - - -def atom_red_typechecking_impl(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - op: str, - builder: ir.builder) -> Tuple[tl.tensor, tl.tensor, tl.tensor]: - if not ptr.type.scalar.is_ptr(): - raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__()) - - element_ty = ptr.type.scalar.element_ty - if element_ty is tl.float16 and op != 'add': - raise ValueError("atomic_" + op + " does not support fp16") - if element_ty in [tl.int1, tl.int8, tl.int16, tl.bfloat16]: - raise ValueError("atomic_" + op + " does not support " + str(element_ty)) - if ptr.type.is_block(): - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - if val: - val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) - val = cast(val, ptr.type.scalar.element_ty, builder) - if not mask: - mask_ir = builder.get_int1(True) - mask_ty = tl.int1 - if ptr.type.is_block(): - mask_ir = builder.create_splat(mask_ir, ptr.type.get_block_shapes()) - mask_ty = tl.block_type(tl.int1, ptr.type.get_block_shapes()) - mask = tl.tensor(mask_ir, mask_ty) - return ptr, val, 
mask - - -def atomic_max(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'max', builder) - sca_ty = val.type.scalar - # direct call to atomic_max for integers - if sca_ty.is_int(): - if sca_ty.is_int_signed(): - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, - ptr.handle, - val.handle, - mask.handle), - val.type) - else: - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, - ptr.handle, - val.handle, - mask.handle), - val.type) - # for float - # return atomic_smax(i_ptr, i_val) if val >= 0 - # return atomic_umin(i_ptr, i_val) if val < 0 - i_val = bitcast(val, tl.int32, builder) - i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) - pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, i_ptr.handle, i_val.handle, and_(mask, pos, builder).handle), i_val.type) - neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, i_ptr.handle, i_val.handle, and_(mask, neg, builder).handle), i_val.type) - return where(pos, pos_ret, neg_ret, builder) - - -def atomic_min(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'min', builder) - sca_ty = val.type.scalar - # direct call to atomic_min for integers - if sca_ty.is_int(): - if sca_ty.is_int_signed(): - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, - ptr.handle, - val.handle, - mask.handle), - val.type) - else: - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, - ptr.handle, - val.handle, - mask.handle), - val.type) - # for float - # return atomic_smin(i_ptr, i_val) if val >= 0 - # return atomic_umax(i_ptr, i_val) if val < 0 - i_val = bitcast(val, tl.int32, builder) - i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) - pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, - i_ptr.handle, - i_val.handle, - and_(mask, pos, builder).handle), - i_val.type) - neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, - i_ptr.handle, - i_val.handle, - and_(mask, neg, builder).handle), - i_val.type) - return where(pos, pos_ret, neg_ret, builder) - - -def atomic_add(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'add', builder) - sca_ty = val.type.scalar - op = ir.ATOMIC_OP.FADD if sca_ty.is_floating() else ir.ATOMIC_OP.ADD - return tl.tensor(builder.create_atomic_rmw(op, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_and(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'and', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.AND, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_or(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'or', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.OR, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_xor(ptr: tl.tensor, - val: tl.tensor, - mask: 
tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xor', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XOR, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_xchg(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xchg', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XCHG, ptr.handle, val.handle, mask.handle), val.type) - -# ===----------------------------------------------------------------------===// -# Linear Algebra -# ===----------------------------------------------------------------------===// - - -def dot(lhs: tl.tensor, - rhs: tl.tensor, - allow_tf32: bool, - out_dtype: tl.dtype, - builder: ir.builder) -> tl.tensor: - assert lhs.type.is_block() and rhs.type.is_block() - assert lhs.dtype == rhs.dtype, "lhs and rhs must have the same dtype!" - assert len(lhs.shape) == 2 and len(rhs.shape) == 2 - assert lhs.shape[1].value == rhs.shape[0].value - assert lhs.shape[0].value >= 16 and lhs.shape[1].value >= 16 \ - and rhs.shape[1].value >= 16,\ - "small blocks not supported!" - if lhs.type.scalar.is_int(): - assert lhs.type.scalar == tl.int8, "only int8 supported!" - # TODO: This is CUDA specific, check if ROCm has the same limitation - assert lhs.shape[1].value >= 32, "small blocks not supported!" - _0 = builder.get_int32(0) - ret_scalar_ty = tl.int32 - elif lhs.type.scalar.is_fp32() or lhs.type.scalar.is_bf16(): - _0 = builder.get_fp32(0) - ret_scalar_ty = tl.float32 - else: - _0 = builder.get_fp16(0) if out_dtype.is_fp16() else builder.get_fp32(0) - ret_scalar_ty = out_dtype - - M = lhs.type.shape[0] - N = rhs.type.shape[1] - _0 = builder.create_splat(_0, [M, N]) - ret_ty = tl.block_type(ret_scalar_ty, [M, N]) - return tl.tensor(builder.create_dot(lhs.handle, rhs.handle, _0, allow_tf32), - ret_ty) - - -# ===----------------------------------------------------------------------===// -# Indexing -# ===----------------------------------------------------------------------===// - -def where(condition: tl.tensor, - x: tl.tensor, - y: tl.tensor, - builder: ir.builder) -> tl.tensor: - condition = cast(condition, tl.int1, builder) - if condition.type.is_block(): - condition, x = broadcast_impl_value(condition, x, builder) - x, y = broadcast_impl_value(x, y, builder) - condition, x = broadcast_impl_value(condition, x, builder) - - x, y = binary_op_type_checking_impl(x, y, builder, True, True) - if not condition.type.is_block(): - condition, _ = broadcast_impl_value(condition, x, builder) - ret_ty = x.type - return tl.tensor(builder.create_select(condition.handle, x.handle, y.handle), ret_ty) - -# ===----------------------------------------------------------------------===// -# Reduction -# ===----------------------------------------------------------------------=== - - -def reduction( - inputs: Sequence[tl.tensor], axis: int, region_builder_fn, builder: ir.builder -) -> Tuple[tl.tensor, ...]: - # get result shape - shape = inputs[0].type.shape - ret_shape = [s for i, s in enumerate(shape) if i != axis] - for t in inputs: - assert t.type.shape == shape - - def wrap_tensor(x, scalar_ty): - if ret_shape: - res_ty = tl.block_type(scalar_ty, ret_shape) - else: - # 0d-tensor -> scalar - res_ty = scalar_ty - return tl.tensor(x, res_ty) - - reduce_op = builder.create_reduce([t.handle for t in inputs], axis) - region_builder_fn(reduce_op) - reduce_op.verify() - - return tuple( - 
wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar) - for i in range(len(inputs)) - ) - - -# ===----------------------------------------------------------------------=== -# Math -# ===----------------------------------------------------------------------=== - -def _check_dtype(dtypes: List[str]) -> T: - """ - We following libdevice's convention to check accepted data types for math functions. - It is not a good practice to support all data types as accelerators/GPUs don't support - many float16 and bfloat16 math operations. - We should let the users know that they are using and invoke explicit cast to convert - the data type to the supported one. - """ - def wrapper(fn): - @wraps(fn) - def check(*args, **kwargs): - # concatenate args and kwargs - all_args = list(args) + list(kwargs.values()) - for arg in [a for a in all_args if isinstance(a, tl.tensor)]: - if arg.type.scalar.name not in dtypes: - raise ValueError(f"Expected dtype {dtypes} but got {arg.type.scalar.name}") - return fn(*args, **kwargs) - return check - - return wrapper - - -def umulhi(x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: - x, y = binary_op_type_checking_impl(x, y, builder) - # FIXME(Keren): not portable, should be fixed - from . import math - return math.mulhi(x, y, _builder=builder) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def floor(x: tl.tensor, builder: ir.builder) -> tl.tensor: - # FIXME(Keren): not portable, should be fixed - from . import math - return math.floor(x, _builder=builder) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def exp(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_exp(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def log(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_log(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def cos(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_cos(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def sin(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_sin(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def sqrt(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_sqrt(x.handle), x.type) - - -def abs(x: tl.tensor, builder: ir.builder) -> tl.tensor: - dtype = x.dtype - if dtype.is_floating(): - return tl.tensor(builder.create_fabs(x.handle), x.type) - elif dtype.is_int_signed(): - return tl.tensor(builder.create_iabs(x.handle), x.type) - elif dtype.is_int_unsigned(): - return x # no-op - else: - assert False, f"Unexpected dtype {dtype}" - - -## - - -def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor: - if len(x.shape) != len(values): - raise ValueError("Shape of input to multiple_of does not match the length of values") - x.handle.set_attr("tt.divisibility", ir.make_attr(values, x.handle.get_context())) - return x - - -def max_contiguous(x: tl.tensor, values: List[int]) -> tl.tensor: - if len(x.shape) != len(values): - raise ValueError("Shape of input to max_contiguous does not match the length of values") - x.handle.set_attr("tt.contiguity", ir.make_attr(values, x.handle.get_context())) - return x - - -def debug_barrier(builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_barrier(), tl.void) - - -def device_print(prefix: str, args: List[tl.tensor], builder: ir.builder) -> tl.tensor: - new_args = [] - for arg in args: - new_args.append(arg.handle) - return 
tl.tensor(builder.create_print(prefix, new_args), tl.void) - - -def device_assert(cond: tl.tensor, msg: str, file_name: str, func_name, lineno: int, builder: ir.builder) -> tl.tensor: - cond_ty = cond.type - if not cond_ty.is_block(): - cond_ty = tl.block_type(cond_ty.scalar, (1,)) - cond = tl.tensor(builder.create_splat(cond.handle, (1,)), cond_ty) - return tl.tensor(builder.create_assert(cond.handle, msg, file_name, func_name, lineno), tl.void) - - -def _convert_elem_to_ir_value(builder, elem, require_i64): - if isinstance(elem, tl.constexpr): - return builder.get_int64(elem.value) if require_i64 else builder.get_int32(elem.value) - elif isinstance(elem, tl.tensor): - assert elem.numel.value == 1, "Expected a scalar in shape/strides/offsets" - assert elem.dtype.is_int(), "Expected an integer scalar type in shape/strides/offsets" - if elem.dtype != tl.int64 and require_i64: - return builder.create_int_cast(elem.handle, builder.get_int64_ty(), elem.dtype.is_int_signed()) - elif elem.dtype != tl.int32: - return builder.create_int_cast(elem.handle, builder.get_int32_ty(), elem.dtype.is_int_signed()) - return elem.handle - assert False, f"Unsupported element type in shape/strides/offsets: {type(elem)}" - - -def _convert_to_ir_values(builder, list_like, require_i64=True): - if hasattr(list_like, "__iter__"): - return [_convert_elem_to_ir_value(builder, elem, require_i64) for elem in list_like] - return [_convert_elem_to_ir_value(builder, list_like, require_i64)] - - -def make_block_ptr(base: tl.tensor, shape, strides, offsets, block_shape, order, builder: ir.builder) -> tl.tensor: - # Convert dynamic arguments to IR values - # NOTES(Chenggang): current `shape/strides` are `int64_t`, while `offsets/block_shape` are `int32_t` - shape = _convert_to_ir_values(builder, shape) - strides = _convert_to_ir_values(builder, strides) - offsets = _convert_to_ir_values(builder, offsets, require_i64=False) - - # Check `base` type - if not base.type.is_ptr() or base.type.element_ty.is_block(): - raise ValueError("Expected `base` to be a pointer type (but not a block pointer type or others)") - - # Treat `pointer_type` as `pointer_type` - if base.type.element_ty == tl.int1: - base = cast(base, tl.pointer_type(tl.int8, base.type.address_space), builder) - - # Check whether `block_shape` is static - if not hasattr(block_shape, "__iter__"): - block_shape = [block_shape] - block_shape = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in block_shape] - assert all([isinstance(elem, int) and -2**31 <= elem < 2**31 for elem in block_shape]), \ - "Expected a list of constant integers (`int32_t` range) in `block_shape`" - - # Check `order` - if not hasattr(order, "__iter__"): - order = [order] - order = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in order] - assert sorted(order) == list(range(len(order))), "Expected a permutation of (0, 1, ..., len(order)-1) in order" - - # Must have same length - assert all([len(block_shape) == len(list_like) for list_like in [shape, strides, offsets, order]]), \ - "Expected shape/strides/offsets/block_shape to have the same length" - - # Build value, the type is: - # `pointer_type>` in Python - # `tt.ptr>` in MLIR - handle = builder.create_make_block_ptr(base.handle, shape, strides, offsets, block_shape, order) - return tl.tensor(handle, tl.pointer_type(tl.block_type(base.type.element_ty, block_shape))) - - -def advance(base: tl.tensor, offsets, builder: ir.builder) -> tl.tensor: - # Convert dynamic offsets to IR values - offsets = 
_convert_to_ir_values(builder, offsets, require_i64=False) - - # Advanced block pointer type is the same as before - return tl.tensor(builder.create_advance(base.handle, offsets), base.type) diff --git a/python/triton/language/standard.py b/python/triton/language/standard.py deleted file mode 100644 index b997674c91b6..000000000000 --- a/python/triton/language/standard.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -from ..runtime.jit import jit -from . import core - -# ----------------------- -# Standard library -# ----------------------- - - -@jit -def cdiv(x, div): - """ - Computes the ceiling division of :code:`x` by :code:`div` - - :param x: the input number - :type input: Block - :param div: the divisor - :param div: Block - """ - return (x + div - 1) // div - - -@jit -@core._add_math_1arg_docstr("sigmoid") -def sigmoid(x): - return 1 / (1 + core.exp(-x)) - - -@jit -@core._add_math_1arg_docstr("softmax") -def softmax(x, ieee_rounding=False): - z = x - core.max(x, 0) - num = core.exp(z) - den = core.sum(num, 0) - return core.fdiv(num, den, ieee_rounding) - - -@jit -def ravel(x): - """ - Returns a contiguous flattened view of :code:`x`. - - :param x: the input tensor - :type x: Block - """ - return core.view(x, [x.numel]) - - -@jit -def swizzle2d(i, j, size_i, size_j, size_g): - """ - Transforms indices of a row-major size_i*size_j matrix into those - of one where indices are row major for each group of size_j rows. - For example, for size_i = size_j = 4 and size_g = 2, it will transform - [[0 , 1 , 2 , 3 ], - [4 , 5 , 6 , 7 ], - [8 , 9 , 10, 11], - [12, 13, 14, 15]] - into - [[0, 2, 4 , 6 ], - [1, 3, 5 , 7 ], - [8, 10, 12, 14], - [9, 11, 13, 15]] - """ - # "unrolled index in array" - ij = i * size_j + j - # number of elements in `size_g` groups - # of `size_j` columns - size_gj = size_g * size_j - # index of the group in which (i,j) is - group_id = ij // size_gj - # row-index of the first element of this group - off_i = group_id * size_g - # last group may have fewer rows - size_g = core.minimum(size_i - off_i, size_g) - # new row and column indices - new_i = off_i + (ij % size_g) - new_j = (ij % size_gj) // size_g - return new_i, new_j - - -@jit -def zeros(shape, dtype): - """ - Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`. - - :param shape: Shape of the new array, e.g., (8, 16) or (8, ) - :type shape: tuple of ints - :param dtype: Data-type of the new array, e.g., :code:`tl.float16` - :type dtype: DType - """ - return core.full(shape, 0, dtype) - - -@jit -def zeros_like(input): - return zeros(input.shape, input.dtype) diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py deleted file mode 100644 index 6ceec8b56a00..000000000000 --- a/python/triton/ops/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# from .conv import _conv, conv -from . 
import blocksparse -from .cross_entropy import _cross_entropy, cross_entropy -from .flash_attention import attention -from .matmul import _matmul, matmul - -__all__ = [ - "blocksparse", - "_cross_entropy", - "cross_entropy", - "_matmul", - "matmul", - "attention", -] diff --git a/python/triton/ops/blocksparse/__init__.py b/python/triton/ops/blocksparse/__init__.py deleted file mode 100644 index 6b24b5377fab..000000000000 --- a/python/triton/ops/blocksparse/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .matmul import matmul -from .softmax import softmax - -__all__ = [ - "matmul", - "softmax", -] diff --git a/python/triton/ops/blocksparse/matmul.py b/python/triton/ops/blocksparse/matmul.py deleted file mode 100644 index c599af26a055..000000000000 --- a/python/triton/ops/blocksparse/matmul.py +++ /dev/null @@ -1,437 +0,0 @@ -import torch - -import triton -import triton.language as tl - -# ******************************************************** -# -------------------------------------------------------- -# Sparse = Dense x Dense (SDD) -# This operation uses super-blocking to make sure that -# it's done efficiently when small blocks can be grouped -# together -# -------------------------------------------------------- -# ******************************************************** - - -@triton.heuristics({ - 'EVEN_K': lambda nargs: nargs['K'] % nargs['TILE_K'] == 0, -}) -@triton.jit -def _sdd_kernel( - A, B, C, - stride_za, stride_ha, stride_ma, stride_ak, - stride_zb, stride_hb, stride_bk, stride_nb, - stride_zc, stride_hc, stride_mc, stride_nc, - K, grid_offset, lut, - TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, - BLOCK: tl.constexpr, EVEN_K: tl.constexpr -): - # ------------ # - # - Prologue - # - # ------------ # - block_id = tl.program_id(0) + grid_offset - lut += block_id * 3 - # offsets - off_z = tl.program_id(2) # batch - off_h = tl.load(lut + 0) # head - - # initialize pointers to A - start_am = tl.load(lut + 1) - offs_am = start_am * BLOCK + (tl.arange(0, TILE_M) % BLOCK) - offs_ak = tl.arange(0, TILE_K) - a_ptrs = A \ - + off_z * stride_za \ - + off_h * stride_ha \ - + offs_am[:, None] * stride_ma \ - + offs_ak[None, :] * stride_ak - # initialize pointers to B - start_bn = tl.load(lut + 2) - offs_bn = start_bn * BLOCK + (tl.arange(0, TILE_N) % BLOCK) - offs_bk = tl.arange(0, TILE_K) - b_ptrs = B \ - + off_z * stride_zb \ - + off_h * stride_hb \ - + offs_bn[None, :] * stride_nb \ - + offs_bk[:, None] * stride_bk - # ---------------- # - # Inner Loop # - # ---------------- # - acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) - for k in range(K, 0, -TILE_K): - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs) - else: - a = tl.load(a_ptrs, mask=offs_ak[None, :] < k, other=0.) - b = tl.load(b_ptrs, mask=offs_bk[:, None] < k, other=0.) 
- acc += tl.dot(a, b, out_dtype=tl.float32) - a_ptrs += TILE_K * stride_ak - b_ptrs += TILE_K * stride_bk - c = acc.to(C.dtype.element_ty) - # ---------------- # - # Epilogue # - # ---------------- # - offs_cm = tl.arange(0, TILE_M) % BLOCK - offs_cn = tl.arange(0, TILE_N) % BLOCK - pc = C \ - + off_z * stride_zc \ - + block_id * stride_hc \ - + offs_cm[:, None] * stride_mc \ - + offs_cn[None, :] * stride_nc - tl.store(pc, c, mask=True) - - -def sdd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, widths, out=None): - if a.stride(2) != 1 and a.stride(3) != 1: - a = a.contiguous() - if b.stride(2) != 1 and b.stride(3) != 1: - b = b.contiguous() - # (A * B)^T = B^T * A^T - if trans_c: - a, b = b, a - trans_a, trans_b = not trans_b, not trans_a - # shape constraints - a_dim = -2 if trans_a else -1 - b_dim = -1 if trans_b else -2 - Ka, Kb = a.shape[a_dim], b.shape[b_dim] - if Ka != Kb: - raise ValueError(f"Inner dimension mismatch (A: {Ka} vs B: {Kb})") - # allocate output - if out is None: - c = torch.empty((a.shape[0], lut.shape[0], block, block), dtype=a.dtype, device=a.device) - else: - assert out.shape == (a.shape[0], lut.shape[0], block, block) - c = out - grid = [c.shape[1], 1, c.shape[0]] - _sdd_kernel[grid]( - a, b, c, - a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), - b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), - c.stride(0), c.stride(1), c.stride(2), c.stride(3), - Ka, 0, lut, - TILE_M=block, TILE_N=block, TILE_K=32, BLOCK=block, num_stages=4, - num_warps=4, - ) - return c - - -def sdd_lut(layout, block, device): - lut = layout.nonzero(as_tuple=False).to(device).int() - lut = lut.contiguous() - return lut, None - -# ----------------------------- -# Dense = Sparse x Dense (DSD) -# This operation uses a look-up table that contains pre-computed pointer increments -# in order to minimize computations in the inner loop of the matmul kernel. 
-# ----------------------------- - - -@triton.jit -def _dsd_kernel( - A, B, C, - stride_az, stride_ha, stride_am, stride_ak, - stride_zb, stride_hb, stride_bk, stride_bn, - stride_zc, stride_hc, stride_cm, stride_cn, - DS0, DS1, lut, - TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, BLOCK: tl.constexpr -): - # ------------ # - # - Prologue - # - # ------------ # - pid_m = tl.program_id(0) - pid_n = tl.program_id(1) - num_pid_m = tl.num_programs(0) - num_pid_n = tl.num_programs(1) - pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_SIZE_M) - pidz = tl.program_id(2) - header = lut + pid_n * 4 - offset = tl.load(header + 0) - K = tl.load(header + 1) - column = tl.load(header + 2) - off_h = tl.load(header + 3) - pinc = lut + offset - # initialize pointers to A (sparse) - block_id = tl.load(pinc + 1) - block_id = tl.multiple_of(block_id, 8) # compiler hint - offs_am = tl.arange(0, TILE_M) - offs_ak = tl.arange(0, TILE_K) - pa = A + pidz * stride_az \ - + block_id * stride_ha \ - + offs_am[:, None] * stride_am \ - + offs_ak[None, :] * stride_ak - # initialize pointers to B (dense) - offs_bn = pid_m * TILE_N + tl.arange(0, TILE_N) - offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn % DS0, TILE_N), TILE_N) - start_bk = tl.load(pinc) - start_bk = tl.multiple_of(start_bk, 8) # compiler hint - offs_bk = start_bk + tl.arange(0, TILE_K) - pb = B + pidz * stride_zb \ - + off_h * stride_hb \ - + offs_bn[None, :] * stride_bn \ - + offs_bk[:, None] * stride_bk - # ---------------- # - # Inner Loop # - # ---------------- # - acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) - pinc += 2 - inc_a = tl.load(pinc + 1) - inc_a = tl.multiple_of(inc_a, 8) - inc_b = tl.load(pinc) - inc_b = tl.multiple_of(inc_b, 8) - for k in range(K, 0, -TILE_K): - a = tl.load(pa) - b = tl.load(pb) - acc += tl.dot(a, b, out_dtype=tl.float32) - pa += inc_a - pb += inc_b * stride_bk - pinc += 2 - inc_a = tl.load(pinc + 1) - inc_a = tl.multiple_of(inc_a, 8) - inc_b = tl.load(pinc) - inc_b = tl.multiple_of(inc_b, 8) - c = acc.to(C.dtype.element_ty) - # initialize pointers to C - offs_cm = column * TILE_M + tl.arange(0, TILE_M) - offs_cn = pid_m * TILE_N + tl.arange(0, TILE_N) - pc = C \ - + off_h * stride_hc \ - + pidz * stride_zc \ - + offs_cm[:, None] * stride_cm \ - + offs_cn[None, :] * stride_cn - tl.store(pc, c, mask=offs_cn[None, :] < DS0) - - -def dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): - if a.stride(2) != 1 and a.stride(3) != 1: - a = a.contiguous() - if b.stride(2) != 1 and b.stride(3) != 1: - b = b.contiguous() - # shapes / dtypes - AS1 = block * spdims[2 if trans_a else 1] - BS0 = b.size(0) - BS1 = b.size(1) - BS3 = b.size(2 if trans_b else 3) - dtype = a.dtype - # allocate output - CS0 = BS0 - CS1 = BS1 - CS2 = BS3 if trans_c else AS1 - CS3 = AS1 if trans_c else BS3 - if out is None: - c = torch.empty((CS0, CS1, CS2, CS3), dtype=dtype, device=a.device) - else: - assert out.shape == (CS0, CS1, CS2, CS3) - c = out - # meta-parameter heuristics - TILE_N = 128 - # compute output - grid = lambda meta: [triton.cdiv(BS3, meta['TILE_N']), width, BS0] - _dsd_kernel[grid]( - a, b, c, - a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), - b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), - c.stride(0), c.stride(1), c.stride(3 if trans_c else 2), c.stride(2 if trans_c else 3), - BS3, AS1, lut, - TILE_M=block, TILE_N=TILE_N, TILE_K=min(block, 32), 
BLOCK=block, num_stages=4, - num_warps=4, GROUP_SIZE_M=4, - ) - # exit() - return c - - -def dsd_lut(layout, block, step, trans, device): - """ - Generates the look-up table for incrementing pointers in the DSD/DDS matmul. - Example (BLOCK=32, STEP=16) - [[1, 0, 0, 1, 0], - [0, 1, 1, 0, 1], - [1, 0, 1, 0, 0]] - - Then the offsets for A are - [0 , 16, 32, 48] <- row 0 - \\----/ \\----/ - col=0 col=3 - [64, 80, 96, 112, 128, 144] <- row 1 - \\----/ \\----/ \\------/ - col=1 col=2 col=3 - [160, 176, 192, 208] - which leads to increments table - [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16] - - Because B is dense, the offsets are - [0, 16, 96, 112] <- row 0 - [32, 48, 64, 80] <- row 1 - [0, 16, 64, 80] <- row 2 - """ - sizes = torch.sum(layout, 2 if trans else 1) - head_id, col_id = torch.ones_like(sizes).nonzero(as_tuple=True) - sizes = sizes.flatten() - segments = sizes * step - # pointer increments - if trans: - nnz = layout.nonzero(as_tuple=False) - else: - nnz = layout.transpose(1, 2).nonzero(as_tuple=False) - num_blocks = nnz.size(0) - offsets = torch.zeros_like(sizes) - offsets[1:] = torch.cumsum(sizes[:-1], dim=0) - offsets = torch.min(offsets, (num_blocks - 1) * torch.ones_like(offsets)) - # ------------------------------- - # dense input pointer increments - # ------------------------------- - # Note that the inner loop matmul kernel may have a fixed step size (e.g., TILE_K) - # that is smaller than the block size, so we need to do a bit of extra work - # to handle this case - B_idx = nnz[:, 2] * block - B_incs = B_idx.clone() - B_incs[1:] -= B_idx[:-1] - div = block // step - B_incs = B_incs.view(-1, 1).repeat(1, div) - B_incs[:, 1:] = step - B_incs[:, 0] -= (div - 1) * step - # first increment for each reduction is actually the offset - B_incs[offsets[segments > 0], 0] = B_idx[offsets[segments > 0]] - B_incs = B_incs.view(-1) - # ------------------------------- - # sparse input pointer increments - # ------------------------------- - # same as above, except that the increments are in the sparse memory layout - if trans: - A_idx = torch.arange(num_blocks, device=layout.device) - else: - A_idx = torch.tensor([], dtype=torch.int64, device=layout.device) - current_offset = 0 - for z in range(layout.size(0)): - layoutw = layout[z, :, :].clone().long() - msum = layoutw.sum() - layoutw[layoutw > 0] = 1 + torch.arange(msum, device=layout.device) - A_idx = torch.cat((A_idx, current_offset + layoutw.T[layoutw.T > 0] - 1)) - current_offset += msum - A_incs = A_idx * block * block - A_incs[1:] -= A_idx[:-1] * block * block - A_incs = A_incs.view(-1, 1).repeat(1, div) - if trans: - A_incs[:, 1:] = step - A_incs[:, 0] -= (div - 1) * step - else: - A_incs[:, 1:] = step * block - A_incs[:, 0] -= (div - 1) * step * block - A_incs[offsets[segments > 0], 0] = A_idx[offsets[segments > 0]] - A_incs = A_incs.view(-1) - # create header - width = col_id.size(0) - offsets = offsets * 2 * div + 4 * width - segments = segments * div - header = torch.stack((offsets, segments, col_id, head_id), dim=1).view(-1).contiguous() - # create increments - incs = torch.stack((B_incs, A_incs), dim=1).view(-1).contiguous() - # pad by a factor 2*MAX_NUM_STAGES - # to accommodate pre-fetching inside the kernel - pad = torch.zeros(20, device=incs.device, dtype=incs.dtype) - incs = torch.cat((incs, pad)) - # create lut - lut = torch.cat((header, incs)) - lut = lut.type(torch.int32).to(device) - # create locks - return lut, width - -# ----------------------------- -# Dense = Dense x Sparse (DDS) -# 
----------------------------- -# AB = (B^T A^T)^T - - -def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): - return dsd_matmul(b, a, not trans_b, not trans_a, not trans_c, spdims, block, lut, width, out=out) - -############## -# MAIN API # -############## - - -class _matmul(torch.autograd.Function): - - fn = {'sdd': sdd_matmul, 'dsd': dsd_matmul, 'dds': dds_matmul} - - @staticmethod - def forward( - ctx, a, b, trans_a, trans_b, trans_c, mode, spdims, block, - c_lut, c_width, da_lut, da_width, db_lut, db_width, out - ): - c = _matmul.fn[mode](a, b, trans_a, trans_b, trans_c, spdims, block, c_lut, c_width, out=out) - # save for backward - ctx.save_for_backward(a, b) - ctx.da_lut = da_lut - ctx.da_width = da_width - ctx.db_lut = db_lut - ctx.db_width = db_width - ctx.mode = mode - ctx.spdims = spdims - ctx.block = block - ctx.trans_a = trans_a - ctx.trans_b = trans_b - ctx.trans_c = trans_c - ctx.has_out = out is not None - return c - - @staticmethod - def backward(ctx, dc): - # saved for backward - a, b = ctx.saved_tensors - da, db = None, None - mode = ctx.mode - # gradients w.r.t. a - if ctx.needs_input_grad[0]: - mode_da = mode[1] + mode[0] + mode[2] - da = _matmul.fn[mode_da]( - dc, b, ctx.trans_c, not ctx.trans_b, ctx.trans_a, ctx.spdims, ctx.block, ctx.da_lut, ctx.da_width, - ) - # gradients w.r.t. b - if ctx.needs_input_grad[1]: - mode_db = mode[2] + mode[1] + mode[0] - db = _matmul.fn[mode_db]( - a, dc, not ctx.trans_a, ctx.trans_c, ctx.trans_b, ctx.spdims, ctx.block, ctx.db_lut, ctx.db_width, - ) - dout = dc if ctx.has_out else None - return da, db, None, None, None,\ - None, None, None, None,\ - None, None, None, None, None, dout - - -class matmul: - - def __init__(self, layout, block, mode, device, trans_a=False, trans_b=False, trans_c=False): - if mode not in ['sdd', 'dsd', 'dds']: - raise NotImplementedError('Supported modes are: sdd, dsd, dds') - self.block = block - self.mode = mode - self.trans_a = trans_a - self.trans_b = trans_b - self.trans_c = trans_c - self.layout = layout - self.spdims = layout.shape - step = min(block, 32) - if self.mode == 'sdd': - self.c_lut, self.c_width = sdd_lut(layout, block, device) - self.da_lut, self.da_width = dsd_lut(layout, block, step, True, device) - self.db_lut, self.db_width = dsd_lut(layout, block, step, False, device) - if self.mode == 'dsd': - self.c_lut, self.c_width = dsd_lut(layout, block, step, not self.trans_a, device) - self.da_lut, self.da_width = sdd_lut(layout, block, device) - self.db_lut, self.db_width = dsd_lut(layout, block, step, self.trans_a, device) - if self.mode == 'dds': - self.c_lut, self.c_width = dsd_lut(layout, block, step, self.trans_b, device) - self.da_lut, self.da_width = dsd_lut(layout, block, step, not self.trans_b, device) - self.db_lut, self.db_width = sdd_lut(layout, block, device) - - def __call__(self, a, b, out=None): - c = _matmul.apply( - a, b, self.trans_a, self.trans_b, self.trans_c, self.mode, self.spdims, self.block, - self.c_lut, self.c_width, - self.da_lut, self.da_width, - self.db_lut, self.db_width, - out - ) - return c diff --git a/python/triton/ops/blocksparse/softmax.py b/python/triton/ops/blocksparse/softmax.py deleted file mode 100644 index ac2b7c3eb985..000000000000 --- a/python/triton/ops/blocksparse/softmax.py +++ /dev/null @@ -1,239 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def num_warps(n): - if n <= 128: - return 1 - if n <= 256: - return 2 - if n <= 512: - return 4 - if n <= 4096: - return 8 - return 16 - - 
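# A minimal usage sketch of the block-sparse `softmax` wrapper whose implementation is
# removed below in this file (make_lut plus the _blocksparse_softmax_fwd/_bwd kernels).
# It is included only for reference while reading the removal; the layout shape, block
# size, batch size, and dtype are illustrative assumptions, not values taken from this diff.
import torch
from triton.ops import blocksparse

# (heads, block-rows, block-cols) sparsity mask; any 0/1 integer layout works
layout = torch.randint(0, 2, (4, 8, 8), dtype=torch.int64)
sparse_softmax = blocksparse.softmax(layout, block=16, device="cuda")
# block-sparse logits laid out as (batch, nnz_blocks, block, block),
# i.e. the same block layout the SDD matmul above produces
a = torch.randn(2, int(layout.sum()), 16, 16, device="cuda", dtype=torch.float16)
y = sparse_softmax(a, scale=1.0, is_causal=True)  # same shape and layout as `a`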
-@triton.jit -def _blocksparse_softmax_fwd( - Out, A, stride_xz, LUT, - R, extent, stride_zr, stride_hr, # relative attention - scale, is_causal, - ROW_SIZE: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - IS_DENSE: tl.constexpr, -): - h = tl.program_id(0) - m = tl.program_id(1) - z = tl.program_id(2) - # create index ranges - hm = h * tl.num_programs(1) + m - lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE - block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE - # extract information from LUT - header = LUT + (hm // BLOCK_SIZE) * 2 - size = tl.load(header + 0) - offset = tl.load(header + 1) - # pointer offset - off_a = z * stride_xz - off_a += (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE # block indx - off_a += (m % BLOCK_SIZE) * BLOCK_SIZE # row indx - # do not need to read column indices in the dense case - if IS_DENSE: - ns = tl.arange(0, ROW_SIZE) - else: - off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE - start_n = tl.load(LUT + off_lut + block_n, mask=block_n < size, other=0) - ns = start_n * BLOCK_SIZE + lane_n - # load X - mask = block_n < size - a = tl.load(A + off_a + lane_n, mask=mask, other=-float("inf")) - a = a.to(tl.float32) - # compute - out = a - out *= scale - # apply relative attention - if R is not None: - R += z * stride_zr - R += h * stride_hr - off_lo = (extent - m - 1) + ns - mask_lo = (off_lo >= 0) & (off_lo < extent) - rel_logits = tl.load(R + m * extent + off_lo, mask=mask_lo, other=0.0) - out += rel_logits - out = out.to(tl.float32) - # apply causal mask - out = tl.where((ns > m) & is_causal, -float("inf"), out) - # computation - out = tl.softmax(out) - # write-back - tl.store(Out + off_a + lane_n, out, mask=mask) - - -@triton.jit -def _blocksparse_softmax_bwd( - DA, stride_zdx, - DOut, stride_zdout, - Out, stride_zout, - scale, - LUT, - DR, extent, stride_zr, stride_hr, stride_er, - is_causal, - ROW_SIZE: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - IS_DENSE: tl.constexpr, -): - h = tl.program_id(0) - m = tl.program_id(1) - z = tl.program_id(2) - # create index ranges - hm = h * tl.num_programs(1) + m - lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE - block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE - # extract information from LUT - header = LUT + (hm // BLOCK_SIZE) * 2 - size = tl.load(header + 0) - offset = tl.load(header + 1) - # row-col offset - off_mn = (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE - off_mn += (m % BLOCK_SIZE) * BLOCK_SIZE - mask = block_n < size - # pointers - As = Out + z * stride_zout + off_mn - DOuts = DOut + z * stride_zdout + off_mn - # do not need to read column indices in the dense case - if IS_DENSE: - ns = tl.arange(0, ROW_SIZE) - else: - off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE - start_n = tl.load(LUT + off_lut + block_n, mask=mask, other=0) - ns = start_n * BLOCK_SIZE + lane_n - # load data - a = tl.load(As + lane_n, mask=mask, other=0.0) - a = a.to(tl.float32) - dout = tl.load(DOuts + lane_n, mask=mask, other=0.0) - dout = dout.to(tl.float32) - # compute - a = tl.where((ns > m) & is_causal & (a == a), 0., a) - da = a * (dout - tl.sum(a * dout, 0)) - # apply relative attention - if DR is not None: - DR += z * stride_zr - DR += h * stride_hr - off_lo = (extent - m - 1) + ns - mask_lo = (off_lo >= 0) & (off_lo < extent) & mask - tl.store(DR + m * extent + off_lo, da, mask=mask_lo) - da = da * scale - # convert da - # write-back - DAs = DA + z * stride_zdx + off_mn - tl.store(DAs + lane_n, da, mask=mask) - - -class _softmax(torch.autograd.Function): - @staticmethod - def 
make_lut(layout, block, device): - _empty = torch.tensor([], dtype=torch.int64, device=layout.device) - sizes = _empty.clone() - # sizes along rows - for h in range(layout.shape[0]): - sizes = torch.cat((sizes, layout[h, :, :].sum(-1))) - total_sizes = sizes * block - # offsets in block format - offsets = torch.zeros_like(sizes) - offsets[1:] = torch.cumsum(sizes[:-1], dim=0) - # block indices - columns = layout.nonzero(as_tuple=False)[:, 2] - header = torch.stack((sizes, offsets), dim=1).view(-1) - lut = torch.cat((header, columns)).type(torch.int32).to(device) - return lut, int(total_sizes.max()) - - @staticmethod - def forward( - ctx, a, scale, rel_logits, is_causal, - spdims, block, lut, maxlut, is_dense - ): - if scale is not None and isinstance(scale, torch.Tensor): - assert scale.device.type == "cpu" - scale = scale.item() - M = a.shape[0] - grid = [spdims[0], spdims[1] * block, M] - rel_shape = (1, 1, 1, 1) if rel_logits is None else rel_logits.shape - rel_strides = (1, 1, 1, 1) if rel_logits is None else rel_logits.stride() - # enqueue kernel - out = torch.empty_like(a) - _blocksparse_softmax_fwd[grid]( - out, a, a.stride(0), lut, - rel_logits, rel_shape[-1], rel_strides[0], rel_strides[1], # relative attn - scale, - is_causal, - BLOCK_SIZE=block, - ROW_SIZE=triton.next_power_of_2(maxlut), - IS_DENSE=is_dense, - num_warps=num_warps(maxlut) - ) - # save to context - # ctx.mark_dirty(x) - ctx.save_for_backward(out, lut) - ctx.spdims = spdims - ctx.block = block - ctx.maxlut = maxlut - ctx.scale = scale - ctx.rel_shape = rel_shape - ctx.rel_strides = rel_strides - ctx.rel_dtype = a.dtype - ctx.is_dense = is_dense - ctx.is_causal = is_causal - return out - - @staticmethod - def backward(ctx, dout): - # retrieve from context - out, lut = ctx.saved_tensors - # relative logits gradients - dr = None - if ctx.needs_input_grad[3]: - dr = torch.zeros(ctx.rel_shape, dtype=ctx.rel_dtype, device=out.device) - # run kernel - M = out.shape[0] - grid = (ctx.spdims[0], ctx.spdims[1] * ctx.block, M) - da = torch.empty_like(dout) - _blocksparse_softmax_bwd[grid]( - da, da.stride(0), - dout, dout.stride(0), - out, out.stride(0), - ctx.scale, - lut, - dr, ctx.rel_shape[-1], ctx.rel_strides[0], ctx.rel_strides[1], ctx.rel_strides[2], - ctx.is_causal, - BLOCK_SIZE=ctx.block, - ROW_SIZE=triton.next_power_of_2(ctx.maxlut), - IS_DENSE=ctx.is_dense, - num_warps=num_warps(ctx.maxlut) - ) - return (da, None, None, dr, None, - None, None, None, None, None, - None, - None, None, None, - None, - None, None, None - ) - - -class softmax: - def __init__(self, layout, block, device, is_dense=False): - self.spdims = layout.shape - self.layout = layout - self.block = block - self.lut, self.maxlut = _softmax.make_lut(self.layout, self.block, device) - self.is_dense = is_dense - - def __call__(self, a, *, scale=1.0, rel_logits=None, is_causal=False): - if rel_logits is not None and rel_logits.dtype != a.dtype: - raise ValueError(f"relative position embedding must be {a.dtype}") - a = _softmax.apply( - a, scale, rel_logits, is_causal, - self.spdims, self.block, self.lut, self.maxlut, self.is_dense, - ) - return a diff --git a/python/triton/ops/cross_entropy.py b/python/triton/ops/cross_entropy.py deleted file mode 100644 index f66cddf37d21..000000000000 --- a/python/triton/ops/cross_entropy.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def num_warps(N): - if N < 2048: - return 4 - elif N < 8192: - return 8 - return 16 - - -@triton.heuristics({'num_warps': lambda nargs: 
num_warps(nargs['N'])}) -@triton.heuristics({'BLOCK': lambda nargs: triton.next_power_of_2(nargs['N'])}) -@triton.jit -def _forward(LOGITS, PROBS, IDX, LOSS, N, BLOCK: tl.constexpr): - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - idx = tl.load(IDX + row) - # pointers to logit and probs - LOGITS = LOGITS + row * N + cols - WRIT_PROBS = PROBS + row * N + cols - READ_PROBS = PROBS + row * N + idx - # write-back negative log-probs - logits = tl.load(LOGITS, mask=cols < N, other=-float('inf')) - logits = logits.to(tl.float32) - logits = logits - tl.max(logits, 0) - probs = tl.log(tl.sum(tl.exp(logits), 0)) - logits - tl.store(WRIT_PROBS, probs, mask=cols < N) - # There is a bug in the compiler, which fails to insert a barrier here. - # We add it explicitly for now. Will be fixed soon. - tl.debug_barrier() - # write-back loss - probs = tl.load(READ_PROBS) - tl.store(LOSS + row, probs) - - -@triton.heuristics({'num_warps': lambda nargs: num_warps(nargs['N'])}) -@triton.heuristics({'BLOCK': lambda nargs: triton.next_power_of_2(nargs['N'])}) -@triton.jit -def _backward(PROBS, IDX, DPROBS, N, BLOCK: tl.constexpr): - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - idx = tl.load(IDX + row) - # pointers to probs - PROBS = PROBS + row * N + cols - # We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - # and we have -log(p[k]) stored in PROBS, so this is easy - probs = -tl.load(PROBS, mask=cols < N, other=float('inf')) - probs = tl.exp(probs.to(tl.float32)) - delta = cols == idx - # write result in-place in PROBS - dout = tl.load(DPROBS + row) - din = (probs - delta) * dout - tl.store(PROBS, din.to(PROBS.dtype.element_ty), mask=cols < N) - - -class _cross_entropy(torch.autograd.Function): - @classmethod - def forward(cls, ctx, logits, indices): - # make sure we can use triton - assert (indices.dtype == torch.int64), "Indices are expected to be of type long." - # make kernel - device, dtype = logits.device, logits.dtype - n_cols = logits.shape[-1] - # run the kernel - result = torch.empty_like(indices, dtype=dtype, device=device) - neg_logprobs = torch.empty_like(logits, dtype=dtype, device=device) - grid = lambda opt: (logits.numel() // n_cols, ) - _forward[grid](logits, neg_logprobs, indices, result, n_cols) - # save for backward - ctx.save_for_backward(neg_logprobs, indices) - return result - - @classmethod - def backward(cls, ctx, dneg_logprobs): - """We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - so we initialize the gradient as neg_logprobs, so we can just exponentiate - to get p[k], which is most of what we need... 
neg_logprobs will be - modified in place to become the gradient we want - """ - # load saved tensors - neg_logprobs, indices = ctx.saved_tensors - # run the kernel - # neg_logprobs will be modified in place to become our gradient: - n_cols = neg_logprobs.shape[-1] - grid = lambda opt: (neg_logprobs.numel() // n_cols, ) - _backward[grid](neg_logprobs, indices, dneg_logprobs, n_cols) - return neg_logprobs, None - - -cross_entropy = _cross_entropy.apply diff --git a/python/triton/ops/flash_attention.py b/python/triton/ops/flash_attention.py deleted file mode 100644 index 33c0da791fb7..000000000000 --- a/python/triton/ops/flash_attention.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -Fused Attention -=============== -This is a Triton implementation of the Flash Attention algorithm -(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) -""" - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _fwd_kernel( - Q, K, V, sm_scale, - L, M, - Out, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - stride_oz, stride_oh, stride_om, stride_on, - Z, H, N_CTX, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hz = tl.program_id(1) - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk - off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk - off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk - # Initialize pointers to Q, K, V - q_ptrs = Q + off_q - k_ptrs = K + off_k - v_ptrs = V + off_v - # initialize pointer to m and l - m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # load q: it will stay in SRAM throughout - q = tl.load(q_ptrs) - # loop over k, v and update accumulator - for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): - # -- compute qk ---- - k = tl.load(k_ptrs) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) - # compute new m - m_curr = tl.maximum(tl.max(qk, 1), m_prev) - # correct old l - l_prev *= tl.exp(m_prev - m_curr) - # attention weights - p = tl.exp(qk - m_curr[:, None]) - l_curr = tl.sum(p, 1) + l_prev - # rescale operands of matmuls - l_rcp = 1. 
/ l_curr - p *= l_rcp[:, None] - acc *= (l_prev * l_rcp)[:, None] - # update acc - p = p.to(Q.dtype.element_ty) - v = tl.load(v_ptrs) - acc += tl.dot(p, v) - # update m_i and l_i - l_prev = l_curr - m_prev = m_curr - # update pointers - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - # rematerialize offsets to save registers - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # write back l and m - l_ptrs = L + off_hz * N_CTX + offs_m - m_ptrs = M + off_hz * N_CTX + offs_m - tl.store(l_ptrs, l_prev) - tl.store(m_ptrs, m_prev) - # initialize pointers to output - offs_n = tl.arange(0, BLOCK_DMODEL) - off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on - out_ptrs = Out + off_o - tl.store(out_ptrs, acc) - - -@triton.jit -def _bwd_preprocess( - Out, DO, L, - NewDO, Delta, - BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, -): - off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) - off_n = tl.arange(0, D_HEAD) - # load - o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - denom = tl.load(L + off_m).to(tl.float32) - # compute - do = do / denom[:, None] - delta = tl.sum(o * do, axis=1) - # write-back - tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) - tl.store(Delta + off_m, delta) - - -@triton.jit -def _bwd_kernel( - Q, K, V, sm_scale, Out, DO, - DQ, DK, DV, - L, M, - D, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - Z, H, N_CTX, - num_block, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - off_hz = tl.program_id(0) - off_z = off_hz // H - off_h = off_hz % H - # offset pointers for batch/head - Q += off_z * stride_qz + off_h * stride_qh - K += off_z * stride_qz + off_h * stride_qh - V += off_z * stride_qz + off_h * stride_qh - DO += off_z * stride_qz + off_h * stride_qh - DQ += off_z * stride_qz + off_h * stride_qh - DK += off_z * stride_qz + off_h * stride_qh - DV += off_z * stride_qz + off_h * stride_qh - for start_n in range(0, num_block): - lo = start_n * BLOCK_M - # initialize row/col offsets - offs_qm = lo + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = tl.arange(0, BLOCK_N) - offs_k = tl.arange(0, BLOCK_DMODEL) - # initialize pointers to value-like data - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - # pointer to row-wise quantities in value-like data - D_ptrs = D + off_hz * N_CTX - m_ptrs = M + off_hz * N_CTX - # initialize dv amd dk - dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # k and v stay in SRAM throughout - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - # loop over rows - for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): - offs_m_curr = start_m + offs_m - # load q, k, v, do on-chip - q = tl.load(q_ptrs) - # recompute p = softmax(qk, dim=-1).T - # NOTE: `do` is pre-divided by `l`; no normalization here - qk = tl.dot(q, tl.trans(k)) - qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) - m = 
tl.load(m_ptrs + offs_m_curr) - p = tl.exp(qk * sm_scale - m[:, None]) - # compute dv - do = tl.load(do_ptrs) - dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do) - # compute dp = dot(v, do) - Di = tl.load(D_ptrs + offs_m_curr) - dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] - dp += tl.dot(do, tl.trans(v)) - # compute ds = p * (dp - delta[:, None]) - ds = p * dp * sm_scale - # compute dk = dot(ds.T, q) - dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q) - # compute dq - dq = tl.load(dq_ptrs) - dq += tl.dot(ds.to(Q.dtype.element_ty), k) - tl.store(dq_ptrs, dq) - # increment pointers - dq_ptrs += BLOCK_M * stride_qm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_qm - # write-back - dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - - -class _attention(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, k, v, sm_scale): - # only support for Ampere now - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - raise RuntimeError("Flash attention currently only supported for compute capability >= 80") - BLOCK = 128 - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - # assert Lk in {16, 32, 64, 128} - assert Lk in {64} # TODO: fix other cases - o = torch.empty_like(q) - grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) - L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - num_warps = 4 if Lk <= 64 else 8 - - _fwd_kernel[grid]( - q, k, v, sm_scale, - L, m, - o, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - o.stride(0), o.stride(1), o.stride(2), o.stride(3), - q.shape[0], q.shape[1], q.shape[2], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=Lk, num_warps=num_warps, - num_stages=2, - ) - - ctx.save_for_backward(q, k, v, o, L, m) - ctx.grid = grid - ctx.sm_scale = sm_scale - ctx.BLOCK_DMODEL = Lk - return o - - @staticmethod - def backward(ctx, do): - BLOCK = 128 - q, k, v, o, l, m = ctx.saved_tensors - do = do.contiguous() - dq = torch.zeros_like(q, dtype=torch.float32) - dk = torch.empty_like(k) - dv = torch.empty_like(v) - do_scaled = torch.empty_like(do) - delta = torch.empty_like(l) - _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( - o, do, l, - do_scaled, delta, - BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, - ) - _bwd_kernel[(ctx.grid[1],)]( - q, k, v, ctx.sm_scale, - o, do_scaled, - dq, dk, dv, - l, m, - delta, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - q.shape[0], q.shape[1], q.shape[2], - ctx.grid[0], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, - num_stages=1, - ) - return dq, dk, dv, None - - -attention = _attention.apply diff --git a/python/triton/ops/matmul.py b/python/triton/ops/matmul.py deleted file mode 100644 index 688186fef16f..000000000000 --- a/python/triton/ops/matmul.py +++ /dev/null @@ -1,163 +0,0 @@ -import torch - -import triton -import triton.language as tl -from .matmul_perf_model import early_config_prune, estimate_matmul_time - - -def init_to_zero(name): - return lambda nargs: 
nargs[name].zero_() - - -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _kernel(A, B, C, M, N, K, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - dot_out_dtype: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * 
GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=dot_out_dtype) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b, out_dtype=dot_out_dtype) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - acc = acc.to(C.dtype.element_ty) - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -class _matmul(torch.autograd.Function): - kernel = _kernel - - _locks = {} - - @staticmethod - def _call(a, b, dot_out_dtype): - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=a.dtype) - if dot_out_dtype is None: - if a.dtype in [torch.float16, torch.float32, torch.bfloat16]: - dot_out_dtype = tl.float32 - else: - dot_out_dtype = tl.int32 - else: - assert isinstance(dot_out_dtype, torch.dtype), "dot_out_dtype must be a torch.dtype" - if dot_out_dtype == torch.float16: - dot_out_dtype = tl.float16 - elif dot_out_dtype in [torch.float32, torch.bfloat16]: - dot_out_dtype = tl.float32 - else: - dot_out_dtype = tl.int32 - # launch kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel[grid](a, b, c, M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - dot_out_dtype=dot_out_dtype, - GROUP_M=8) - return c - - @staticmethod - def forward(ctx, a, b, dot_out_dtype=None): - return _matmul._call(a, b, dot_out_dtype=dot_out_dtype) - - -matmul = _matmul.apply diff --git a/python/triton/ops/matmul_perf_model.py b/python/triton/ops/matmul_perf_model.py deleted file mode 100644 index 740426b13e34..000000000000 --- a/python/triton/ops/matmul_perf_model.py +++ /dev/null @@ -1,158 +0,0 @@ -import heapq - -import torch - -import triton -import triton._C.libtriton.triton as _triton -from triton.runtime import driver -from triton.testing import get_dram_gbps, get_max_simd_tflops, get_max_tensorcore_tflops - - -def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype): - ''' return compute throughput in TOPS ''' - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = min(num_subcores, total_warps) / 
num_subcores * get_max_tensorcore_tflops(dtype, backend, device) - return tflops - - -def get_simd_tflops(backend, device, num_ctas, num_warps, dtype): - ''' return compute throughput in TOPS ''' - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, backend, device) - return tflops - - -def get_tflops(backend, device, num_ctas, num_warps, dtype): - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8 and dtype == torch.float32: - return get_simd_tflops(backend, device, num_ctas, num_warps, dtype) - return get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype) - - -def estimate_matmul_time( - # backend, device, - num_warps, num_stages, - A, B, C, - M, N, K, - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, - debug=False, **kwargs -): - ''' return estimated running time in ms - = max(compute, loading) + store ''' - backend = _triton.runtime.backend.CUDA - device = torch.cuda.current_device() - dtype = A.dtype - dtsize = A.element_size() - - num_cta_m = triton.cdiv(M, BLOCK_M) - num_cta_n = triton.cdiv(N, BLOCK_N) - num_cta_k = SPLIT_K - num_ctas = num_cta_m * num_cta_n * num_cta_k - - # If the input is smaller than the block size - M, N = max(M, BLOCK_M), max(N, BLOCK_N) - - # time to compute - total_ops = 2 * M * N * K / (1024 * 1024 * 1024) # GOPS - tput = get_tflops(backend, device, num_ctas, num_warps, dtype) - compute_ms = total_ops / tput - - # time to load data - num_sm = driver.utils.get_device_properties(device)["multiprocessor_count"] - active_cta_ratio = min(1, num_ctas / num_sm) - active_cta_ratio_bw1 = min(1, num_ctas / 32) # 32 active ctas are enough to saturate - active_cta_ratio_bw2 = max(min(1, (num_ctas - 32) / (108 - 32)), 0) # 32-108, remaining 5% - dram_bw = get_dram_gbps(backend, device) * (active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05) # in GB/s - l2_bw = dram_bw * 4 # rough estimation (should be 4.7 for A100?) - # assume 80% of (following) loads are in L2 cache - load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1)) - load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1) - load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1)) - load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1) - # total - total_dram = (load_a_dram + load_b_dram) / (1024 * 1024) # MB - total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024) - # loading time in ms - load_ms = total_dram / dram_bw + total_l2 / l2_bw - - # estimate storing time - store_bw = dram_bw * 0.6 # :o - store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024) # MB - if SPLIT_K == 1: - store_ms = store_c_dram / store_bw - else: - reduce_bw = store_bw - store_ms = store_c_dram / reduce_bw - # c.zero_() - zero_ms = M * N * 2 / (1024 * 1024) / store_bw - store_ms += zero_ms - - total_time_ms = max(compute_ms, load_ms) + store_ms - if debug: - print(f'Total time: {total_time_ms}ms, compute time: {compute_ms}ms, ' - f'loading time: {load_ms}ms, store time: {store_ms}ms, ' - f'Activate CTAs: {active_cta_ratio*100}%') - return total_time_ms - - -def early_config_prune(configs, named_args): - device = torch.cuda.current_device() - capability = torch.cuda.get_device_capability() - # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages - dtsize = named_args['A'].element_size() - dtype = named_args['A'].dtype - - # 1. 
make sure we have enough smem - pruned_configs = [] - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, num_stages = \ - kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], config.num_stages - - max_shared_memory = driver.utils.get_device_properties(device)["max_shared_mem"] - required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize - if required_shared_memory <= max_shared_memory: - pruned_configs.append(config) - configs = pruned_configs - - # Some dtypes do not allow atomic_add - if dtype not in [torch.float16, torch.float32]: - configs = [config for config in configs if config.kwargs['SPLIT_K'] == 1] - - # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps) - configs_map = {} - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = \ - kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], kw['SPLIT_K'], config.num_warps, config.num_stages - - key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps) - if key in configs_map: - configs_map[key].append((config, num_stages)) - else: - configs_map[key] = [(config, num_stages)] - - pruned_configs = [] - for k, v in configs_map.items(): - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k - if capability[0] >= 8: - # compute cycles (only works for ampere GPUs) - mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16) - mma_cycles = mmas / min(4, num_warps) * 8 - - ldgsts_latency = 300 # Does this matter? - optimal_num_stages = ldgsts_latency / mma_cycles - - # nearest stages, prefer large #stages - nearest = heapq.nsmallest(2, v, key=lambda x: 10 + abs(x[1] - optimal_num_stages) - if (x[1] - optimal_num_stages) < 0 else x[1] - optimal_num_stages) - - for n in nearest: - pruned_configs.append(n[0]) - else: # Volta & Turing only supports num_stages <= 2 - random_config = v[0][0] - random_config.num_stages = 2 - pruned_configs.append(random_config) - return pruned_configs diff --git a/python/triton/runtime/__init__.py b/python/triton/runtime/__init__.py deleted file mode 100644 index a4291ab31c8e..000000000000 --- a/python/triton/runtime/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from .autotuner import (Autotuner, Config, Heuristics, OutOfResources, autotune, - heuristics) -from .driver import driver -from .jit import (JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret, - version_key) - -__all__ = [ - "driver", - "Config", - "Heuristics", - "autotune", - "heuristics", - "JITFunction", - "KernelInterface", - "version_key", - "reinterpret", - "TensorWrapper", - "OutOfResources", - "MockTensor", - "Autotuner", -] diff --git a/python/triton/runtime/autotuner.py b/python/triton/runtime/autotuner.py deleted file mode 100644 index 3cb9f9dbe862..000000000000 --- a/python/triton/runtime/autotuner.py +++ /dev/null @@ -1,244 +0,0 @@ -from __future__ import annotations - -import builtins -import time -from typing import Dict - -from ..testing import do_bench -from .jit import KernelInterface - - -class OutOfResources(Exception): - def __init__(self, required, limit, name): - self.message = f'out of resource: {name}, '\ - f'Required: {required}, '\ - f'Hardware limit: {limit}' - self.message += '. Reducing block sizes or `num_stages` may help.' 
- self.required = required - self.limit = limit - self.name = name - super().__init__(self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return (type(self), (self.required, self.limit, self.name)) - - -class Autotuner(KernelInterface): - def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None): - ''' - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs. - ''' - if not configs: - self.configs = [Config({}, num_warps=4, num_stages=2)] - else: - self.configs = configs - self.key_idx = [arg_names.index(k) for k in key] - self.cache = {} - # hook to reset all required tensor to zeros before relaunching a kernel - self.hook = lambda args: 0 - if reset_to_zero is not None: - self.reset_idx = [arg_names.index(k) for k in reset_to_zero] - - def _hook(args): - for i in self.reset_idx: - args[i].zero_() - self.hook = _hook - self.arg_names = arg_names - # prune configs - if prune_configs_by: - perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k'] - if 'early_config_prune' in prune_configs_by: - early_config_prune = prune_configs_by['early_config_prune'] - else: - perf_model, top_k, early_config_prune = None, None, None - self.perf_model, self.configs_top_k = perf_model, top_k - self.early_config_prune = early_config_prune - self.fn = fn - - def _bench(self, *args, config, **meta): - # check for conflicts, i.e. meta-parameters both provided - # as kwargs and by the autotuner - conflicts = meta.keys() & config.kwargs.keys() - if conflicts: - raise ValueError( - f"Conflicting meta-parameters: {', '.join(conflicts)}." - " Make sure that you don't re-define auto-tuned symbols." 
- ) - # augment meta-parameters with tunable ones - current = dict(meta, **config.kwargs) - - def kernel_call(): - if config.pre_hook: - config.pre_hook(self.nargs) - self.hook(args) - self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current) - try: - return do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8)) - except OutOfResources: - return [float('inf'), float('inf'), float('inf')] - - def run(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - if len(self.configs) > 1: - all_args = {**self.nargs, **kwargs} - _args = [] - for name in self.arg_names: - if name in all_args: - _args.append(all_args[name]) - key = tuple(_args[i] for i in self.key_idx) - if key not in self.cache: - # prune configs - pruned_configs = self.prune_configs(kwargs) - bench_start = time.time() - timings = {config: self._bench(*args, config=config, **kwargs) - for config in pruned_configs} - bench_end = time.time() - self.bench_time = bench_end - bench_start - self.cache[key] = builtins.min(timings, key=timings.get) - self.hook(args) - self.configs_timings = timings - config = self.cache[key] - else: - config = self.configs[0] - self.best_config = config - if config.pre_hook is not None: - config.pre_hook(self.nargs) - return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs) - - def prune_configs(self, kwargs): - pruned_configs = self.configs - if self.early_config_prune: - pruned_configs = self.early_config_prune(self.configs, self.nargs) - if self.perf_model: - top_k = self.configs_top_k - if isinstance(top_k, float) and top_k <= 1.0: - top_k = int(len(self.configs) * top_k) - if len(pruned_configs) > top_k: - est_timing = { - config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages, - num_warps=config.num_warps) - for config in pruned_configs - } - pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] - return pruned_configs - - def warmup(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - for config in self.prune_configs(kwargs): - self.fn.warmup( - *args, - num_warps=config.num_warps, - num_stages=config.num_stages, - **kwargs, - **config.kwargs, - ) - self.nargs = None - - -class Config: - """ - An object that represents a possible kernel configuration for the auto-tuner to try. - - :ivar meta: a dictionary of meta-parameters to pass to the kernel as keyword arguments. - :type meta: dict[Str, Any] - :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. For example, if - `num_warps=8`, then each kernel instance will be automatically parallelized to - cooperatively execute using `8 * 32 = 256` threads. - :type num_warps: int - :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops. - Mostly useful for matrix multiplication workloads on SM80+ GPUs. - :type num_stages: int - :ivar pre_hook: a function that will be called before the kernel is called. Parameters of this - function are args. 
- """ - - def __init__(self, kwargs, num_warps=4, num_stages=2, pre_hook=None): - self.kwargs = kwargs - self.num_warps = num_warps - self.num_stages = num_stages - self.pre_hook = pre_hook - - def __str__(self): - res = [] - for k, v in self.kwargs.items(): - res.append(f'{k}: {v}') - res.append(f'num_warps: {self.num_warps}') - res.append(f'num_stages: {self.num_stages}') - return ', '.join(res) - - -def autotune(configs, key, prune_configs_by=None, reset_to_zero=None): - """ - Decorator for auto-tuning a :code:`triton.jit`'d function. - - .. highlight:: python - .. code-block:: python - - @triton.autotune(configs=[ - triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), - triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), - ], - key=['x_size'] # the two above configs will be evaluated anytime - # the value of x_size changes - ) - @triton.jit - def kernel(x_ptr, x_size, **META): - BLOCK_SIZE = META['BLOCK_SIZE'] - :note: When all the configurations are evaluated, the kernel will run multiple times. - This means that whatever value the kernel updates will be updated multiple times. - To avoid this undesired behavior, you can use the `reset_to_zero` argument, which - resets the value of the provided tensor to `zero` before running any configuration. - :param configs: a list of :code:`triton.Config` objects - :type configs: list[triton.Config] - :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. - :type key: list[str] - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs. - :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. - :type reset_to_zero: list[str] - """ - def decorator(fn): - return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by) - - return decorator - - -class Heuristics(KernelInterface): - - def __init__(self, fn, arg_names, values) -> None: - self.fn = fn - self.values = values - self.arg_names = arg_names - - def run(self, *args, **kwargs): - for v, heur in self.values.items(): - kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs}) - return self.fn.run(*args, **kwargs) - - -def heuristics(values): - """ - Decorator for specifying how the values of certain meta-parameters may be computed. - This is useful for cases where auto-tuning is prohibitevely expensive, or just not applicable. - - .. highlight:: python - .. code-block:: python - - @triton.heuristics(values={'BLOCK_SIZE': lambda args: 2 ** int(math.ceil(math.log2(args[1])))}) - @triton.jit - def kernel(x_ptr, x_size, **META): - BLOCK_SIZE = META['BLOCK_SIZE'] # smallest power-of-two >= x_size - :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter. - each such function takes a list of positional arguments as input. 
- :type values: dict[str, Callable[[list[Any]], Any]] - """ - def decorator(fn): - return Heuristics(fn, fn.arg_names, values) - - return decorator diff --git a/python/triton/runtime/backends/cuda.c b/python/triton/runtime/backends/cuda.c deleted file mode 100644 index a03297639f32..000000000000 --- a/python/triton/runtime/backends/cuda.c +++ /dev/null @@ -1,124 +0,0 @@ -#include "cuda.h" -#define PY_SSIZE_T_CLEAN -#include - -static inline void gpuAssert(CUresult code, const char *file, int line) { - if (code != CUDA_SUCCESS) { - const char *prefix = "Triton Error [CUDA]: "; - const char *str; - cuGetErrorString(code, &str); - char err[1024] = {0}; - strcat(err, prefix); - strcat(err, str); - PyErr_SetString(PyExc_RuntimeError, err); - } -} - -#define CUDA_CHECK(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - if (PyErr_Occurred()) \ - return NULL; \ - } - -static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { - int device_id; - if (!PyArg_ParseTuple(args, "i", &device_id)) - return NULL; - // Get device handle - CUdevice device; - cuDeviceGet(&device, device_id); - - // create a struct to hold device properties - int max_shared_mem; - int multiprocessor_count; - int sm_clock_rate; - int mem_clock_rate; - int mem_bus_width; - CUDA_CHECK(cuDeviceGetAttribute( - &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - device)); - CUDA_CHECK(cuDeviceGetAttribute( - &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); - CUDA_CHECK(cuDeviceGetAttribute(&sm_clock_rate, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); - CUDA_CHECK(cuDeviceGetAttribute( - &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); - CUDA_CHECK(cuDeviceGetAttribute( - &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); - - return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", - max_shared_mem, "multiprocessor_count", - multiprocessor_count, "sm_clock_rate", sm_clock_rate, - "mem_clock_rate", mem_clock_rate, "mem_bus_width", - mem_bus_width); -} - -static PyObject *loadBinary(PyObject *self, PyObject *args) { - const char *name; - const char *data; - Py_ssize_t data_size; - int shared; - int device; - if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, - &device)) { - return NULL; - } - CUfunction fun; - CUmodule mod; - int32_t n_regs = 0; - int32_t n_spills = 0; - // create driver handles - CUDA_CHECK(cuModuleLoadData(&mod, data)); - CUDA_CHECK(cuModuleGetFunction(&fun, mod, name)); - // get allocated registers and spilled registers from the function - CUDA_CHECK(cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun)); - CUDA_CHECK( - cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun)); - n_spills /= 4; - // set dynamic shared memory if necessary - int shared_optin; - CUDA_CHECK(cuDeviceGetAttribute( - &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - device)); - if (shared > 49152 && shared_optin > 49152) { - CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED)); - int shared_total, shared_static; - CUDA_CHECK(cuDeviceGetAttribute( - &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, - device)); - CUDA_CHECK(cuFuncGetAttribute(&shared_static, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun)); - CUDA_CHECK( - cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - shared_optin - shared_static)); - } - - if (PyErr_Occurred()) { - return NULL; - } - return Py_BuildValue("(KKii)", (uint64_t)mod, 
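For orientation, a sketch of consuming the property dictionary that `getDeviceProperties` builds, assuming it is reached through the lazily constructed `driver.utils` object defined later in this patch (`python/triton/runtime/driver.py`); the device index 0 is arbitrary:

from triton.runtime.driver import driver

props = driver.utils.get_device_properties(0)
# keys mirror the Py_BuildValue call above
print(props["multiprocessor_count"], props["max_shared_mem"])
# same DRAM-bandwidth estimate as triton.testing.get_dram_gbps (kHz * bus bits * DDR factor / 8)
print(props["mem_clock_rate"] * props["mem_bus_width"] * 2 / 1e6 / 8, "GB/s")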
(uint64_t)fun, n_regs, - n_spills); -} - -static PyMethodDef ModuleMethods[] = { - {"load_binary", loadBinary, METH_VARARGS, - "Load provided cubin into CUDA driver"}, - {"get_device_properties", getDeviceProperties, METH_VARARGS, - "Get the properties for a given device"}, - {NULL, NULL, 0, NULL} // sentinel -}; - -static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils", - NULL, // documentation - -1, // size - ModuleMethods}; - -PyMODINIT_FUNC PyInit_cuda_utils(void) { - PyObject *m = PyModule_Create(&ModuleDef); - if (m == NULL) { - return NULL; - } - PyModule_AddFunctions(m, ModuleMethods); - return m; -} diff --git a/python/triton/runtime/backends/hip.c b/python/triton/runtime/backends/hip.c deleted file mode 100644 index 5ed5f19ce837..000000000000 --- a/python/triton/runtime/backends/hip.c +++ /dev/null @@ -1,120 +0,0 @@ -#define __HIP_PLATFORM_AMD__ -#include -#define PY_SSIZE_T_CLEAN -#include -#include -#include - -static inline void gpuAssert(hipError_t code, const char *file, int line) { - { - if (code != HIP_SUCCESS) { - { - const char *prefix = "Triton Error [HIP]: "; - const char *str = hipGetErrorString(code); - char err[1024] = {0}; - snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str); - PyErr_SetString(PyExc_RuntimeError, err); - } - } - } -} - -#define HIP_CHECK(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - if (PyErr_Occurred()) \ - return NULL; \ - } - -static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { - int device_id; - if (!PyArg_ParseTuple(args, "i", &device_id)) - return NULL; - - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, device_id)); - - // create a struct to hold device properties - return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", - props.sharedMemPerBlock, "multiprocessor_count", - props.multiProcessorCount, "sm_clock_rate", - props.clockRate, "mem_clock_rate", props.memoryClockRate, - "mem_bus_width", props.memoryBusWidth); -} - -static PyObject *loadBinary(PyObject *self, PyObject *args) { - const char *name; - const char *data; - Py_ssize_t data_size; - int shared; - int device; - if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, - &device)) { - return NULL; - } - - // Open HSACO file - FILE *hsaco_file; - if ((hsaco_file = fopen(data, "rb")) == NULL) { - return NULL; - } - - // Read HSCAO file into Buffer - fseek(hsaco_file, 0L, SEEK_END); - size_t hsaco_file_size = ftell(hsaco_file); - unsigned char *hsaco = - (unsigned char *)malloc(hsaco_file_size * sizeof(unsigned char)); - rewind(hsaco_file); - fread(hsaco, sizeof(unsigned char), hsaco_file_size, hsaco_file); - fclose(hsaco_file); - - // set HIP options - hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; - const unsigned int errbufsize = 8192; - const unsigned int logbufsize = 8192; - char _err[errbufsize]; - char _log[logbufsize]; - void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err, - (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1}; - - // launch HIP Binary - hipModule_t mod; - hipFunction_t fun; - hipModuleLoadDataEx(&mod, hsaco, 5, opt, optval); - hipModuleGetFunction(&fun, mod, name); - free(hsaco); - - // get allocated registers and spilled registers from the function - int n_regs = 0; - int n_spills = 0; - if (PyErr_Occurred()) { - return NULL; - } - return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, - 
n_spills); -} - -static PyMethodDef ModuleMethods[] = { - {"load_binary", loadBinary, METH_VARARGS, - "Load provided hsaco into HIP driver"}, - {"get_device_properties", getDeviceProperties, METH_VARARGS, - "Get the properties for a given device"}, - {NULL, NULL, 0, NULL} // sentinel -}; - -static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils", - NULL, // documentation - -1, // size - ModuleMethods}; - -PyMODINIT_FUNC PyInit_hip_utils(void) { - PyObject *m = PyModule_Create(&ModuleDef); - if (m == NULL) { - return NULL; - } - PyModule_AddFunctions(m, ModuleMethods); - return m; -} diff --git a/python/triton/runtime/cache.py b/python/triton/runtime/cache.py deleted file mode 100644 index 43e6660a59df..000000000000 --- a/python/triton/runtime/cache.py +++ /dev/null @@ -1,131 +0,0 @@ -import json -import os -import random -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Dict, Optional - - -def default_cache_dir(): - return os.path.join(Path.home(), ".triton", "cache") - - -class CacheManager(ABC): - def __init__(self, key): - pass - - @abstractmethod - def get_file(self, filename) -> Optional[str]: - pass - - @abstractmethod - def has_file(self, filename) -> bool: - pass - - @abstractmethod - def put(self, data, filename, binary=True) -> str: - pass - - @abstractmethod - def get_group(self, filename: str) -> Optional[Dict[str, str]]: - pass - - @abstractmethod - def put_group(self, filename: str, group: Dict[str, str]): - pass - - -class FileCacheManager(CacheManager): - def __init__(self, key): - self.key = key - self.lock_path = None - # create cache directory if it doesn't exist - self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) - if self.cache_dir: - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - os.makedirs(self.cache_dir, exist_ok=True) - - def _make_path(self, filename) -> str: - return os.path.join(self.cache_dir, filename) - - def has_file(self, filename): - if not self.cache_dir: - return False - return os.path.exists(self._make_path(filename)) - - def get_file(self, filename) -> Optional[str]: - if self.has_file(filename): - return self._make_path(filename) - else: - return None - - def get_group(self, filename: str) -> Optional[Dict[str, str]]: - grp_filename = f"__grp__{filename}" - if not self.has_file(grp_filename): - return None - grp_filepath = self._make_path(grp_filename) - with open(grp_filepath) as f: - grp_data = json.load(f) - child_paths = grp_data.get("child_paths", None) - # Invalid group data. 
- if child_paths is None: - return None - result = {} - for c in child_paths: - p = self._make_path(c) - if not os.path.exists(p): - raise Exception(f"Group file {p} does not exist from group {grp_filename} ") - result[c] = p - return result - - # Note a group of pushed files as being part of a group - def put_group(self, filename: str, group: Dict[str, str]): - if not self.cache_dir: - return - grp_contents = json.dumps({"child_paths": sorted(list(group.keys()))}) - grp_filename = f"__grp__{filename}" - return self.put(grp_contents, grp_filename, binary=False) - - def put(self, data, filename, binary=True) -> str: - if not self.cache_dir: - return - binary = isinstance(data, bytes) - if not binary: - data = str(data) - assert self.lock_path is not None - filepath = self._make_path(filename) - # Random ID to avoid any collisions - rnd_id = random.randint(0, 1000000) - # we use the PID incase a bunch of these around so we can see what PID made it - pid = os.getpid() - # use tempfile to be robust against program interruptions - temp_path = f"{filepath}.tmp.pid_{pid}_{rnd_id}" - mode = "wb" if binary else "w" - with open(temp_path, mode) as f: - f.write(data) - # Replace is guaranteed to be atomic on POSIX systems if it succeeds - # so filepath cannot see a partial write - os.replace(temp_path, filepath) - return filepath - - -__cache_cls = FileCacheManager -__cache_cls_nme = "DEFAULT" - - -def get_cache_manager(key) -> CacheManager: - import os - - user_cache_manager = os.environ.get("TRITON_CACHE_MANAGER", None) - global __cache_cls - global __cache_cls_nme - - if user_cache_manager is not None and user_cache_manager != __cache_cls_nme: - import importlib - module_path, clz_nme = user_cache_manager.split(":") - module = importlib.import_module(module_path) - __cache_cls = getattr(module, clz_nme) - __cache_cls_nme = user_cache_manager - - return __cache_cls(key) diff --git a/python/triton/runtime/driver.py b/python/triton/runtime/driver.py deleted file mode 100644 index 3850821536c5..000000000000 --- a/python/triton/runtime/driver.py +++ /dev/null @@ -1,174 +0,0 @@ -import abc -import hashlib -import os -import tempfile -from pathlib import Path - -from ..common.build import _build -from .cache import get_cache_manager - - -class DriverBase(metaclass=abc.ABCMeta): - - CUDA = 0 - HIP = 1 - - @staticmethod - def third_party_dir(): - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "third_party") - - def __init__(self) -> None: - pass -# ----------------------------- -# CUDA -# ----------------------------- - - -class CudaUtils(object): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(CudaUtils, cls).__new__(cls) - return cls.instance - - def __init__(self): - dirname = os.path.dirname(os.path.realpath(__file__)) - src = Path(os.path.join(dirname, "backends", "cuda.c")).read_text() - key = hashlib.md5(src.encode("utf-8")).hexdigest() - cache = get_cache_manager(key) - fname = "cuda_utils.so" - cache_path = cache.get_file(fname) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build("cuda_utils", src_path, tmpdir) - with open(so, "rb") as f: - cache_path = cache.put(f.read(), fname, binary=True) - import importlib.util - spec = importlib.util.spec_from_file_location("cuda_utils", cache_path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - self.load_binary = mod.load_binary - 
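A sketch of the plug-in path taken by `get_cache_manager` above when `TRITON_CACHE_MANAGER` is set; the module `my_pkg.cache` and the `LoggingCacheManager` class are hypothetical:

# my_pkg/cache.py (hypothetical)
from triton.runtime.cache import FileCacheManager

class LoggingCacheManager(FileCacheManager):
    def put(self, data, filename, binary=True):
        # report every artifact written to the per-kernel cache directory
        print(f"caching {filename} under key {self.key}")
        return super().put(data, filename, binary=binary)

# selected before any kernel is compiled, e.g. in the shell:
#   export TRITON_CACHE_MANAGER=my_pkg.cache:LoggingCacheManager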
self.get_device_properties = mod.get_device_properties - - -class CudaDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(CudaDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = CudaUtils() - self.backend = self.CUDA - -# ----------------------------- -# HIP -# ----------------------------- - - -class HIPUtils(object): - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(HIPUtils, cls).__new__(cls) - return cls.instance - - def __init__(self): - dirname = os.path.dirname(os.path.realpath(__file__)) - src = Path(os.path.join(dirname, "backends", "hip.c")).read_text() - key = hashlib.md5(src.encode("utf-8")).hexdigest() - cache = get_cache_manager(key) - fname = "hip_utils.so" - cache_path = cache.get_file(fname) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build("hip_utils", src_path, tmpdir) - with open(so, "rb") as f: - cache_path = cache.put(f.read(), fname, binary=True) - import importlib.util - spec = importlib.util.spec_from_file_location("hip_utils", cache_path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - self.load_binary = mod.load_binary - self.get_device_properties = mod.get_device_properties - - -class HIPDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(HIPDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = HIPUtils() - self.backend = self.HIP - - -class UnsupportedDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(UnsupportedDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = None - self.backend = None - -# ----------------------------- -# Driver -# ----------------------------- - - -class LazyProxy: - def __init__(self, init_fn): - self._init_fn = init_fn - self._obj = None - - def _initialize_obj(self): - if self._obj is None: - self._obj = self._init_fn() - - def __getattr__(self, name): - self._initialize_obj() - return getattr(self._obj, name) - - def __setattr__(self, name, value): - if name in ['_init_fn', '_obj']: - super().__setattr__(name, value) - else: - self._initialize_obj() - setattr(self._obj, name, value) - - def __delattr__(self, name): - self._initialize_obj() - delattr(self._obj, name) - - def __repr__(self): - if self._obj is None: - return f"<{self.__class__.__name__} for {self._init_fn} not yet initialized>" - return repr(self._obj) - - def __str__(self): - self._initialize_obj() - return str(self._obj) - - -def initialize_driver(): - import torch - if torch.version.hip is not None: - return HIPDriver() - elif torch.cuda.is_available(): - return CudaDriver() - else: - return UnsupportedDriver() - - -driver = LazyProxy(initialize_driver) diff --git a/python/triton/runtime/errors.py b/python/triton/runtime/errors.py deleted file mode 100644 index 4ff900574c4f..000000000000 --- a/python/triton/runtime/errors.py +++ /dev/null @@ -1,15 +0,0 @@ - -class OutOfResources(Exception): - def __init__(self, required, limit, name): - self.message = f'out of resource: {name}, '\ - f'Required: {required}, '\ - f'Hardware limit: {limit}' - self.message += '. Reducing block sizes or `num_stages` may help.' 
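The `LazyProxy` above defers `initialize_driver()` (and with it the `torch` import and device probing) until the first attribute access; a minimal sketch of the same pattern, reusing the class from `python/triton/runtime/driver.py` with a hypothetical `Expensive` payload:

from triton.runtime.driver import LazyProxy

class Expensive:
    def __init__(self):
        print("initializing")   # does not run at proxy-construction time
        self.value = 42

proxy = LazyProxy(Expensive)    # nothing is constructed yet
print(proxy.value)              # first access: prints "initializing", then 42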
- self.required = required - self.limit = limit - self.name = name - super().__init__(self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return (type(self), (self.required, self.limit, self.name)) diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py deleted file mode 100644 index 787cd4c0d3ed..000000000000 --- a/python/triton/runtime/jit.py +++ /dev/null @@ -1,532 +0,0 @@ -from __future__ import annotations, division - -import ast -import functools -import hashlib -import inspect -import os -import subprocess -import textwrap -from collections import defaultdict, namedtuple -from typing import Callable, Generic, Iterable, Optional, TypeVar, Union, cast, overload - -import triton - - -def get_cuda_stream(idx=None): - if idx is None: - idx = get_current_device() - try: - from torch._C import _cuda_getCurrentRawStream - return _cuda_getCurrentRawStream(idx) - except ImportError: - import torch - return torch.cuda.current_stream(idx).cuda_stream - - -def get_current_device(): - import torch - return torch.cuda.current_device() - - -def set_current_device(idx): - import torch - torch.cuda.set_device(idx) - - -def get_device_capability(idx): - import torch - return torch.cuda.get_device_capability(idx) - - -T = TypeVar('T') - -# ----------------------------------------------------------------------------- -# Dependencies Finder -# ----------------------------------------------------------------------------- - - -class DependenciesFinder(ast.NodeVisitor): - """ - This AST visitor is used to find dependencies of a JITFunction. This can - be used to invalidate a JITFunction's hash when its source code -- or - that of its dependencies -- changes. - """ - - def __init__(self, globals, src) -> None: - super().__init__() - self.ret = hashlib.md5(src.encode("utf-8")).hexdigest() - self.globals = globals - - def visit_Name(self, node): - return self.globals.get(node.id, None) - - def visit_Attribute(self, node): - lhs = self.visit(node.value) - while isinstance(lhs, ast.Attribute): - lhs = self.visit(lhs.value) - if lhs is None or lhs is triton: - return None - return getattr(lhs, node.attr) - - def visit_Call(self, node): - func = self.visit(node.func) - if func is None: - return - if inspect.isbuiltin(func): - return - if func.__module__ and func.__module__.startswith('triton.'): - return - assert isinstance(func, JITFunction), f"Function \"{func.__name__}\" is being called from a Triton function but is not a Triton function itself. 
Decorate it with @triton.jit to fix this" - if func.hash is None: - tree = ast.parse(func.src) - finder = DependenciesFinder(func.__globals__, func.src) - finder.visit(tree) - func.hash = finder.ret - noinline = str(getattr(func, 'noinline', False)) - self.ret = (self.ret + func.hash + noinline).encode("utf-8") - self.ret = hashlib.md5(self.ret).hexdigest() - -# ----------------------------------------------------------------------------- -# JITFunction -# ----------------------------------------------------------------------------- - - -@functools.lru_cache() -def version_key(): - import pkgutil - contents = [] - # frontend - with open(__file__, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # compiler - compiler_path = os.path.join(*triton.__path__, 'compiler') - for lib in pkgutil.iter_modules([compiler_path]): - with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # backend - with open(triton._C.libtriton.__file__, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # language - language_path = os.path.join(*triton.__path__, 'language') - for lib in pkgutil.iter_modules([language_path]): - with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # ptxas version - try: - ptxas_version = hashlib.md5(subprocess.check_output(["ptxas", "--version"])).hexdigest() - except Exception: - ptxas_version = '' - return '-'.join(triton.__version__) + '-' + ptxas_version + '-' + '-'.join(contents) - - -class KernelInterface(Generic[T]): - run: T - - def __getitem__(self, grid) -> T: - """ - A JIT function is launched with: fn[grid](*args, **kwargs). - Hence JITFunction.__getitem__ returns a callable proxy that - memorizes the grid. 
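A sketch of the `fn[grid](...)` launch path that `KernelInterface.__getitem__` provides, assuming the usual `triton.jit` / `triton.language` surface; `double` is an illustrative kernel:

import torch
import triton
import triton.language as tl

@triton.jit
def double(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    tl.store(x_ptr + offs, 2 * tl.load(x_ptr + offs, mask=mask), mask=mask)

x = torch.ones(10_000, device='cuda')
# __getitem__ returns a proxy that remembers the grid and forwards to run()
grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']),)
double[grid](x, x.numel(), BLOCK_SIZE=1024)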
- """ - return cast(T, functools.partial(cast(Callable, self.run), grid=grid)) - - -class JITFunction(KernelInterface[T]): - - # Hook for inspecting compiled functions and modules - cache_hook = None - divisibility = 16 - - @staticmethod - def _key_of(arg): - if hasattr(arg, "dtype"): - return arg.dtype - elif isinstance(arg, bool): - return "i1" - elif isinstance(arg, int): - if -2**31 <= arg and arg <= 2**31 - 1: - return "i32" - elif 2**63 <= arg and arg <= 2**64 - 1: - return "u64" - else: - return "i64" - elif isinstance(arg, float): - return 'fp32' - elif arg is None: - return None - else: - raise TypeError(f'Unsupported type {type(arg)} for {arg}') - - @staticmethod - def _spec_of(arg): - if hasattr(arg, "data_ptr"): - return (arg.data_ptr() % JITFunction.divisibility == 0) - elif isinstance(arg, int): - return (arg % 16 == 0, arg == 1) - return (arg is None, ) - - def _get_config(self, *args): - def is_divisible_by_16(x): - if hasattr(x, "data_ptr"): - return x.data_ptr() % JITFunction.divisibility == 0 - elif isinstance(x, int): - return x % JITFunction.divisibility == 0 - if x is None: - return True - return False - divisible_by_16 = {i for i, arg in enumerate(args) if is_divisible_by_16(arg) and i not in self.do_not_specialize} - equal_to_1 = {i for i, arg in enumerate(args) if not isinstance(arg, bool) and isinstance(arg, int) and arg == 1 and i not in self.do_not_specialize} - return namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"])(tuple(divisible_by_16), tuple(equal_to_1)) - # return _triton.code_gen.instance_descriptor(divisible_by_16, equal_to_1) - - @staticmethod - def _type_of(key): - # None are nullptr -- implicitly converted to *i8 - if key is None: - return '*i8' - dtype_str = str(key).split(".")[-1] - tys = { - "bool": "i1", - "float8e5": "fp8e5", - "float8e4": "fp8e4", - "float16": "fp16", - "bfloat16": "bf16", - "float32": "fp32", - "float64": "fp64", - "int8": "i8", - "int16": "i16", - "int32": "i32", - "int64": "i64", - "uint8": "u8", - "uint16": "u16", - "uint32": "u32", - "uint64": "u64", - } - # reinterpret can create triton type - for v in list(tys.values()): - tys[v] = v - return key if isinstance(key, str) else f"*{tys[dtype_str]}" - - def _make_signature(self, sig_key): - signature = ",".join([self._type_of(k) for i, k in enumerate(sig_key)]) - return signature - - def _make_constants(self, constexpr_key): - constants = dict(zip(self.constexprs, constexpr_key)) - return constants - - def _call_hook(self, key, signature, device, constants, num_warps, num_stages, extern_libs, configs): - if JITFunction.cache_hook is None: - return False - name = self.fn.__name__ - module = self.fn.__module__ - arg_reprs = ', '.join([f'{name}: {ty}' for name, ty in zip(self.arg_names, key[1])]) - repr = f"{name}[num_warps={num_warps}, num_stages={num_stages}]({arg_reprs})" - key = str(key) - - class LegacyCompiler: - def __init__(self, module, name): - self.module = module - self.name = name - pass - - kwargs = dict(signature=signature, device=device, constants=constants, - num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, - configs=configs) - - return JITFunction.cache_hook(key=key, repr=repr, fn=LegacyCompiler(module, name), compile={"key": key, **kwargs}, is_manual_warmup=False, already_compiled=False) - - def _get_arg_specialization_key(self, arg) -> str: - arg_annotation = self.__annotations__.get(arg, '') - if arg_annotation == '': - return f'({arg}.data_ptr() % {JITFunction.divisibility} == 0) if hasattr({arg}, "data_ptr") \ - else 
({arg} % {JITFunction.divisibility} == 0, {arg} == 1) if isinstance({arg}, int) \ - else (False,)' - elif 'Tensor' in arg_annotation: - return f'({arg}.data_ptr() % {JITFunction.divisibility} == 0)' - elif arg_annotation == 'int': - return f'({arg} % {JITFunction.divisibility} == 0, {arg} == 1)' - else: - return '(False,)' - - def _get_arg_sig_key(self, arg) -> str: - arg_annotation = self.__annotations__.get(arg, '') - if 'Tensor' in arg_annotation: - return f'{arg}.dtype' - elif arg_annotation == 'bool': - return "i1" - elif arg_annotation == 'float': - return 'fp32' - else: - return f'_key_of({arg})' - - def _make_launcher(self): - regular_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i not in self.constexprs] - constexpr_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i in self.constexprs] - args = ', '.join(regular_args) - # cache key for regular argument type - sig_keys = ', '.join([self._get_arg_sig_key(arg) for arg in regular_args]) - # cache key for constexpr argument values - constexpr_keys = ', '.join(constexpr_args) - # cache key for argument specialization - specializations = [] - for i, arg in enumerate(regular_args): - if i in self.do_not_specialize: - continue - specializations += [self._get_arg_specialization_key(arg)] - - spec_keys = ', '.join(specializations) - grid_args = ','.join([f'"{arg}": {arg}' for arg in self.arg_names]) - - src = f""" -def {self.fn.__name__}({', '.join(self.arg_names)}, grid, num_warps=4, num_stages=3, extern_libs=None, stream=None, warmup=False, device=None): - sig_key = {sig_keys}, - constexpr_key = {f'{constexpr_keys},' if len(constexpr_keys) > 0 else ()} - spec_key = {f'{spec_keys},' if len(spec_keys) > 0 else ()} - key = (version_key, sig_key, constexpr_key, spec_key, num_warps, num_stages, self.debug) - if not extern_libs is None: - key = (key, tuple(extern_libs.items())) - assert num_warps > 0 and (num_warps & (num_warps - 1)) == 0, "num_warps must be a power of 2" - if callable(grid): - grid = grid({{{grid_args}}}) - grid_size = len(grid) - grid_0 = grid[0] - grid_1 = grid[1] if grid_size > 1 else 1 - grid_2 = grid[2] if grid_size > 2 else 1 - if device is None: - device = get_current_device() - set_current_device(device) - if stream is None and not warmup: - stream = get_cuda_stream(device) - bin = cache[device].get(key, None) - if bin is not None: - if not warmup: - bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, {args}) - return bin - # kernel not cached -- compile - else: - # build dict of constant values - args = [{args}] - all_args = {', '.join([f'{arg}' for arg in self.arg_names])}, - configs = self._get_config(*all_args), - constants = self._make_constants(constexpr_key) - constants.update({{i: None for i, arg in enumerate(all_args) if arg is None}}) - constants.update({{i: 1 for i in configs[0].equal_to_1}}) - # build kernel signature -- doesn't include specialized arguments - signature = {{ i: self._type_of(_key_of(arg)) for i, arg in enumerate(all_args) if i not in self.constexprs }} - # build stub signature -- includes arguments that are specialized - for i, arg in constants.items(): - if callable(arg): - raise TypeError(f"Callable constexpr at index {{i}} is not supported") - if not self._call_hook(key, signature, device, constants, num_warps, num_stages, extern_libs, configs): - bin = triton.compile(self, signature=signature, device=device, 
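The cache key assembled in the generated launcher above includes `spec_key`, so flipping an integer argument's divisibility-by-16 (or equality to 1) forces a fresh compile; a sketch of opting a single argument out via `do_not_specialize` (given by name here, while `JITFunction.__init__` below also accepts indices); the `fill` kernel is illustrative:

import triton
import triton.language as tl

@triton.jit(do_not_specialize=['n_elements'])
def fill(x_ptr, value, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # n_elements no longer contributes a specialization key, so calls with
    # 1, 16, or 17 elements can reuse one compiled binary (signature permitting)
    tl.store(x_ptr + offs, value, mask=offs < n_elements)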
constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs, debug=self.debug) - if not warmup: - bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, *args) - self.cache[device][key] = bin - return bin - return None -""" - scope = {"version_key": version_key(), "get_cuda_stream": get_cuda_stream, - "self": self, "_spec_of": self._spec_of, "_key_of": self._key_of, - "cache": self.cache, "triton": triton, - "get_current_device": get_current_device, - "set_current_device": set_current_device} - exec(src, scope) - return scope[self.fn.__name__] - - def __init__(self, fn, version=None, do_not_specialize=None, debug=None, noinline=None): - self.fn = fn - self.module = fn.__module__ - self.version = version - # function signature information - signature = inspect.signature(fn) - self.arg_names = [v.name for v in signature.parameters.values()] - self.has_defaults = any(v.default != inspect._empty for v in signature.parameters.values()) - # specialization hints - self.do_not_specialize = [] if do_not_specialize is None else do_not_specialize - self.do_not_specialize = {self.arg_names.index(arg) if isinstance(arg, str) else arg for arg in self.do_not_specialize} - # function source code (without decorators) - self.src = textwrap.dedent(inspect.getsource(fn)) - self.src = self.src[self.src.find("def"):] - # cache of just-in-time compiled kernels - self.cache = defaultdict(dict) - self.hash = None - # JITFunction can be instantiated as kernel - # when called with a grid using __getitem__ - self.kernel_decorators = [] - self.kernel = None - self.debug = True if os.environ.get("TRITON_DEBUG", "0") == "1" else debug - self.noinline = noinline - # annotations - normalize_ty = lambda ty: ty.__name__ if isinstance(ty, type) else ty - self.__annotations__ = {name: normalize_ty(ty) for name, ty in fn.__annotations__.items()} - # index of constexprs - self.constexprs = [self.arg_names.index(name) for name, ty in self.__annotations__.items() if 'constexpr' in ty] - # launcher - self.run = self._make_launcher() - # re-use docs of wrapped function - self.__doc__ = fn.__doc__ - self.__name__ = fn.__name__ - self.__globals__ = fn.__globals__ - self.__module__ = fn.__module__ - - @property - def cache_key(self): - # TODO : hash should be attribute of `self` - if self.hash is None: - dependencies_finder = DependenciesFinder(globals=self.__globals__, src=self.src) - dependencies_finder.visit(self.parse()) - self.hash = dependencies_finder.ret + version_key() - return self.hash - - def warmup(self, *args, **kwargs): - return self.run(*map(MockTensor.wrap_dtype, args), **kwargs, warmup=True) - - # we do not parse `src` in the constructor because - # the user might want to monkey-patch self.src dynamically. - # Our unit tests do this, for example. 
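A sketch of ahead-of-time compilation through `warmup`, which maps plain `torch` dtypes to `MockTensor` stand-ins so no device memory is needed for the arguments; the `copy` kernel and block size are illustrative, and `grid` is required because `warmup` forwards to the generated launcher above:

import torch
import triton
import triton.language as tl

@triton.jit
def copy(dst_ptr, src_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

# compiles (and caches) the kernel without launching it
copy.warmup(torch.float32, torch.float32, 1024, grid=(1,), BLOCK_SIZE=256)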
- def parse(self): - tree = ast.parse(self.src) - assert isinstance(tree, ast.Module) - assert len(tree.body) == 1 - assert isinstance(tree.body[0], ast.FunctionDef) - return tree - - def __call__(self, *args, **kwargs): - raise RuntimeError("Cannot call @triton.jit'd outside of the scope of a kernel") - - def __setattr__(self, name, value): - # - when kernel decorators change, cached kernel - # needs to be cleared - if name == 'kernel_decorators': - self.kernel = None - super(JITFunction, self).__setattr__(name, value) - # - when `.src` attribute is set, cache path needs - # to be reinitialized - if name == 'src': - self.hash = None - - def __repr__(self): - return f"JITFunction({self.module}:{self.fn.__name__})" - - -# ----------------------------------------------------------------------------- -# `jit` decorator -# ----------------------------------------------------------------------------- - - -@overload -def jit(fn: T) -> JITFunction[T]: - ... - - -@overload -def jit( - *, - version=None, - do_not_specialize: Optional[Iterable[int]] = None, - debug: Optional[bool] = None, - noinline: Optional[bool] = None, -) -> Callable[[T], JITFunction[T]]: - ... - - -def jit( - fn: Optional[T] = None, - *, - version=None, - do_not_specialize: Optional[Iterable[int]] = None, - debug: Optional[bool] = None, - noinline: Optional[bool] = None, - interpret: Optional[bool] = None, -) -> Union[JITFunction[T], Callable[[T], JITFunction[T]]]: - """ - Decorator for JIT-compiling a function using the Triton compiler. - - :note: When a jit'd function is called, arguments are - implicitly converted to pointers if they have a :code:`.data_ptr()` method - and a `.dtype` attribute. - - :note: This function will be compiled and run on the GPU. It will only have access to: - - * python primitives, - * builtins within the triton package, - * arguments to this function, - * other jit'd functions - - :param fn: the function to be jit-compiled - :type fn: Callable - """ - - def decorator(fn: T) -> JITFunction[T]: - assert callable(fn) - if interpret: - from ..debugger.debugger import GridSelector - return GridSelector(fn) - else: - return JITFunction( - fn, - version=version, - do_not_specialize=do_not_specialize, - debug=debug, - noinline=noinline, - ) - if fn is not None: - return decorator(fn) - - else: - return decorator - -# ----------------------------------------------------------------------------- -# Utilities for mocking tensors -# ----------------------------------------------------------------------------- - - -class MockTensor: - """ - Can be used in place of real tensors when calling: - kernel.warmup(MockTensor(torch.float32), ...) - """ - @staticmethod - def wrap_dtype(arg): - if arg.__class__.__name__ == "dtype" and\ - arg.__module__ == "torch": - return MockTensor(arg) - return arg - - def __init__(self, dtype): - self.dtype = dtype - - @staticmethod - def data_ptr(): - return 0 # optimistically assumes multiple of 16 - - -class TensorWrapper: - def __init__(self, base, dtype): - self.dtype = dtype - self.base = base - self.is_cuda = base.is_cuda - self.device = base.device - - def data_ptr(self): - return self.base.data_ptr() - - def __str__(self) -> str: - return f'TensorWrapper[{self.dtype}]({self.base})' - - -def reinterpret(tensor, dtype): - if isinstance(tensor, TensorWrapper): - if dtype == tensor.base.dtype: - # Reinterpreting to the original interpretation; return the base. - return tensor.base - else: - # Reinterpreting a wrapped tensor to a different type. 
- return TensorWrapper(tensor.base, dtype) - elif hasattr(tensor, "data_ptr"): - # A new wrapper is needed around an unwrapped tensor. - return TensorWrapper(tensor, dtype) - else: - raise TypeError(f'Cannot reinterpret a {type(tensor)}.') diff --git a/python/triton/testing.py b/python/triton/testing.py deleted file mode 100644 index 321f03dbe8ba..000000000000 --- a/python/triton/testing.py +++ /dev/null @@ -1,423 +0,0 @@ -import functools -import os -import subprocess -import sys -from contextlib import contextmanager - -import triton._C.libtriton.triton as _triton - - -def nvsmi(attrs): - attrs = ','.join(attrs) - cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(',') - ret = [int(x) for x in ret] - return ret - - -def do_bench(fn, warmup=25, rep=100, grad_to_none=None, - quantiles=None, - fast_flush=True, - return_mode="mean"): - assert return_mode in ["min", "max", "mean", "median"] - import torch - """ - Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with - the 20-th and 80-th performance percentile. - - :param fn: Function to benchmark - :type fn: Callable - :param warmup: Warmup time (in ms) - :type warmup: int - :param rep: Repetition time (in ms) - :type rep: int - :param grad_to_none: Reset the gradient of the provided tensor to None - :type grad_to_none: torch.tensor, optional - :param quantiles: Performance percentile to return in addition to the median. - :type quantiles: list[float] - :param fast_flush: Use faster kernel to flush L2 between measurements - :type fast_flush: bool - """ - - fn() - torch.cuda.synchronize() - - # We maintain a buffer of 256 MB that we clear - # before each kernel call to make sure that the L2 - # doesn't contain any input data before the run - if fast_flush: - cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda') - else: - cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda') - - # Estimate the runtime of the function - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - for _ in range(5): - cache.zero_() - fn() - end_event.record() - torch.cuda.synchronize() - estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - n_warmup = max(1, int(warmup / estimate_ms)) - n_repeat = max(1, int(rep / estimate_ms)) - start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] - # Warm-up - for _ in range(n_warmup): - fn() - # Benchmark - for i in range(n_repeat): - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. 
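A sketch of `reinterpret` above, which wraps a tensor in `TensorWrapper` so a kernel sees a different element type over the same storage; the fp8 dtype name follows the `_type_of` table earlier in this file, and the uint8 buffer is illustrative:

import torch
import triton.language as tl
from triton.runtime.jit import reinterpret

raw = torch.randint(0, 256, (1024,), dtype=torch.uint8, device='cuda')
as_fp8 = reinterpret(raw, tl.float8e5)        # TensorWrapper: same storage, new dtype
assert as_fp8.data_ptr() == raw.data_ptr()    # no copy, only a reinterpretation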
So we clear the - # provided gradients - if grad_to_none is not None: - for x in grad_to_none: - x.grad = None - # we clear the L2 cache before each run - cache.zero_() - # record time of `fn` - start_event[i].record() - fn() - end_event[i].record() - # Record clocks - torch.cuda.synchronize() - times = torch.tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)]) - if quantiles is not None: - ret = torch.quantile(times, torch.tensor(quantiles)).tolist() - if len(ret) == 1: - ret = ret[0] - return ret - return getattr(torch, return_mode)(times).item() - - -def assert_close(x, y, atol=None, rtol=None, err_msg=''): - import numpy as np - import torch - - # canonicalize arguments to be tensors - if not isinstance(x, torch.Tensor): - x = torch.tensor(x) - if not isinstance(y, torch.Tensor): - y = torch.tensor(y) - # absolute tolerance - if atol is None: - atol = 1e-2 - atol = atol(x.dtype) if callable(atol) else atol - # relative tolerance hook - if rtol is None: - rtol = 0. - rtol = rtol(x.dtype) if callable(rtol) else rtol - # we use numpy instead of pytorch - # as it seems more memory efficient - # pytorch tends to oom on large tensors - if isinstance(x, torch.Tensor): - if x.dtype == torch.bfloat16: - x = x.float() - x = x.cpu().detach().numpy() - if isinstance(y, torch.Tensor): - if y.dtype == torch.bfloat16: - y = y.float() - y = y.cpu().detach().numpy() - # we handle size==1 case separately as we can - # provide better error message there - if x.size > 1 or y.size > 1: - np.testing.assert_allclose(x, y, atol=atol, rtol=rtol, equal_nan=True) - return - if not np.allclose(x, y, atol=atol, rtol=rtol): - raise AssertionError(f'{err_msg} {x} is not close to {y} (atol={atol}, rtol={rtol})') - - -class Benchmark: - """ - This class is used by the :code:`perf_report` function to generate line plots with a concise API. - """ - - def __init__( - self, - x_names, - x_vals, - line_arg, - line_vals, - line_names, - plot_name, - args, - xlabel='', - ylabel='', - x_log=False, - y_log=False, - color=None, - styles=None, - ): - """ - Constructor - - :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value. - :type x_names: List[str] - :param x_vals: List of values to use for the arguments in :code:`x_names`. - :type x_vals: List[Any] - :param line_arg: Argument name for which different values correspond to different lines in the plot. - :type line_arg: str - :param line_vals: List of values to use for the arguments in :code:`line_arg`. - :type line_vals: List[str] - :param line_names: Label names for the different lines. - :type line_names: List[str] - :param plot_name: Name of the plot. - :type plot_name: str - :param args: List of arguments to remain fixed throughout the benchmark. - :type args: List[str] - :param xlabel: Label for the x axis of the plot. - :type xlabel: str, optional - :param ylabel: Label for the y axis of the plot. - :type ylabel: str, optional - :param x_log: Whether the x axis should be log scale. - :type x_log: bool, optional - :param y_log: Whether the y axis should be log scale. 
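A sketch of `do_bench` with the same quantiles the autotuner requests (median plus the 20th and 80th percentiles); the workload is an arbitrary elementwise op:

import torch
from triton.testing import do_bench

x = torch.randn(1 << 20, device='cuda')
ms, min_ms, max_ms = do_bench(lambda: x * 2, quantiles=[0.5, 0.2, 0.8])
# effective bandwidth: one read and one write of x per call
gbps = 2 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
print(f"median {ms:.3f} ms, ~{gbps:.0f} GB/s")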
- :type y_log: bool, optional - """ - self.x_names = x_names - self.x_vals = x_vals - self.x_log = x_log - self.line_arg = line_arg - self.line_vals = line_vals - self.line_names = line_names - self.y_log = y_log - self.styles = styles - # plot info - self.xlabel = xlabel - self.ylabel = ylabel - self.plot_name = plot_name - self.args = args - - -class Mark: - def __init__(self, fn, benchmarks): - self.fn = fn - self.benchmarks = benchmarks - - def _run(self, bench, save_path, show_plots, print_data): - import os - - import matplotlib.pyplot as plt - import pandas as pd - y_mean = bench.line_names - y_min = [f'{x}-min' for x in bench.line_names] - y_max = [f'{x}-max' for x in bench.line_names] - df = pd.DataFrame(columns=[bench.x_names[0]] + y_mean + y_min + y_max) - for x in bench.x_vals: - x_args = {x_name: x for x_name in bench.x_names} - row_mean, row_min, row_max = [], [], [] - for y in bench.line_vals: - ret = self.fn(**x_args, **{bench.line_arg: y}, **bench.args) - try: - y_mean, y_min, y_max = ret - except TypeError: - y_mean, y_min, y_max = ret, None, None - row_mean += [y_mean] - row_min += [y_min] - row_max += [y_max] - df.loc[len(df)] = [x] + row_mean + row_min + row_max - if bench.plot_name: - plt.figure() - ax = plt.subplot() - x = bench.x_names[0] - for i, y in enumerate(bench.line_names): - y_min, y_max = df[y + '-min'], df[y + '-max'] - col = bench.styles[i][0] if bench.styles else None - sty = bench.styles[i][1] if bench.styles else None - ax.plot(df[x], df[y], label=y, color=col, ls=sty) - if y_min is not None and y_max is not None: - ax.fill_between(df[x], y_min, y_max, alpha=0.15, color=col) - ax.legend() - xlabel = bench.xlabel if bench.xlabel else " = ".join(bench.x_names) - ax.set_xlabel(xlabel) - ax.set_ylabel(bench.ylabel) - # ax.set_title(bench.plot_name) - ax.set_xscale("log" if bench.x_log else "linear") - ax.set_yscale("log" if bench.y_log else "linear") - if show_plots: - plt.show() - if save_path: - plt.savefig(os.path.join(save_path, f"{bench.plot_name}.png")) - df = df[[bench.x_names[0]] + bench.line_names] - if print_data: - print(bench.plot_name + ':') - print(df) - if save_path: - df.to_csv(os.path.join(save_path, f"{bench.plot_name}.csv"), float_format='%.1f', index=False) - - def run(self, show_plots=False, print_data=False, save_path=''): - has_single_bench = isinstance(self.benchmarks, Benchmark) - benchmarks = [self.benchmarks] if has_single_bench else self.benchmarks - if save_path: - html = open(os.path.join(save_path, "results.html"), "w") - html.write("\n") - for bench in benchmarks: - self._run(bench, save_path, show_plots, print_data) - if save_path: - html.write(f"\n") - if save_path: - html.write("\n") - - -def perf_report(benchmarks): - """ - Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value. - - :param benchmarks: Benchmarking configurations. 
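A sketch of tying `Benchmark`, `Mark.run`, and `perf_report` together; `vector_add_bench` and the single 'torch' provider are illustrative:

import torch
import triton.testing

@triton.testing.perf_report(triton.testing.Benchmark(
    x_names=['n_elements'], x_vals=[2**i for i in range(12, 24)],
    line_arg='provider', line_vals=['torch'], line_names=['PyTorch'],
    plot_name='vector-add', args={}, ylabel='GB/s', x_log=True,
))
def vector_add_bench(n_elements, provider):
    x = torch.randn(n_elements, device='cuda')
    y = torch.randn(n_elements, device='cuda')
    ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y,
                                                 quantiles=[0.5, 0.2, 0.8])
    gbps = lambda t: 3 * n_elements * x.element_size() * 1e-9 / (t * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)  # (center, min, max) as Mark._run unpacks

vector_add_bench.run(print_data=True)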
- :type benchmarks: List of :class:`Benchmark` - """ - wrapper = lambda fn: Mark(fn, benchmarks) - return wrapper - - -def get_dram_gbps(backend=None, device=None): - ''' return DRAM bandwidth in GB/s ''' - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - mem_clock_khz = driver.utils.get_device_properties(device)["mem_clock_rate"] # in kHz - bus_width = driver.utils.get_device_properties(device)["mem_bus_width"] - bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8 # In GB/s - return bw_gbps - - -def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None): - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 - if not clock_rate: - clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8: - assert dtype == torch.float16 - ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores - else: - if dtype == torch.float32: - ops_per_sub_core = 256 - elif dtype in [torch.float16, torch.bfloat16]: - ops_per_sub_core = 512 - elif dtype == torch.int8: - ops_per_sub_core = 1024 - else: - raise RuntimeError("dtype not supported") - tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 - return tflops - -# create decorator that wraps test function into -# a cuda-memcheck system call - - -def cuda_memcheck(**target_kwargs): - def decorator(test_fn): - @functools.wraps(test_fn) - def wrapper(*args, **kwargs): - import psutil - ppid_name = psutil.Process(os.getppid()).name() - run_cuda_memcheck = target_kwargs.items() <= kwargs.items() - if run_cuda_memcheck and ppid_name != "cuda-memcheck": - path = os.path.realpath(test_fn.__globals__["__file__"]) - # get path of current file - env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"} - assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture" - test_id = kwargs['request'].node.callspec.id - cmd = f"{path}::{test_fn.__name__}[{test_id}]" - out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env) - assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed" - assert "ERROR SUMMARY: 0 errors" in str(out.stdout) - else: - test_fn(*args, **kwargs) - return wrapper - return decorator - - -def nvsmi_attr(attrs): - attrs = ",".join(attrs) - cmd = [ - "nvidia-smi", - "-i", - "0", - "--query-gpu=" + attrs, - "--format=csv,noheader,nounits", - ] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(",") - ret = [int(x) for x in ret] - return ret - - -@contextmanager -def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215): - try: - subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "1"]) - subprocess.check_output( - [ - "nvidia-smi", - "-i", - "0", - f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}", - ] - ) - subprocess.check_output( - [ - "nvidia-smi", - "-i", - "0", - f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}", - ] - ) - cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0] - cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0] - assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz" - assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU SMs 
must run at {ref_mem_clock} MHz" - tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock - gbps = 640 * 2 * ref_mem_clock * 1e-3 - yield tflops, gbps - finally: - subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "0"]) - subprocess.check_output(["nvidia-smi", "-i", "0", "-rgc"]) - subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"]) - - -def get_max_simd_tflops(dtype, backend=None, device=None): - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 - clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - if dtype == torch.float32: - ops_per_sub_core = 32 # 2*16 - elif dtype == torch.float16: - ops_per_sub_core = 64 - else: - raise RuntimeError("dtype not supported") - else: - if dtype == torch.float32: - ops_per_sub_core = 32 - elif dtype in [torch.float16, torch.bfloat16]: - ops_per_sub_core = 64 - else: - raise RuntimeError("dtype not supported") - tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 - return tflops diff --git a/python/triton/third_party/cuda/bin/ptxas b/python/triton/third_party/cuda/bin/ptxas deleted file mode 100755 index 8b47936ea212..000000000000 Binary files a/python/triton/third_party/cuda/bin/ptxas and /dev/null differ diff --git a/python/triton/third_party/cuda/include/cuda.h b/python/triton/third_party/cuda/include/cuda.h deleted file mode 100755 index c713bf316a16..000000000000 --- a/python/triton/third_party/cuda/include/cuda.h +++ /dev/null @@ -1,19348 +0,0 @@ -/* - * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. 
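A worked instance of the peak-throughput formula used by `get_max_tensorcore_tflops` / `get_max_simd_tflops` above, assuming A100-like figures (108 SMs, ~1410 MHz SM clock, fp16 inputs); the device numbers are illustrative, not queried:

multiprocessor_count = 108       # get_device_properties(...)["multiprocessor_count"]
sm_clock_khz = 1_410_000         # get_device_properties(...)["sm_clock_rate"], in kHz
num_subcores = multiprocessor_count * 4
ops_per_sub_core = 512           # fp16/bf16 tensor-core ops on SM80+, per the table above
tflops = num_subcores * sm_clock_khz * ops_per_sub_core * 1e-9
print(tflops)                    # ~311.9 peak dense fp16 TFLOP/s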
These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#ifndef __cuda_cuda_h__ -#define __cuda_cuda_h__ - - - -#include -#ifdef _MSC_VER -typedef unsigned __int32 cuuint32_t; -typedef unsigned __int64 cuuint64_t; -#else -#include -typedef uint32_t cuuint32_t; -typedef uint64_t cuuint64_t; -#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) -#define __CUDA_DEPRECATED -#elif defined(_MSC_VER) -#define __CUDA_DEPRECATED __declspec(deprecated) -#elif defined(__GNUC__) -#define __CUDA_DEPRECATED __attribute__((deprecated)) -#else -#define __CUDA_DEPRECATED -#endif - -#if defined(CUDA_FORCE_API_VERSION) -#error "CUDA_FORCE_API_VERSION is no longer supported." -#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define __CUDA_API_PER_THREAD_DEFAULT_STREAM - #define __CUDA_API_PTDS(api) api ## _ptds - #define __CUDA_API_PTSZ(api) api ## _ptsz -#else - #define __CUDA_API_PTDS(api) api - #define __CUDA_API_PTSZ(api) api -#endif - -#define cuDeviceTotalMem cuDeviceTotalMem_v2 -#define cuCtxCreate cuCtxCreate_v2 -#define cuCtxCreate_v3 cuCtxCreate_v3 -#define cuModuleGetGlobal cuModuleGetGlobal_v2 -#define cuMemGetInfo cuMemGetInfo_v2 -#define cuMemAlloc cuMemAlloc_v2 -#define cuMemAllocPitch cuMemAllocPitch_v2 -#define cuMemFree cuMemFree_v2 -#define cuMemGetAddressRange cuMemGetAddressRange_v2 -#define cuMemAllocHost cuMemAllocHost_v2 -#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 -#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) -#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) -#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) -#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) -#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) -#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) -#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) -#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) -#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) -#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) -#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) -#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) -#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) -#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) -#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) -#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) -#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) -#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) -#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) -#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) -#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) -#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) -#define cuMemsetD2D16 
__CUDA_API_PTDS(cuMemsetD2D16_v2) -#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) -#define cuArrayCreate cuArrayCreate_v2 -#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 -#define cuArray3DCreate cuArray3DCreate_v2 -#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 -#define cuTexRefSetAddress cuTexRefSetAddress_v2 -#define cuTexRefGetAddress cuTexRefGetAddress_v2 -#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 -#define cuCtxDestroy cuCtxDestroy_v2 -#define cuCtxPopCurrent cuCtxPopCurrent_v2 -#define cuCtxPushCurrent cuCtxPushCurrent_v2 -#define cuStreamDestroy cuStreamDestroy_v2 -#define cuEventDestroy cuEventDestroy_v2 -#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 -#define cuLinkCreate cuLinkCreate_v2 -#define cuLinkAddData cuLinkAddData_v2 -#define cuLinkAddFile cuLinkAddFile_v2 -#define cuMemHostRegister cuMemHostRegister_v2 -#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 -#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) -#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 -#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 -#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 -#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 -#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 -#define cuGraphInstantiate cuGraphInstantiate_v2 - -#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) - #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) - #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) - #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) - #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) - #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) - #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) - - #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) - #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) - #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) - #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) - #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) - #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) - - #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) - #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) - #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) - #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) - #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) - #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) - #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) - #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) - #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) - #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) - #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) - #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) - #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) - #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) - #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) - #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) - - - - #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) - #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) - #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) - - #define cuStreamWriteValue32 
__CUDA_API_PTSZ(cuStreamWriteValue32) - #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) - #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) - #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) - #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) - - #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) - - #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) - #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) - - #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) - #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) - #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) - #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) - #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) - #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) - - #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) - #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) - #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) -#endif - -/** - * \file cuda.h - * \brief Header file for the CUDA Toolkit application programming interface. - * - * \file cudaGL.h - * \brief Header file for the OpenGL interoperability functions of the - * low-level CUDA driver application programming interface. - * - * \file cudaD3D9.h - * \brief Header file for the Direct3D 9 interoperability functions of the - * low-level CUDA driver application programming interface. - */ - -/** - * \defgroup CUDA_TYPES Data types used by CUDA driver - * @{ - */ - -/** - * CUDA API version number - */ -#define CUDA_VERSION 11060 - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * CUDA device pointer - * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. 
- */ -#if defined(_WIN64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr_v2; -#else -typedef unsigned int CUdeviceptr_v2; -#endif -typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ - -typedef int CUdevice_v1; /**< CUDA device */ -typedef CUdevice_v1 CUdevice; /**< CUDA device */ -typedef struct CUctx_st *CUcontext; /**< CUDA context */ -typedef struct CUmod_st *CUmodule; /**< CUDA module */ -typedef struct CUfunc_st *CUfunction; /**< CUDA function */ -typedef struct CUarray_st *CUarray; /**< CUDA array */ -typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ -typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ -typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ -typedef struct CUevent_st *CUevent; /**< CUDA event */ -typedef struct CUstream_st *CUstream; /**< CUDA stream */ -typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ -typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ -typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ -typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ -typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ -typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ -typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ -typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ -typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ -typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ -typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ -typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ - -#ifndef CU_UUID_HAS_BEEN_DEFINED -#define CU_UUID_HAS_BEEN_DEFINED -typedef struct CUuuid_st { /**< CUDA definition of UUID */ - char bytes[16]; -} CUuuid; -#endif - -/** - * CUDA IPC handle size - */ -#define CU_IPC_HANDLE_SIZE 64 - -/** - * CUDA IPC event handle - */ -typedef struct CUipcEventHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcEventHandle_v1; -typedef CUipcEventHandle_v1 CUipcEventHandle; - -/** - * CUDA IPC mem handle - */ -typedef struct CUipcMemHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcMemHandle_v1; -typedef CUipcMemHandle_v1 CUipcMemHandle; - -/** - * CUDA Ipc Mem Flags - */ -typedef enum CUipcMem_flags_enum { - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ -} CUipcMem_flags; - - -/** - * CUDA Mem Attach Flags - */ -typedef enum CUmemAttach_flags_enum { - CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ - CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ - CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ -} CUmemAttach_flags; - -/** - * Context creation flags - */ -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ - CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ - CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ - CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling - 
* \deprecated This flag was deprecated as of CUDA 4.0 - * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ - CU_CTX_SCHED_MASK = 0x07, - CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 - * and it no longer has any effect. All contexts - * as of CUDA 3.2 behave as though the flag is enabled. */ - CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ - CU_CTX_FLAGS_MASK = 0x1f -} CUctx_flags; - -/** - * Stream creation flags - */ -typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ - CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ -} CUstream_flags; - -/** - * Legacy stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with legacy synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_LEGACY ((CUstream)0x1) - -/** - * Per-thread stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with per-thread synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_PER_THREAD ((CUstream)0x2) - -/** - * Event creation flags - */ -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ - CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ - CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ - CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ -} CUevent_flags; - -/** - * Event record flags - */ -typedef enum CUevent_record_flags_enum { - CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ - CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node - * instead of the default behavior. This flag is invalid - * when used outside of capture. */ -} CUevent_record_flags; - -/** - * Event wait flags - */ -typedef enum CUevent_wait_flags_enum { - CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ - CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node - * instead of the default behavior. This flag is invalid - * when used outside of capture.*/ -} CUevent_wait_flags; - -/** - * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 - */ -typedef enum CUstreamWaitValue_flags_enum { - CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit - values). Note this is a cyclic comparison which ignores wraparound. - (Default behavior.) */ - CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ - CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ - CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be - queried with ::cuDeviceGetAttribute() and - ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ - CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This - means that, if a remote write operation is guaranteed to have reached the - device before the wait can be satisfied, that write is guaranteed to be - visible to downstream device work. The device is permitted to reorder - remote writes internally. For example, this flag would be required if - two remote writes arrive in a defined order, the wait is satisfied by the - second write, and downstream work needs to observe the first write. 
- Support for this operation is restricted to selected platforms and can be - queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ -} CUstreamWaitValue_flags; - -/** - * Flags for ::cuStreamWriteValue32 - */ -typedef enum CUstreamWriteValue_flags_enum { - CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ - CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued - before it, as a performance optimization. Normally, - ::cuStreamWriteValue32 will provide a memory fence before the - write, which has similar semantics to - __threadfence_system() but is scoped to the stream - rather than a CUDA thread. */ -} CUstreamWriteValue_flags; - -/** - * Operations for ::cuStreamBatchMemOp - */ -typedef enum CUstreamBatchMemOpType_enum { - CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ - CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ - CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a - standalone operation. */ -} CUstreamBatchMemOpType; - -/** - * Per-operation parameters for ::cuStreamBatchMemOp - */ -typedef union CUstreamBatchMemOpParams_union { - CUstreamBatchMemOpType operation; - struct CUstreamMemOpWaitValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ - } waitValue; - struct CUstreamMemOpWriteValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. 
*/ - } writeValue; - struct CUstreamMemOpFlushRemoteWritesParams_st { - CUstreamBatchMemOpType operation; - unsigned int flags; - } flushRemoteWrites; - cuuint64_t pad[6]; -} CUstreamBatchMemOpParams_v1; -typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; - -/** - * Occupancy calculator flag - */ -typedef enum CUoccupancy_flags_enum { - CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ - CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ -} CUoccupancy_flags; - -/** - * Flags for ::cuStreamUpdateCaptureDependencies - */ -typedef enum CUstreamUpdateCaptureDependencies_flags_enum { - CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ - CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ -} CUstreamUpdateCaptureDependencies_flags; - -/** - * Array formats - */ -typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ - CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ - CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ - CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ - CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ - CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ - CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ - CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ - CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed 
normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ - CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ -} CUarray_format; - -/** - * Texture reference addressing modes - */ -typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ - CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ - CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ - CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ -} CUaddress_mode; - -/** - * Texture reference filtering modes - */ -typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ - CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ -} CUfilter_mode; - -/** - * Device properties - */ -typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. 
*/ - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ - CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. 
*/ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. 
*/ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. 
See \ref CUDA_MEMOP for additional details. */ - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ - CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ - CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ - CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ - CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ - CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ - CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ - CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ - CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ - CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. 
*/ - CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ - - - - - CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ - - CU_DEVICE_ATTRIBUTE_MAX -} CUdevice_attribute; - -/** - * Legacy device properties - */ -typedef struct CUdevprop_st { - int maxThreadsPerBlock; /**< Maximum number of threads per block */ - int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ - int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ - int sharedMemPerBlock; /**< Shared memory available per block in bytes */ - int totalConstantMemory; /**< Constant memory available on device in bytes */ - int SIMDWidth; /**< Warp size in threads */ - int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ - int regsPerBlock; /**< 32-bit registers available per block */ - int clockRate; /**< Clock frequency in kilohertz */ - int textureAlign; /**< Alignment requirement for textures */ -} CUdevprop_v1; -typedef CUdevprop_v1 CUdevprop; - -/** - * Pointer information - */ -typedef enum CUpointer_attribute_enum { - CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ - CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ - CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ - CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ - CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ - CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ - CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ - CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ - CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ - CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ -} CUpointer_attribute; - -/** - * Function properties - */ -typedef enum CUfunction_attribute_enum { - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. 
This number depends on both the function and the - * device on which the function is currently loaded. - */ - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - - /** - * The size in bytes of statically-allocated shared memory required by - * this function. This does not include dynamically-allocated shared - * memory requested by the user at runtime. - */ - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - - /** - * The size in bytes of user-allocated constant memory required by this - * function. - */ - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - - /** - * The size in bytes of local memory used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - - /** - * The number of registers used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, - - /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. - * Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - */ - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. Note that - * this will return a value of 10 for legacy cubins that do not have a - * properly-encoded binary architecture version. - */ - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - - /** - * The attribute to indicate whether the function has been compiled with - * user specified option "-Xptxas --dlcm=ca" set . - */ - CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, - - /** - * The maximum size in bytes of dynamically-allocated shared memory that can be used by - * this function. If the user-specified dynamic shared memory size is larger than this - * value, the launch will fail. - * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, - - /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total shared memory. - * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. - * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
- * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - CU_FUNC_ATTRIBUTE_MAX -} CUfunction_attribute; - -/** - * Function cache configurations - */ -typedef enum CUfunc_cache_enum { - CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ - CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ - CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ - CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ -} CUfunc_cache; - -/** - * Shared memory configurations - */ -typedef enum CUsharedconfig_enum { - CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ - CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ - CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ -} CUsharedconfig; - -/** - * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute - */ -typedef enum CUshared_carveout_enum { - CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ - CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ - CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ -} CUshared_carveout; - -/** - * Memory types - */ -typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ - CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ - CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ - CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ -} CUmemorytype; - -/** - * Compute Modes - */ -typedef enum CUcomputemode_enum { - CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ - CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ - CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ -} CUcomputemode; - -/** - * Memory advise values - */ -typedef enum CUmem_advise_enum { - CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */ - CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ - CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ - CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ - CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ -} CUmem_advise; - -typedef enum CUmem_range_attribute_enum { - CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */ - CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ - CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ - 
CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ -} CUmem_range_attribute; - -/** - * Online compiler and linker options - */ -typedef enum CUjit_option_enum -{ - /** - * Max number of registers that a thread may use.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_MAX_REGISTERS = 0, - - /** - * IN: Specifies minimum number of threads per block to target compilation - * for\n - * OUT: Returns the number of threads the compiler actually targeted. - * This restricts the resource utilization fo the compiler (e.g. max - * registers) such that a block with the given number of threads should be - * able to launch based on register limitations. Note, this option does not - * currently take into account any other resource limitations, such as - * shared memory utilization.\n - * Cannot be combined with ::CU_JIT_TARGET.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_THREADS_PER_BLOCK, - - /** - * Overwrites the option value with the total wall clock time, in - * milliseconds, spent in the compiler and linker\n - * Option type: float\n - * Applies to: compiler and linker - */ - CU_JIT_WALL_TIME, - - /** - * Pointer to a buffer in which to print any log messages - * that are informational in nature (the buffer size is specified via - * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - - /** - * Pointer to a buffer in which to print any log messages that - * reflect errors (the buffer size is specified via option - * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - - /** - * Level of optimizations to apply to generated code (0 - 4), with 4 - * being the default and highest level of optimizations.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_OPTIMIZATION_LEVEL, - - /** - * No option value required. Determines the target based on the current - * attached context (default)\n - * Option type: No option value needed\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET_FROM_CUCONTEXT, - - /** - * Target is chosen based on supplied ::CUjit_target. Cannot be - * combined with ::CU_JIT_THREADS_PER_BLOCK.\n - * Option type: unsigned int for enumerated type ::CUjit_target\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET, - - /** - * Specifies choice of fallback strategy if matching cubin is not found. - * Choice is based on supplied ::CUjit_fallback. 
This option cannot be - * used with cuLink* APIs as the linker requires exact matches.\n - * Option type: unsigned int for enumerated type ::CUjit_fallback\n - * Applies to: compiler only - */ - CU_JIT_FALLBACK_STRATEGY, - - /** - * Specifies whether to create debug information in output (-g) - * (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_GENERATE_DEBUG_INFO, - - /** - * Generate verbose log messages (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LOG_VERBOSE, - - /** - * Generate line number information (-lineinfo) (0: false, default)\n - * Option type: int\n - * Applies to: compiler only - */ - CU_JIT_GENERATE_LINE_INFO, - - /** - * Specifies whether to enable caching explicitly (-dlcm) \n - * Choice is based on supplied ::CUjit_cacheMode_enum.\n - * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n - * Applies to: compiler only - */ - CU_JIT_CACHE_MODE, - - /** - * The below jit options are used for internal purposes only, in this version of CUDA - */ - CU_JIT_NEW_SM3X_OPT, - CU_JIT_FAST_COMPILE, - - /** - * Array of device symbol names that will be relocated to the corresponing - * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * When loding a device module, driver will relocate all encountered - * unresolved symbols to the host addresses.\n - * It is only allowed to register symbols that correspond to unresolved - * global variables.\n - * It is illegal to register the same device symbol at multiple addresses.\n - * Option type: const char **\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_NAMES, - - /** - * Array of host addresses that will be used to relocate corresponding - * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * Option type: void **\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_ADDRESSES, - - /** - * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and - * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n - * Option type: unsigned int\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_COUNT, - - /** - * Enable link-time optimization (-dlto) for device code (0: false, default).\n - * This option is not supported on 32-bit platforms.\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LTO, - - /** - * Control single-precision denormals (-ftz) support (0: false, default). - * 1 : flushes denormal values to zero - * 0 : preserves denormal values - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FTZ, - - /** - * Control single-precision floating-point division and reciprocals - * (-prec-div) support (1: true, default). - * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_DIV, - - /** - * Control single-precision floating-point square root - * (-prec-sqrt) support (1: true, default). 
- * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_SQRT, - - /** - * Enable/Disable the contraction of floating-point multiplies - * and adds/subtracts into floating-point multiply-add (-fma) - * operations (1: Enable, default; 0: Disable). - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FMA, - - CU_JIT_NUM_OPTIONS - -} CUjit_option; - -/** - * Online compilation targets - */ -typedef enum CUjit_target_enum -{ - - CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ - CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ - - - CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ - CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ - CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ - CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ - - - CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ - CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ - CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ - - - CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ - CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ - CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - - - CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ - CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ - - CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ - - CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ - CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ - -} CUjit_target; - -/** - * Cubin matching fallback strategies - */ -typedef enum CUjit_fallback_enum -{ - CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ - - CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ - -} CUjit_fallback; - -/** - * Caching modes for dlcm - */ -typedef enum CUjit_cacheMode_enum -{ - CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ - CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ - CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ -} CUjit_cacheMode; - -/** - * Device code formats - */ -typedef enum CUjitInputType_enum -{ - /** - * Compiled device-class-specific device code\n - * Applicable options: none - */ - CU_JIT_INPUT_CUBIN = 0, - - /** - * PTX source code\n - * Applicable options: PTX compiler options - */ - CU_JIT_INPUT_PTX, - - /** - * Bundle of multiple cubins and/or PTX of some device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_FATBINARY, - - /** - * Host object with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_OBJECT, - - /** - * Archive of host objects with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_LIBRARY, - - /** - * High-level intermediate code for link-time optimization\n - * Applicable options: NVVM compiler options, PTX compiler options - */ - CU_JIT_INPUT_NVVM, - - CU_JIT_NUM_INPUT_TYPES -} CUjitInputType; - -typedef struct CUlinkState_st *CUlinkState; - -/** - * Flags to register a graphics resource - */ -typedef enum CUgraphicsRegisterFlags_enum { - CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, - CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, - 
CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, - CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, - CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 -} CUgraphicsRegisterFlags; - -/** - * Flags for mapping and unmapping interop resources - */ -typedef enum CUgraphicsMapResourceFlags_enum { - CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 -} CUgraphicsMapResourceFlags; - -/** - * Array indices for cube faces - */ -typedef enum CUarray_cubemap_face_enum { - CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ -} CUarray_cubemap_face; - -/** - * Limits - */ -typedef enum CUlimit_enum { - CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ - CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ - CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ - CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ - CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ - CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ - CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ - CU_LIMIT_MAX -} CUlimit; - -/** - * Resource types - */ -typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ -} CUresourcetype; - -#ifdef _WIN32 -#define CUDA_CB __stdcall -#else -#define CUDA_CB -#endif - -/** - * CUDA host function - * \param userData Argument value passed to the function - */ -typedef void (CUDA_CB *CUhostFn)(void *userData); - -/** - * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. - */ -typedef enum CUaccessProperty_enum { - CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ - CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. */ - CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ -} CUaccessProperty; - -/** - * Specifies an access policy for a window, a contiguous extent of memory - * beginning at base_ptr and ending at base_ptr + num_bytes. - * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. - * Partition into many segments and assign segments such that: - * sum of "hit segments" / window == approx. ratio. - * sum of "miss segments" / window == approx 1-ratio. - * Segments and ratio specifications are fitted to the capabilities of - * the architecture. - * Accesses in a hit segment apply the hitProp access policy. - * Accesses in a miss segment apply the missProp access policy. - */ -typedef struct CUaccessPolicyWindow_st { - void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ - size_t num_bytes; /**< Size in bytes of the window policy. 
CUDA driver may restrict the maximum size and alignment. */ - float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ - CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ - CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ -} CUaccessPolicyWindow_v1; -typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; - -/** - * GPU kernel node parameters - */ -typedef struct CUDA_KERNEL_NODE_PARAMS_st { - CUfunction func; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - void **kernelParams; /**< Array of pointers to kernel parameters */ - void **extra; /**< Extra options */ -} CUDA_KERNEL_NODE_PARAMS_v1; -typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; - -/** - * Memset node parameters - */ -typedef struct CUDA_MEMSET_NODE_PARAMS_st { - CUdeviceptr dst; /**< Destination device pointer */ - size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ - unsigned int value; /**< Value to be set */ - unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ - size_t width; /**< Width of the row in elements */ - size_t height; /**< Number of rows */ -} CUDA_MEMSET_NODE_PARAMS_v1; -typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; - -/** - * Host node parameters - */ -typedef struct CUDA_HOST_NODE_PARAMS_st { - CUhostFn fn; /**< The function to call when the node executes */ - void* userData; /**< Argument to pass to the function */ -} CUDA_HOST_NODE_PARAMS_v1; -typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; - -/** - * Graph node types - */ -typedef enum CUgraphNodeType_enum { - CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ - CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ - CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ - CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ - CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ - CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ - CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ - CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ - CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ - CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ -} CUgraphNodeType; - -typedef enum CUsynchronizationPolicy_enum { - CU_SYNC_POLICY_AUTO = 1, - CU_SYNC_POLICY_SPIN = 2, - CU_SYNC_POLICY_YIELD = 3, - CU_SYNC_POLICY_BLOCKING_SYNC = 4 -} CUsynchronizationPolicy; - -/** - * Graph kernel node Attributes - */ -typedef enum CUkernelNodeAttrID_enum { - CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ - CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). 
*/ -} CUkernelNodeAttrID; - -/** - * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute - */ -typedef union CUkernelNodeAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ -} CUkernelNodeAttrValue_v1; -typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; - -/** - * Possible stream capture statuses returned by ::cuStreamIsCapturing - */ -typedef enum CUstreamCaptureStatus_enum { - CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ - CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ - CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that - has been invalidated, but not terminated */ -} CUstreamCaptureStatus; - -/** - * Possible modes for stream capture thread interactions. For more details see - * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode - */ -typedef enum CUstreamCaptureMode_enum { - CU_STREAM_CAPTURE_MODE_GLOBAL = 0, - CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, - CU_STREAM_CAPTURE_MODE_RELAXED = 2 -} CUstreamCaptureMode; - -/** - * Stream Attributes - */ -typedef enum CUstreamAttrID_enum { - CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ - CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< ::CUsynchronizationPolicy for work queued up in this stream */ -} CUstreamAttrID; - -/** - * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute - */ -typedef union CUstreamAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ -} CUstreamAttrValue_v1; -typedef CUstreamAttrValue_v1 CUstreamAttrValue; - -/** - * Flags to specify search options. For more details see ::cuGetProcAddress - */ -typedef enum CUdriverProcAddress_flags_enum { - CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ - CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ -} CUdriverProcAddress_flags; - -/** - * Execution Affinity Types - */ -typedef enum CUexecAffinityType_enum { - CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ - CU_EXEC_AFFINITY_TYPE_MAX -} CUexecAffinityType; - -/** - * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT - */ -typedef struct CUexecAffinitySmCount_st { - unsigned int val; /**< The number of SMs the context is limited to use. */ -} CUexecAffinitySmCount_v1; -typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; - -/** - * Execution Affinity Parameters - */ -typedef struct CUexecAffinityParam_st { - CUexecAffinityType type; - union { - CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ - } param; -} CUexecAffinityParam_v1; -typedef CUexecAffinityParam_v1 CUexecAffinityParam; - -/** - * Error codes - */ -typedef enum cudaError_enum { - /** - * The API call returned with no errors. In the case of query calls, this - * also means that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). 
- */ - CUDA_SUCCESS = 0, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - CUDA_ERROR_INVALID_VALUE = 1, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - CUDA_ERROR_OUT_OF_MEMORY = 2, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. - */ - CUDA_ERROR_NOT_INITIALIZED = 3, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - CUDA_ERROR_DEINITIALIZED = 4, - - /** - * This indicates profiler is not initialized for this run. This can - * happen when the application is running with external profiling tools - * like visual profiler. - */ - CUDA_ERROR_PROFILER_DISABLED = 5, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to attempt to enable/disable the profiling via ::cuProfilerStart or - * ::cuProfilerStop without initialization. - */ - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStart() when profiling is already enabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStop() when profiling is already disabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - - /** - * This indicates that the CUDA driver that the application has loaded is a - * stub library. Applications that run with the stub rather than a real - * driver loaded will result in CUDA API returning this error. - */ - CUDA_ERROR_STUB_LIBRARY = 34, - - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - CUDA_ERROR_NO_DEVICE = 100, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device or that the action requested is - * invalid for the specified device. - */ - CUDA_ERROR_INVALID_DEVICE = 101, - - /** - * This error indicates that the Grid license is not applied. - */ - CUDA_ERROR_DEVICE_NOT_LICENSED = 102, - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - CUDA_ERROR_INVALID_IMAGE = 200, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - CUDA_ERROR_INVALID_CONTEXT = 201, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - - /** - * This indicates that a map or register operation has failed. - */ - CUDA_ERROR_MAP_FAILED = 205, - - /** - * This indicates that an unmap or unregister operation has failed. 
- */ - CUDA_ERROR_UNMAP_FAILED = 206, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - - /** - * This indicates that the resource is already mapped. - */ - CUDA_ERROR_ALREADY_MAPPED = 208, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - - /** - * This indicates that a resource has already been acquired. - */ - CUDA_ERROR_ALREADY_ACQUIRED = 210, - - /** - * This indicates that a resource is not mapped. - */ - CUDA_ERROR_NOT_MAPPED = 211, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, - - /** - * This indicates that a PTX JIT compilation failed. - */ - CUDA_ERROR_INVALID_PTX = 218, - - /** - * This indicates an error with OpenGL or DirectX context. - */ - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, - - /** - * This indicates that an uncorrectable NVLink error was detected during the - * execution. - */ - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, - - /** - * This indicates that the PTX JIT compiler library was not found. - */ - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, - - /** - * This indicates that the provided PTX was compiled with an unsupported toolchain. - */ - - CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, - - /** - * This indicates that the PTX JIT compilation was disabled. - */ - CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, - - /** - * This indicates that the ::CUexecAffinityType passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, - - /** - * This indicates that the device kernel source is invalid. This includes - * compilation/linker errors encountered in device code or user error. - */ - CUDA_ERROR_INVALID_SOURCE = 300, - - /** - * This indicates that the file specified was not found. - */ - CUDA_ERROR_FILE_NOT_FOUND = 301, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - - /** - * This indicates that initialization of a shared object failed. - */ - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - - /** - * This indicates that an OS call failed. - */ - CUDA_ERROR_OPERATING_SYSTEM = 304, - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. 
- */ - CUDA_ERROR_INVALID_HANDLE = 400, - - /** - * This indicates that a resource required by the API call is not in a - * valid state to perform the requested operation. - */ - CUDA_ERROR_ILLEGAL_STATE = 401, - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, driver function names, texture names, - * and surface names. - */ - CUDA_ERROR_NOT_FOUND = 500, - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - CUDA_ERROR_NOT_READY = 600, - - /** - * While executing a kernel, the device encountered a - * load or store instruction on an invalid memory address. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_ADDRESS = 700, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_TIMEOUT = 702, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy, or is a primary context which - * has not yet been initialized. - */ - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. 
- */ - CUDA_ERROR_ASSERT = 710, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_TOO_MANY_PEERS = 711, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, - - /** - * While executing a kernel, the device encountered a stack error. - * This can be due to stack corruption or exceeding the stack size limit. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_HARDWARE_STACK_ERROR = 714, - - /** - * While executing a kernel, the device encountered an illegal instruction. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, - - /** - * While executing a kernel, the device encountered a load or store instruction - * on a memory address which is not aligned. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_MISALIGNED_ADDRESS = 716, - - /** - * While executing a kernel, the device encountered an instruction - * which can only operate on memory locations in certain address spaces - * (global, shared, or local), but was supplied a memory address not - * belonging to an allowed address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, - - /** - * While executing a kernel, the device program counter wrapped its address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_PC = 718, - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. Less common cases can be system specific - more - * information about these cases can be found in the system specific user guide. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_FAILED = 719, - - /** - * This error indicates that the number of blocks launched per grid for a kernel that was - * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice - * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor - * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. 
- */ - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, - - /** - * This error indicates that the attempted operation is not permitted. - */ - CUDA_ERROR_NOT_PERMITTED = 800, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - CUDA_ERROR_NOT_SUPPORTED = 801, - - /** - * This error indicates that the system is not yet ready to start any CUDA - * work. To continue using CUDA, verify the system configuration is in a - * valid state and all required driver daemons are actively running. - * More information about this error can be found in the system specific - * user guide. - */ - CUDA_ERROR_SYSTEM_NOT_READY = 802, - - /** - * This error indicates that there is a mismatch between the versions of - * the display driver and the CUDA driver. Refer to the compatibility documentation - * for supported versions. - */ - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, - - /** - * This error indicates that the system was upgraded to run with forward compatibility - * but the visible hardware detected by CUDA does not support this configuration. - * Refer to the compatibility documentation for the supported hardware matrix or ensure - * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES - * environment variable. - */ - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, - - /** - * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. - */ - CUDA_ERROR_MPS_CONNECTION_FAILED = 805, - - /** - * This error indicates that the remote procedural call between the MPS server and the MPS client failed. - */ - CUDA_ERROR_MPS_RPC_FAILURE = 806, - - /** - * This error indicates that the MPS server is not ready to accept new MPS client requests. - * This error can be returned when the MPS server is in the process of recovering from a fatal failure. - */ - CUDA_ERROR_MPS_SERVER_NOT_READY = 807, - - /** - * This error indicates that the hardware resources required to create MPS client have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, - - /** - * This error indicates the the hardware resources required to support device connections have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, - - /** - * This error indicates that the operation is not permitted when - * the stream is capturing. - */ - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, - - /** - * This error indicates that the current capture sequence on the stream - * has been invalidated due to a previous error. - */ - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, - - /** - * This error indicates that the operation would have resulted in a merge - * of two independent capture sequences. - */ - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, - - /** - * This error indicates that the capture was not initiated in this stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, - - /** - * This error indicates that the capture sequence contains a fork that was - * not joined to the primary stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, - - /** - * This error indicates that a dependency would have been created which - * crosses the capture sequence boundary. Only implicit in-stream ordering - * dependencies are allowed to cross the boundary. - */ - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, - - /** - * This error indicates a disallowed implicit dependency on a current capture - * sequence from cudaStreamLegacy. 
- */ - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, - - /** - * This error indicates that the operation is not permitted on an event which - * was last recorded in a capturing stream. - */ - CUDA_ERROR_CAPTURED_EVENT = 907, - - /** - * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED - * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a - * different thread. - */ - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, - - /** - * This error indicates that the timeout specified for the wait operation has lapsed. - */ - CUDA_ERROR_TIMEOUT = 909, - - /** - * This error indicates that the graph update was not performed because it included - * changes which violated constraints specific to instantiated graph update. - */ - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, - - /** - * This indicates that an async error has occurred in a device outside of CUDA. - * If CUDA was waiting for an external device's signal before consuming shared data, - * the external device signaled an error indicating that the data is not valid for - * consumption. This leaves the process in an inconsistent state and any further CUDA - * work will return the same error. To continue using CUDA, the process must be - * terminated and relaunched. - */ - CUDA_ERROR_EXTERNAL_DEVICE = 911, - - - - - - - - - /** - * This indicates that an unknown internal error has occurred. - */ - CUDA_ERROR_UNKNOWN = 999 -} CUresult; - -/** - * P2P Attributes - */ -typedef enum CUdevice_P2PAttribute_enum { - CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ - CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ - CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ -} CUdevice_P2PAttribute; - - - - - - - - - - - - -/** - * CUDA stream callback - * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. - * \param status ::CUDA_SUCCESS or any persistent error on the stream. - * \param userData User parameter provided at registration. - */ -typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); - -/** - * Block size to per-block dynamic shared memory mapping for a certain - * kernel \param blockSize Block size of the kernel. - * - * \return The dynamic shared memory needed by a block. - */ -typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); - -/** - * If set, host memory is portable between CUDA contexts. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 - -/** - * If set, host memory is allocated as write-combined - fast to write, - * faster to DMA, slow to read except via SSE4 streaming load instruction - * (MOVNTDQA). - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 - -/** - * If set, host memory is portable between CUDA contexts. 
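A common way to consume CUresult in application code (a sketch, not part of the header) is a check macro built on cuGetErrorName()/cuGetErrorString(), together with treating ::CUDA_ERROR_NOT_READY from the query calls as "still running" rather than as a failure:

#include <stdio.h>
#include <cuda.h>

/* Report the symbolic name and description of a failing call and propagate it.
 * Intended for use inside functions that themselves return CUresult. */
#define CU_CHECK(call)                                                     \
    do {                                                                   \
        CUresult _e = (call);                                              \
        if (_e != CUDA_SUCCESS) {                                          \
            const char *name = NULL, *desc = NULL;                         \
            cuGetErrorName(_e, &name);                                     \
            cuGetErrorString(_e, &desc);                                   \
            fprintf(stderr, "%s failed: %s (%s)\n", #call,                 \
                    name ? name : "?", desc ? desc : "?");                 \
            return _e;                                                     \
        }                                                                  \
    } while (0)

/* CUDA_ERROR_NOT_READY from cuStreamQuery()/cuEventQuery() is not a failure;
 * it only means the queried work has not completed yet. */
static int stream_is_done(CUstream s) {
    return cuStreamQuery(s) == CUDA_SUCCESS;
}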
- * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 - -/** - * If set, the passed memory pointer is treated as pointing to some - * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. - * On Windows the flag is a no-op. - * On Linux that memory is marked as non cache-coherent for the GPU and - * is expected to be physically contiguous. It may return - * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, - * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. - * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED - * is returned. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 - -/** -* If set, the passed memory pointer is treated as pointing to memory that is -* considered read-only by the device. On platforms without -* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is -* required in order to register memory mapped to the CPU as read-only. Support -* for the use of this flag can be queried from the device attribute -* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with -* a current context associated with a device that does not have this attribute -* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. -*/ -#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 - -/** - * 2D memory copy parameters - */ -typedef struct CUDA_MEMCPY2D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - - size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ - size_t Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D_v2; -typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; - -/** - * 3D memory copy parameters - */ -typedef struct CUDA_MEMCPY3D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, 
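Putting the host-allocation flags and ::CUDA_MEMCPY2D together, a sketch of a host-to-device pitched copy (assumes `dDst`/`dstPitch` came from cuMemAllocPitch(); error paths shortened):

#include <string.h>
#include <cuda.h>

/* Copy `height` rows of `widthBytes` each from freshly allocated, mapped host
 * memory into a pitched device allocation. */
static CUresult copy_rows_h2d(CUdeviceptr dDst, size_t dstPitch,
                              size_t widthBytes, size_t height) {
    void *hSrc = NULL;
    CUresult err = cuMemHostAlloc(&hSrc, widthBytes * height,
                                  CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
    if (err != CUDA_SUCCESS) return err;

    /* ... fill hSrc ... */

    CUDA_MEMCPY2D c;
    memset(&c, 0, sizeof(c));
    c.srcMemoryType = CU_MEMORYTYPE_HOST;
    c.srcHost       = hSrc;
    c.srcPitch      = widthBytes;        /* tightly packed host rows */
    c.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    c.dstDevice     = dDst;
    c.dstPitch      = dstPitch;
    c.WidthInBytes  = widthBytes;
    c.Height        = height;

    err = cuMemcpy2D(&c);
    cuMemFreeHost(hSrc);
    return err;
}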
array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_v2; -typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; - -/** - * 3D memory cross-context copy parameters - */ -typedef struct CUDA_MEMCPY3D_PEER_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_PEER_v1; -typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; - -/** - * Array descriptor - */ -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - size_t Width; /**< Width of array */ - size_t Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR_v2; -typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; - -/** - * 3D array descriptor - */ -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - size_t Width; /**< Width of 3D array */ - size_t Height; /**< Height of 3D array */ - size_t Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR_v2; -typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; - -/** - * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - */ -#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 - -/** - * CUDA array sparse properties - */ -typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { - struct { - unsigned int width; /**< Width of sparse tile in elements */ - unsigned int height; /**< Height of sparse tile in elements */ - unsigned int depth; /**< Depth of sparse 
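A short sketch of feeding ::CUDA_ARRAY3D_DESCRIPTOR to cuArray3DCreate() (single-channel float array, no special flags):

#include <cuda.h>

/* Allocate a single-channel float 3-D CUDA array described by
 * CUDA_ARRAY3D_DESCRIPTOR. */
static CUresult make_3d_array(size_t w, size_t h, size_t d, CUarray *out) {
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Width       = w;
    desc.Height      = h;
    desc.Depth       = d;
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Flags       = 0;
    return cuArray3DCreate(out, &desc);
}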
tile in elements */ - } tileExtent; - - /** - * First mip level at which the mip tail begins. - */ - unsigned int miptailFirstLevel; - /** - * Total size of the mip tail. - */ - unsigned long long miptailSize; - /** - * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL - */ - unsigned int flags; - unsigned int reserved[4]; -} CUDA_ARRAY_SPARSE_PROPERTIES_v1; -typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; - - -/** - * CUDA array memory requirements - */ -typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { - size_t size; /**< Total required memory size */ - size_t alignment; /**< alignment requirement */ - unsigned int reserved[4]; -} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; -typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; - - -/** - * CUDA Resource descriptor - */ -typedef struct CUDA_RESOURCE_DESC_st -{ - CUresourcetype resType; /**< Resource type */ - - union { - struct { - CUarray hArray; /**< CUDA array */ - } array; - struct { - CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ - } mipmap; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - struct { - int reserved[32]; - } reserved; - } res; - - unsigned int flags; /**< Flags (must be zero) */ -} CUDA_RESOURCE_DESC_v1; -typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; - -/** - * Texture descriptor - */ -typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; /**< Address modes */ - CUfilter_mode filterMode; /**< Filter mode */ - unsigned int flags; /**< Flags */ - unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ - CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ - float mipmapLevelBias; /**< Mipmap level bias */ - float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ - float borderColor[4]; /**< Border Color */ - int reserved[12]; -} CUDA_TEXTURE_DESC_v1; -typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; - -/** - * Resource view format - */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit 
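A sketch of combining ::CUDA_RESOURCE_DESC and ::CUDA_TEXTURE_DESC into a texture object via cuTexObjectCreate() (assumes `arr` is an existing CUarray; no resource view is supplied, so the array's own format is used):

#include <string.h>
#include <cuda.h>

/* Create a texture object over an existing CUDA array with clamped
 * addressing, linear filtering and normalized coordinates. */
static CUresult make_texture(CUarray arr, CUtexObject *texOut) {
    CUDA_RESOURCE_DESC res;
    memset(&res, 0, sizeof(res));
    res.resType          = CU_RESOURCE_TYPE_ARRAY;
    res.res.array.hArray = arr;

    CUDA_TEXTURE_DESC tex;
    memset(&tex, 0, sizeof(tex));
    tex.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    tex.flags          = CU_TRSF_NORMALIZED_COORDINATES;

    return cuTexObjectCreate(texOut, &res, &tex, /*pResViewDesc=*/NULL);
}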
integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ -} CUresourceViewFormat; - -/** - * Resource view descriptor - */ -typedef struct CUDA_RESOURCE_VIEW_DESC_st -{ - CUresourceViewFormat format; /**< Resource view format */ - size_t width; /**< Width of the resource view */ - size_t height; /**< Height of the resource view */ - size_t depth; /**< Depth of the resource view */ - unsigned int firstMipmapLevel; /**< First defined mipmap level */ - unsigned int lastMipmapLevel; /**< Last defined mipmap level */ - unsigned int firstLayer; /**< First layer index */ - unsigned int lastLayer; /**< Last layer index */ - unsigned int reserved[16]; -} CUDA_RESOURCE_VIEW_DESC_v1; -typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; - -/** - * GPU Direct v3 tokens - */ -typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { - unsigned long long p2pToken; - unsigned int vaSpaceToken; -} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; -typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; - -/** -* Access flags that specify the level of access the current context's device has -* on the memory referenced. -*/ -typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. 
*/ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ -} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; - -/** - * Kernel launch parameters - */ -typedef struct CUDA_LAUNCH_PARAMS_st { - CUfunction function; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - CUstream hStream; /**< Stream identifier */ - void **kernelParams; /**< Array of pointers to kernel parameters */ -} CUDA_LAUNCH_PARAMS_v1; -typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; - -/** - * External memory handle types - */ -typedef enum CUexternalMemoryHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is an opaque shared NT handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a D3D12 heap object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - /** - * Handle is a D3D12 committed resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - /** - * Handle is a shared NT handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - /** - * Handle is a globally shared handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - /** - * Handle is an NvSciBuf object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 -} CUexternalMemoryHandleType; - -/** - * Indicates that the external memory object is a dedicated resource - */ -#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS - * contains this flag, it indicates that signaling an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. - */ -#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS - * contains this flag, it indicates that waiting on an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. - */ -#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs signaler specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. 
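A sketch of how ::CUDA_LAUNCH_PARAMS is consumed by cuLaunchCooperativeKernelMultiDevice() for a two-device cooperative launch. The entry point is deprecated in recent toolkits in favor of per-device cuLaunchCooperativeKernel() calls; `f`, `s`, and `args` hold one function handle, stream, and argument list per device and are assumed to exist:

#include <string.h>
#include <cuda.h>

/* Fill one CUDA_LAUNCH_PARAMS entry per device and launch the same kernel
 * cooperatively on both. f[i], s[i] and args[i] must belong to device i's
 * context, and the kernel binaries must be identical. */
static CUresult coop_launch_2dev(CUfunction f[2], CUstream s[2], void **args[2]) {
    CUDA_LAUNCH_PARAMS lp[2];
    memset(lp, 0, sizeof(lp));
    for (int i = 0; i < 2; ++i) {
        lp[i].function       = f[i];
        lp[i].gridDimX       = 64;
        lp[i].gridDimY       = 1;
        lp[i].gridDimZ       = 1;
        lp[i].blockDimX      = 256;
        lp[i].blockDimY      = 1;
        lp[i].blockDimZ      = 1;
        lp[i].sharedMemBytes = 0;
        lp[i].hStream        = s[i];
        lp[i].kernelParams   = args[i];
    }
    return cuLaunchCooperativeKernelMultiDevice(lp, 2, /*flags=*/0);
}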
- */ -#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs waiter specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. - */ -#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 -/** - * External memory handle descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalMemoryHandleType type; - union { - /** - * File descriptor referencing the memory object. Valid - * when type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid memory object. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * A handle representing an NvSciBuf Object. Valid when type - * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF - */ - const void *nvSciBufObject; - } handle; - /** - * Size of the memory allocation - */ - unsigned long long size; - /** - * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - -/** - * External memory buffer descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - /** - * Offset into the memory object where the buffer's base is - */ - unsigned long long offset; - /** - * Size of the buffer - */ - unsigned long long size; - /** - * Flags reserved for future use. Must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - -/** - * External memory mipmap descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - /** - * Offset into the memory object where the base level of the - * mipmap chain is. 
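A sketch of importing an allocation exported by another API as an opaque POSIX file descriptor and mapping it as a device pointer, using the two descriptors above (error handling trimmed):

#include <string.h>
#include <cuda.h>

/* Import an allocation exported as an opaque POSIX fd and map `size` bytes of
 * it as a device pointer. On success the driver owns the fd. */
static CUresult import_fd_buffer(int fd, unsigned long long size,
                                 CUexternalMemory *extMemOut, CUdeviceptr *dptrOut) {
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC h;
    memset(&h, 0, sizeof(h));
    h.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    h.handle.fd = fd;
    h.size      = size;

    CUresult err = cuImportExternalMemory(extMemOut, &h);
    if (err != CUDA_SUCCESS) return err;

    CUDA_EXTERNAL_MEMORY_BUFFER_DESC b;
    memset(&b, 0, sizeof(b));
    b.offset = 0;
    b.size   = size;
    return cuExternalMemoryGetMappedBuffer(dptrOut, *extMemOut, &b);
}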
- */ - unsigned long long offset; - /** - * Format, dimension and type of base level of the mipmap chain - */ - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - /** - * Total number of levels in the mipmap chain - */ - unsigned int numLevels; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - -/** - * External semaphore handle types - */ -typedef enum CUexternalSemaphoreHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is an opaque shared NT handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a shared NT handle referencing a D3D12 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - /** - * Handle is a shared NT handle referencing a D3D11 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - /** - * Opaque handle to NvSciSync Object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - /** - * Handle is a shared NT handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - /** - * Handle is a globally shared handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - /** - * Handle is an opaque file descriptor referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - /** - * Handle is an opaque shared NT handle referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 -} CUexternalSemaphoreHandleType; - -/** - * External semaphore handle descriptor - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalSemaphoreHandleType type; - union { - /** - * File descriptor referencing the semaphore object. Valid - * when type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid synchronization primitive. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * Valid NvSciSyncObj. Must be non NULL - */ - const void* nvSciSyncObj; - } handle; - /** - * Flags reserved for the future. Must be zero. 
- */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - -/** - * External semaphore signal parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be signaled - */ - unsigned long long value; - } fence; - union { - /** - * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType - * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. - */ - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to release the mutex with - */ - unsigned long long key; - } keyedMutex; - unsigned int reserved[12]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to - * signal a ::CUexternalSemaphore of type - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates - * that while signaling the ::CUexternalSemaphore, no memory synchronization - * operations should be performed for any external memory object imported - * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; - -/** - * External semaphore wait parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be waited on - */ - unsigned long long value; - } fence; - /** - * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType - * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. - */ - union { - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to acquire the mutex with - */ - unsigned long long key; - /** - * Timeout in milliseconds to wait to acquire the mutex - */ - unsigned int timeoutMs; - } keyedMutex; - unsigned int reserved[10]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * which indicates that while waiting for the ::CUexternalSemaphore, no memory - * synchronization operations should be performed for any external memory - * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; - -/** - * Semaphore signal node parameters - */ -typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
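A sketch of the matching signal/wait calls for the parameter structures above (assumes `extSem` was created with cuImportExternalSemaphore() from a fence-style handle):

#include <string.h>
#include <cuda.h>

/* Queue a signal of `value` on an imported fence/timeline semaphore, then a
 * wait for that same value, both ordered on `stream`. */
static CUresult signal_then_wait(CUexternalSemaphore extSem,
                                 unsigned long long value, CUstream stream) {
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    memset(&sig, 0, sizeof(sig));
    sig.params.fence.value = value;

    CUresult err = cuSignalExternalSemaphoresAsync(&extSem, &sig, 1, stream);
    if (err != CUDA_SUCCESS) return err;

    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wp;
    memset(&wp, 0, sizeof(wp));
    wp.params.fence.value = value;
    return cuWaitExternalSemaphoresAsync(&extSem, &wp, 1, stream);
}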
*/ -} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; - -/** - * Semaphore wait node parameters - */ -typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ -} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; - -typedef unsigned long long CUmemGenericAllocationHandle_v1; -typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; - -/** - * Flags for specifying particular handle types - */ -typedef enum CUmemAllocationHandleType_enum { - CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ - CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ - CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ - CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationHandleType; - -/** - * Specifies the memory protection flags for mapping. - */ -typedef enum CUmemAccess_flags_enum { - CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ - CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF -} CUmemAccess_flags; - -/** - * Specifies the type of location - */ -typedef enum CUmemLocationType_enum { - CU_MEM_LOCATION_TYPE_INVALID = 0x0, - CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ - CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemLocationType; - -/** -* Defines the allocation types available -*/ -typedef enum CUmemAllocationType_enum { - CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, - - /** This allocation type is 'pinned', i.e. cannot migrate from its current - * location while the application is actively using it - */ - CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, - CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationType; - -/** -* Flag for requesting different optimal and required granularities for an allocation. 
-*/ -typedef enum CUmemAllocationGranularity_flags_enum { - CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ - CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ -} CUmemAllocationGranularity_flags; - -/** - * Sparse subresource types - */ -typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 -} CUarraySparseSubresourceType; - -/** - * Memory operation types - */ -typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 -} CUmemOperationType; - -/** - * Memory handle types - */ -typedef enum CUmemHandleType_enum { - CU_MEM_HANDLE_TYPE_GENERIC = 0 -} CUmemHandleType; - -/** - * Specifies the CUDA array or CUDA mipmapped array memory mapping information - */ -typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; /**< Resource type */ - - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ - - union { - struct { - unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ - unsigned int offsetX; /**< Starting X offset in elements */ - unsigned int offsetY; /**< Starting Y offset in elements */ - unsigned int offsetZ; /**< Starting Z offset in elements */ - unsigned int extentWidth; /**< Width in elements */ - unsigned int extentHeight; /**< Height in elements */ - unsigned int extentDepth; /**< Depth in elements */ - } sparseLevel; - struct { - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ - unsigned long long offset; /**< Offset within mip tail */ - unsigned long long size; /**< Extent in bytes */ - } miptail; - } subresource; - - CUmemOperationType memOperationType; /**< Memory operation type */ - CUmemHandleType memHandleType; /**< Memory handle type */ - - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; /**< Offset within the memory */ - unsigned int deviceBitMask; /**< Device ordinal bit mask */ - unsigned int flags; /**< flags for future use, must be zero now. */ - unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ -} CUarrayMapInfo_v1; -typedef CUarrayMapInfo_v1 CUarrayMapInfo; - -/** - * Specifies a memory location. - */ -typedef struct CUmemLocation_st { - CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ - int id; /**< identifier for a given this location's ::CUmemLocationType. */ -} CUmemLocation_v1; -typedef CUmemLocation_v1 CUmemLocation; - -/** - * Specifies compression attribute for an allocation. - */ -typedef enum CUmemAllocationCompType_enum { - CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ - CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ -} CUmemAllocationCompType; - -/** - * This flag if set indicates that the memory will be used as a tile pool. - */ -#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 - -/** -* Specifies the allocation properties for a allocation. 
-*/ -typedef struct CUmemAllocationProp_st { - /** Allocation type */ - CUmemAllocationType type; - /** requested ::CUmemAllocationHandleType */ - CUmemAllocationHandleType requestedHandleTypes; - /** Location of allocation */ - CUmemLocation location; - /** - * Windows-specific POBJECT_ATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure - * includes security attributes that define - * the scope of which exported allocations may be transferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32HandleMetaData; - struct { - /** - * Allocation hint for requesting compressible memory. - * On devices that support Compute Data Compression, compressible - * memory can be used to accelerate accesses to data with unstructured - * sparsity and other compressible data patterns. Applications are - * expected to query allocation property of the handle obtained with - * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to - * validate if the obtained allocation is compressible or not. Note that - * compressed memory may not be mappable on all devices. - */ - unsigned char compressionType; - unsigned char gpuDirectRDMACapable; - /** Bitmask indicating intended usage for this allocation */ - unsigned short usage; - unsigned char reserved[4]; - } allocFlags; -} CUmemAllocationProp_v1; -typedef CUmemAllocationProp_v1 CUmemAllocationProp; - -/** - * Memory access descriptor - */ -typedef struct CUmemAccessDesc_st { - CUmemLocation location; /**< Location on which the request is to change its accessibility */ - CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ -} CUmemAccessDesc_v1; -typedef CUmemAccessDesc_v1 CUmemAccessDesc; - -typedef enum CUgraphExecUpdateResult_enum { - CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ - CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ -} CUgraphExecUpdateResult; - -/** - * CUDA memory pool attributes - */ -typedef enum CUmemPool_attribute_enum { - /** - * (value type = int) - * Allow cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * CUDA events and null stream interactions can create the required - * stream ordered dependencies.
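A sketch of the low-level virtual memory flow built on ::CUmemAllocationProp, ::CUmemLocation and ::CUmemAccessDesc: pick a granularity, create physical memory, reserve a VA range, map it, and grant read-write access on one device (cleanup on failure omitted):

#include <string.h>
#include <cuda.h>

/* Reserve a VA range, back it with physical memory from cuMemCreate, and make
 * it read-write on `device`. Sizes are rounded up to the minimum granularity. */
static CUresult vmm_alloc_rw(int device, size_t bytes, CUdeviceptr *vaOut,
                             CUmemGenericAllocationHandle *handleOut) {
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    size_t gran = 0;
    CUresult err = cuMemGetAllocationGranularity(&gran, &prop,
                                                 CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (err != CUDA_SUCCESS) return err;
    size_t padded = ((bytes + gran - 1) / gran) * gran;

    err = cuMemCreate(handleOut, padded, &prop, 0);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemAddressReserve(vaOut, padded, /*alignment=*/0, /*addr=*/0, 0);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemMap(*vaOut, padded, /*offset=*/0, *handleOut, 0);
    if (err != CUDA_SUCCESS) return err;

    CUmemAccessDesc access;
    memset(&access, 0, sizeof(access));
    access.location = prop.location;
    access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    return cuMemSetAccess(*vaOut, padded, &access, 1);
}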
(default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, - - /** - * (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - - /** - * (value type = int) - * Allow cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by cuFreeAsync (default enabled). - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, - - /** - * (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - */ - CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - - /** - * (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - */ - CU_MEMPOOL_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application since - * the last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_USED_MEM_HIGH -} CUmemPool_attribute; - -/** - * Specifies the properties of allocations made from the pool. - */ -typedef struct CUmemPoolProps_st { - CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ - CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ - CUmemLocation location; /**< Location where allocations should reside. */ - /** - * Windows-specific LPSECURITYATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines - * the scope of which exported allocations may be tranferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32SecurityAttributes; - unsigned char reserved[64]; /**< reserved for future use, must be 0 */ -} CUmemPoolProps_v1; -typedef CUmemPoolProps_v1 CUmemPoolProps; - -/** - * Opaque data for exporting a pool allocation - */ -typedef struct CUmemPoolPtrExportData_st { - unsigned char reserved[64]; -} CUmemPoolPtrExportData_v1; -typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; - -/** - * Memory allocation node parameters - */ -typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { - /** - * in: location where the allocation should reside (specified in ::location). - * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. - */ - CUmemPoolProps poolProps; - const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ - size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. 
*/ - size_t bytesize; /**< in: size in bytes of the requested allocation */ - CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ -} CUDA_MEM_ALLOC_NODE_PARAMS; - -typedef enum CUgraphMem_attribute_enum { - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently associated with graphs - */ - CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH -} CUgraphMem_attribute; - -/** - * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only - * waits for prior work in the stream corresponding to that GPU to complete before the - * kernel begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 - -/** - * If set, any subsequent work pushed in a stream that participated in a call to - * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on - * the GPU corresponding to that stream to complete before it begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 - -/** - * If set, the CUDA array is a collection of layers, where each layer is either a 1D - * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number - * of layers, not the depth of a 3D array. - */ -#define CUDA_ARRAY3D_LAYERED 0x01 - -/** - * Deprecated, use CUDA_ARRAY3D_LAYERED - */ -#define CUDA_ARRAY3D_2DARRAY 0x01 - -/** - * This flag must be set in order to bind a surface reference - * to the CUDA array - */ -#define CUDA_ARRAY3D_SURFACE_LDST 0x02 - -/** - * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The - * width of such a CUDA array must be equal to its height, and Depth must be six. - * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps - * and Depth must be a multiple of six. - */ -#define CUDA_ARRAY3D_CUBEMAP 0x04 - -/** - * This flag must be set in order to perform texture gather operations - * on a CUDA array. - */ -#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 - -/** - * This flag if set indicates that the CUDA - * array is a DEPTH_TEXTURE. - */ -#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 - -/** - * This flag indicates that the CUDA array may be bound as a color target - * in an external graphics API - */ -#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 - -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * is a sparse CUDA array or CUDA mipmapped array respectively - */ -#define CUDA_ARRAY3D_SPARSE 0x40 - - -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * will allow deferred memory mapping - */ -#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 - - -/** - * Override the texref format with a format inferred from the array. - * Flag for ::cuTexRefSetArray() - */ -#define CU_TRSA_OVERRIDE_FORMAT 0x01 - -/** - * Read the texture as integers rather than promoting the values to floats - * in the range [0,1]. 
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_READ_AS_INTEGER 0x01 - -/** - * Use normalized texture coordinates in the range [0,1) instead of [0,dim). - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_NORMALIZED_COORDINATES 0x02 - -/** - * Perform sRGB->linear conversion during texture read. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_SRGB 0x10 - - /** - * Disable any trilinear filtering optimizations. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 - -/** - * Enable seamless cube map filtering. - * Flag for ::cuTexObjectCreate() - */ -#define CU_TRSF_SEAMLESS_CUBEMAP 0x40 - -/** - * End of array terminator for the \p extra parameter to - * ::cuLaunchKernel - */ -#define CU_LAUNCH_PARAM_END ((void*)0x00) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a buffer containing all kernel - * parameters used for launching kernel \p f. This buffer needs to - * honor all alignment/padding requirements of the individual parameters. - * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the - * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no - * effect. - */ -#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a size_t which contains the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. - * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified - * in the \p extra array if the value associated with - * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. - */ -#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) - -/** - * For texture references loaded into the module, use default texunit from - * texture reference. - */ -#define CU_PARAM_TR_DEFAULT -1 - -/** - * Device that represents the CPU - */ -#define CU_DEVICE_CPU ((CUdevice)-1) - -/** - * Device that represents an invalid device - */ -#define CU_DEVICE_INVALID ((CUdevice)-2) - -/** - * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS - */ -typedef enum CUflushGPUDirectRDMAWritesOptions_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ -} CUflushGPUDirectRDMAWritesOptions; - -/** - * Platform native ordering for GPUDirect RDMA writes - */ -typedef enum CUGPUDirectRDMAWritesOrdering_enum { - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. 
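The ::CU_LAUNCH_PARAM_BUFFER_POINTER / ::CU_LAUNCH_PARAM_BUFFER_SIZE markers described above let all kernel arguments be passed through the \p extra parameter of ::cuLaunchKernel instead of \p kernelParams. A minimal sketch, assuming a kernel that takes a CUdeviceptr and an int (the argument struct and its layout are illustrative):

#include <cuda.h>
#include <stddef.h>

CUresult launch_with_packed_args(CUfunction kernel, CUdeviceptr data, int n,
                                 CUstream stream) {
    /* The buffer must honor the alignment/padding of the kernel's parameters. */
    struct { CUdeviceptr data; int n; } args = { data, n };
    size_t args_size = sizeof(args);
    void *extra[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
        CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
        CU_LAUNCH_PARAM_END
    };
    /* kernelParams is NULL because extra carries the arguments. */
    return cuLaunchKernel(kernel, 1, 1, 1, 128, 1, 1, 0, stream, NULL, extra);
}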
*/ -} CUGPUDirectRDMAWritesOrdering; - -/** - * The scopes for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesScope_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ -} CUflushGPUDirectRDMAWritesScope; - -/** - * The targets for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesTarget_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ -} CUflushGPUDirectRDMAWritesTarget; - -/** - * The additional write options for ::cuGraphDebugDotPrint - */ -typedef enum CUgraphDebugDot_flags_enum { - CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ - CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ -} CUgraphDebugDot_flags; - -/** - * Flags for user objects for graphs - */ -typedef enum CUuserObject_flags_enum { - CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ -} CUuserObject_flags; - -/** - * Flags for retaining user object references for graphs - */ -typedef enum CUuserObjectRetain_flags_enum { - CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ -} CUuserObjectRetain_flags; - -/** - * Flags for instantiating a graph - */ -typedef enum CUgraphInstantiate_flags_enum { - CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. 
*/ -} CUgraphInstantiate_flags; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -/** @} */ /* END CUDA_TYPES */ - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility push(default) - #endif -#endif - -#ifdef _WIN32 -#define CUDAAPI __stdcall -#else -#define CUDAAPI -#endif - -/** - * \defgroup CUDA_ERROR Error Handling - * - * ___MANBRIEF___ error handling functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the error handling functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Gets the string description of an error code - * - * Sets \p *pStr to the address of a NULL-terminated string description - * of the error code \p error. - * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorString - */ -CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); - -/** - * \brief Gets the string representation of an error code enum name - * - * Sets \p *pStr to the address of a NULL-terminated string representation - * of the name of the enum error code \p error. - * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorName - */ -CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); - -/** @} */ /* END CUDA_ERROR */ - -/** - * \defgroup CUDA_INITIALIZE Initialization - * - * ___MANBRIEF___ initialization functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the initialization functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Initialize the CUDA driver API - * - * Initializes the driver API and must be called before any other function from - * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() - * has not been called, any function from the driver API will return - * ::CUDA_ERROR_NOT_INITIALIZED. - * - * \param Flags - Initialization flag for CUDA. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE - * \notefnerr - */ -CUresult CUDAAPI cuInit(unsigned int Flags); - -/** @} */ /* END CUDA_INITIALIZE */ - -/** - * \defgroup CUDA_VERSION Version Management - * - * ___MANBRIEF___ version management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the version management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the latest CUDA version supported by driver - * - * Returns in \p *driverVersion the version of CUDA supported by - * the driver. The version is returned as - * (1000 × major + 10 × minor). For example, CUDA 9.2 - * would be represented by 9020. 
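Taken together, ::cuInit, ::cuGetErrorString and ::cuDriverGetVersion form the usual start-up sequence for a driver API client. A small sketch (the CHECK macro is illustrative, not part of the API):

#include <cuda.h>
#include <stdio.h>

#define CHECK(call) do {                                                    \
    CUresult err_ = (call);                                                 \
    if (err_ != CUDA_SUCCESS) {                                             \
        const char *msg_ = NULL;                                            \
        cuGetErrorString(err_, &msg_);                                      \
        fprintf(stderr, "%s failed: %s\n", #call, msg_ ? msg_ : "unknown"); \
    }                                                                       \
} while (0)

int main(void) {
    CHECK(cuInit(0));                  /* Flags must currently be 0 */
    int version = 0;
    CHECK(cuDriverGetVersion(&version));
    /* Version is encoded as 1000 * major + 10 * minor, e.g. 9020 for CUDA 9.2. */
    printf("driver supports CUDA %d.%d\n", version / 1000, (version % 1000) / 10);
    return 0;
}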
- * - * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if - * \p driverVersion is NULL. - * - * \param driverVersion - Returns the CUDA driver version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cudaDriverGetVersion, - * ::cudaRuntimeGetVersion - */ -CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); - -/** @} */ /* END CUDA_VERSION */ - -/** - * \defgroup CUDA_DEVICE Device Management - * - * ___MANBRIEF___ device management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given an ordinal in the range [0, - * ::cuDeviceGetCount()-1]. - * - * \param device - Returned device handle - * \param ordinal - Device number to get handle for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport - */ -CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); - -/** - * \brief Returns the number of compute-capable devices - * - * Returns in \p *count the number of devices with compute capability greater - * than or equal to 2.0 that are available for execution. If there is no such - * device, ::cuDeviceGetCount() returns 0. - * - * \param count - Returned number of compute-capable devices - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceCount - */ -CUresult CUDAAPI cuDeviceGetCount(int *count); - -/** - * \brief Returns an identifer string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p name. \p len specifies the maximum length of the - * string that may be returned. - * - * \param name - Returned identifier string for the device - * \param len - Maximum length of string to store in \p name - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGetCount, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); - -/** - * \brief Return an UUID for the device - * - * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. 
- * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetUuid_v2 - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an UUID for the device (11.4+) - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. If the device is in MIG mode, returns its - * MIG UUID which uniquely identifies the subscribed MIG compute instance. - * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an LUID and device node mask for the device - * - * Return identifying information (\p luid and \p deviceNodeMask) to allow - * matching device with graphics APIs. - * - * \param luid - Returned LUID - * \param deviceNodeMask - Returned device node mask - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); - -/** - * \brief Returns the total amount of memory on the device - * - * Returns in \p *bytes the total amount of memory available on the device - * \p dev in bytes. - * - * \param bytes - Returned memory available on device in bytes - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaMemGetInfo - */ -CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); - -/** - * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. - * - * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture - * for given \p format and \p numChannels. - * - * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. - * \param format - Texture format. - * \param numChannels - Number of channels per texture element. - * \param dev - Device handle. 
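The device query entry points declared above (::cuDeviceGetCount, ::cuDeviceGet, ::cuDeviceGetName, ::cuDeviceTotalMem) are typically combined as in the following sketch, which assumes ::cuInit(0) has already succeeded:

#include <cuda.h>
#include <stdio.h>

void list_devices(void) {
    int count = 0;
    cuDeviceGetCount(&count);
    for (int i = 0; i < count; ++i) {
        CUdevice dev;
        char name[256];
        size_t bytes = 0;
        cuDeviceGet(&dev, i);                       /* ordinal -> handle */
        cuDeviceGetName(name, (int)sizeof(name), dev);
        cuDeviceTotalMem(&bytes, dev);
        printf("device %d: %s, %zu MiB\n", i, name, bytes >> 20);
    }
}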
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cudaMemGetInfo, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); - -/** - * \brief Returns information about the device - * - * Returns in \p *pi the integer value of the attribute \p attrib on device - * \p dev. The supported attributes are: - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per - * block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared memory available to a thread block in bytes - * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes - * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads - * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated through - * ::cuMemAllocPitch() - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width - * for a 1D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum - * mipmapped 1D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch - * in bytes for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum - * mipmapped 2D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum - * mipmapped 2D texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D - * texture depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: - * Alternate maximum 3D texture width, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: - * Alternate maximum 3D texture height, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: - * Alternate maximum 3D texture depth, 0 if no alternate - * maximum 3D texture size is supported - * - 
::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: - * Maximum cubemap texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: - * Maximum 1D layered texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: - * Maximum 2D layered texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: - * Maximum 2D layered texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: - * Maximum 1D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: - * Maximum 2D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: - * Maximum 2D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: - * Maximum 3D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: - * Maximum 3D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: - * Maximum 3D surface depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: - * Maximum 1D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: - * Maximum 2D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: - * Maximum 2D layered surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: - * Maximum cubemap surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers available to a thread block - * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture - * base addresses aligned to ::textureAlign bytes do not need an offset - * applied to texture fetches - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement - * for 2D texture references bound to pitched memory - * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device - * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and - * can have multiple CUDA contexts present at a single time. 
- * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. - * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device - * can have only one context used by a single process at a time. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports - * executing multiple kernels within the same context simultaneously, or 0 if - * not. It is not guaranteed that multiple kernels will be resident - * on the device concurrently so this feature should not be relied upon for - * correctness. - * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the - * device, 0 if error correction is disabled or not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier - * of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC - * is only available on Tesla hardware running Windows Vista or later - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits - * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with - * the host, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals - * in L1 cache, 0 if caching globals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals - * in L1 cache, 0 if caching locals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of - * shared memory available to a multiprocessor in bytes; this amount is shared - * by all thread blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit - * registers available to a multiprocessor; this number is shared by all thread - * blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory - * on this system, 0 if allocating managed memory is not supported by the device on this system. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices - * associated with the same board. Devices on the same multi-GPU board will share the same identifier. - * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host - * supports native atomic operations. - * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance - * (in floating-point operations per second) to double precision performance. 
- * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently accessing - * pageable memory without calling cudaHostRegister on it. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory - * concurrently with the CPU. - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. - * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered - * memory at the same virtual address as the CPU. - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size - * suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. - * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES - * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's - * page tables. - * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. - * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes - * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. - * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes - * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. - * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC - - * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. - - * - * \param pi - Returned device attribute value - * \param attrib - Device attribute to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaDeviceGetAttribute, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); - -/** - * \brief Return NvSciSync attributes that this device can support. - * - * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that - * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList - * can be used to create an NvSciSync object that matches this device's capabilities. - * - * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is - * already set this API will return ::CUDA_ERROR_INVALID_VALUE. - * - * The applications should set \p nvSciSyncAttrList to a valid - * NvSciSyncAttrList failing which this API will return - * ::CUDA_ERROR_INVALID_HANDLE. - * - * The \p flags controls how applications intends to use - * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: - * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to - * signal an NvSciSync on this CUDA device. - * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to - * wait on an NvSciSync on this CUDA device. - * - * At least one of these flags must be set, failing which the API - * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal - * to one another: a developer may set both these flags that allows to - * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. - * - * \param nvSciSyncAttrList - Return NvSciSync attributes supported. - * \param dev - Valid Cuda Device to get NvSciSync attributes for. - * \param flags - flags describing NvSciSync usage. 
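In practice ::cuDeviceGetAttribute is queried once per device and the results cached. A sketch reading a few of the attributes listed above (the DeviceLimits struct is illustrative, not part of the API):

#include <cuda.h>

typedef struct {
    int sm_count;
    int max_threads_per_block;
    int max_shared_optin;   /* bytes, opted into via cuFuncSetAttribute */
} DeviceLimits;

void query_limits(CUdevice dev, DeviceLimits *out) {
    cuDeviceGetAttribute(&out->sm_count,
                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
    cuDeviceGetAttribute(&out->max_threads_per_block,
                         CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
    cuDeviceGetAttribute(&out->max_shared_optin,
                         CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
}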
- * - * \return - * - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa - * ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); - -/** - * \brief Sets the current memory pool of a device - * - * The memory pool must be local to the specified device. - * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. - * By default, a device's current memory pool is its default memory pool. - * - * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different - * than the one the stream runs on. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync - */ -CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); - -/** - * \brief Gets the current mempool for a device - * - * Returns the last pool provided to ::cuDeviceSetMemPool for this device - * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. - * By default the current mempool is the default mempool for a device. - * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool - */ -CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); - -/** - * \brief Returns the default mempool of a device - * - * The default mempool of a device contains device memory from that device. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); - -/** - * \brief Blocks until remote writes are visible to the specified scope - * - * Blocks until GPUDirect RDMA writes to the target context via mappings - * created through APIs like nvidia_p2p_get_pages (see - * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are - * visible to the specified scope. - * - * If the scope equals or lies within the scope indicated by - * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call - * will be a no-op and can be safely omitted for performance. This can be - * determined by comparing the numerical values between the two enums, with - * smaller scopes having smaller values. - * - * Users may query support for this API via - * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. 
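The per-device pool accessors above are usually paired with ::cuMemAllocAsync. A sketch that raises the release threshold of the default pool so freed memory is retained for reuse (::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD is documented earlier in this header; error handling omitted):

#include <cuda.h>

/* Keep up to 64 MiB of freed memory cached in the device's default pool
 * instead of returning it to the OS at each synchronization point. */
void configure_default_pool(CUdevice dev, CUstream stream) {
    CUmemoryPool pool;
    cuDeviceGetDefaultMemPool(&pool, dev);

    cuuint64_t threshold = 64ull << 20;
    cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    CUdeviceptr p;
    cuMemAllocAsync(&p, 1 << 20, stream);   /* allocates from the current pool */
    cuMemFreeAsync(p, stream);              /* freed bytes may be reused later */
    cuStreamSynchronize(stream);
}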
- * - * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget - * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - */ -CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); - -/** @} */ /* END CUDA_DEVICE */ - -/** - * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated device management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns properties for a selected device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). - * - * Returns in \p *prop the properties of device \p dev. The ::CUdevprop - * structure is defined as: - * - * \code - typedef struct CUdevprop_st { - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int sharedMemPerBlock; - int totalConstantMemory; - int SIMDWidth; - int memPitch; - int regsPerBlock; - int clockRate; - int textureAlign - } CUdevprop; - * \endcode - * where: - * - * - ::maxThreadsPerBlock is the maximum number of threads per block; - * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; - * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; - * - ::sharedMemPerBlock is the total amount of shared memory available per - * block in bytes; - * - ::totalConstantMemory is the total amount of constant memory available on - * the device in bytes; - * - ::SIMDWidth is the warp size; - * - ::memPitch is the maximum pitch allowed by the memory copy functions that - * involve memory regions allocated through ::cuMemAllocPitch(); - * - ::regsPerBlock is the total number of registers available per block; - * - ::clockRate is the clock frequency in kilohertz; - * - ::textureAlign is the alignment requirement; texture base addresses that - * are aligned to ::textureAlign bytes do not need an offset applied to - * texture fetches. - * - * \param prop - Returned properties of device - * \param dev - Device to get properties for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); - -/** - * \brief Returns the compute capability of the device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and its functionality superceded - * by ::cuDeviceGetAttribute(). - * - * Returns in \p *major and \p *minor the major and minor revision numbers that - * define the compute capability of the device \p dev. 
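Since ::cuDeviceGetProperties and ::cuDeviceComputeCapability are deprecated, the same information is read through ::cuDeviceGetAttribute. A sketch of the replacement for the compute capability query:

#include <cuda.h>

/* Non-deprecated equivalent of cuDeviceComputeCapability. */
void compute_capability(CUdevice dev, int *major, int *minor) {
    cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
    cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
}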
- * - * \param major - Major revision number - * \param minor - Minor revision number - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); - -/** @} */ /* END CUDA_DEVICE_DEPRECATED */ - -/** - * \defgroup CUDA_PRIMARY_CTX Primary Context Management - * - * ___MANBRIEF___ primary context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the primary context management functions of the low-level - * CUDA driver application programming interface. - * - * The primary context is unique per device and shared with the CUDA runtime API. - * These functions allow integration with other libraries using CUDA. - * - * @{ - */ - -/** - * \brief Retain the primary context on the GPU - * - * Retains the primary context on the device. - * Once the user successfully retains the primary context, the primary context - * will be active and available to the user until the user releases it - * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). - * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. - * - * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN - * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function - * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to - * determine the compute mode of the device. - * The nvidia-smi tool can be used to set the compute mode for - * devices. Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * Please note that the primary context always supports pinned allocations. Other - * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). - * - * \param pctx - Returned context handle of the new context - * \param dev - Device for which primary context is requested - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRelease, - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); - -/** - * \brief Release the primary context on the GPU - * - * Releases the primary context interop on the device. - * A retained context should always be released once the user is done using - * it. The context is automatically reset once the last reference to it is - * released. This behavior is different when the primary context was retained - * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary - * context remains always active. 
- * - * Releasing a primary context that has not been previously retained will - * fail with ::CUDA_ERROR_INVALID_CONTEXT. - * - * Please note that unlike ::cuCtxDestroy() this method does not pop the context - * from the stack under any circumstances. - * - * \param dev - Device whose primary context is released - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - -/** - * \brief Set flags for the primary context - * - * Sets the flags for the primary context on the device, overwriting previously - * set ones. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * - * \param dev - Device for which the primary context flags are set - * \param flags - New flags for the device - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxGetState, - * ::cuCtxCreate, - * ::cuCtxGetFlags, - * ::cudaSetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - -/** - * \brief Get the state of the primary context - * - * Returns in \p *flags the flags for the primary context of \p dev, and in - * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag - * values. - * - * \param dev - Device to get primary context flags for - * \param flags - Pointer to store flags - * \param active - Pointer to store context state; 0 = inactive, 1 = active - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxGetFlags, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); - -/** - * \brief Destroy all allocations and reset all state on the primary context - * - * Explicitly destroys and cleans up all resources associated with the current - * device in the current process. - * - * Note that it is responsibility of the calling function to ensure that no - * other module in the process is using the device any more. For that reason - * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. - * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() - * even after resetting the device. - * Resetting the primary context does not release it, an application that has - * retained the primary context should explicitly release its usage. - * - * \param dev - Device for which primary context is destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxRelease, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceReset - */ -CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - -/** @} */ /* END CUDA_PRIMARY_CTX */ - -/** - * \brief Returns information about the execution affinity support of the device. - * - * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. 
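A typical library-style use of the primary context functions above: set the scheduling flags, retain the context, bind it to the calling thread, and release it when done. A sketch (error handling omitted):

#include <cuda.h>
#include <stddef.h>

void run_on_primary_context(CUdevice dev) {
    /* Setting flags before retaining is the safe order on all driver versions. */
    cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC);

    CUcontext ctx = NULL;
    cuDevicePrimaryCtxRetain(&ctx, dev);   /* does not push onto the stack */
    cuCtxSetCurrent(ctx);                  /* make it current for this thread */

    /* ... launch work here ... */

    cuDevicePrimaryCtxRelease(dev);        /* every retain needs a matching release */
}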
- * The supported types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, - * or 0 if not; - * - * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not - * \param type - Execution affinity type to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); - -/** - * \defgroup CUDA_CTX Context Management - * - * ___MANBRIEF___ context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the context management functions of the low-level - * CUDA driver application programming interface. - * - * Please note that some functions are described in - * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. - * - * @{ - */ - -/** - * \brief Create a CUDA context - * - * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. - * - * Creates a new CUDA context and associates it with the calling thread. The - * \p flags parameter is described below. The context is created with a usage - * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or - * when done using the context. If a context is already current to the thread, - * it is supplanted by the newly created context and may be restored by a subsequent - * call to ::cuCtxPopCurrent(). - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for * devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - -/** - * \brief Create a CUDA context with execution affinity - * - * Creates a new CUDA context with execution affinity and associates it with - * the calling thread. The \p paramsArray and \p flags parameter are described below. - * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must - * call ::cuCtxDestroy() or when done using the context. If a context is already - * current to the thread, it is supplanted by the newly created context and may - * be restored by a subsequent call to ::cuCtxPopCurrent(). - * - * The type and the amount of execution resource the context can use is limited by \p paramsArray - * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams - * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, - * the latter execution affinity parameter overrides the former execution affinity parameter. - * The supported execution affinity types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion - * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally - * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution - * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute - * is only supported under Volta+ MPS. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. 
This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for * devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param paramsArray - Execution affinity parameters - * \param numParams - Number of execution affinity parameters - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); - -/** - * \brief Destroy a CUDA context - * - * Destroys the CUDA context specified by \p ctx. The context \p ctx will be - * destroyed regardless of how many threads it is current to. - * It is the responsibility of the calling function to ensure that no API - * call issues using \p ctx while ::cuCtxDestroy() is executing. - * - * Destroys and cleans up all resources associated with the context. - * It is the caller's responsibility to ensure that the context or its resources - * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. - * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, - * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, - * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. - * - * If \p ctx is current to the calling thread then \p ctx will also be - * popped from the current thread's context stack (as though ::cuCtxPopCurrent() - * were called). If \p ctx is current to other threads, then \p ctx will - * remain current to those threads, and attempting to access \p ctx from - * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. 
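- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * a minimal create/use/destroy lifecycle for a context owned by the caller.
- * It assumes ::cuInit() has already succeeded, uses device ordinal 0, and
- * omits error checking.
- * \code
-   CUdevice dev;
-   CUcontext ctx;
-   cuDeviceGet(&dev, 0);
-   cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev);
-   /* ... allocate memory, load modules, launch work ... */
-   cuCtxSynchronize();
-   cuCtxDestroy(ctx);
- * \endcode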
- * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); - -/** - * \brief Pushes a context on the current CPU thread - * - * Pushes the given context \p ctx onto the CPU thread's stack of current - * contexts. The specified context becomes the CPU thread's current context, so - * all CUDA functions that operate on the current context are affected. - * - * The previous current context may be made current again by calling - * ::cuCtxDestroy() or ::cuCtxPopCurrent(). - * - * \param ctx - Context to push - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - -/** - * \brief Pops the current CUDA context from the current CPU thread. - * - * Pops the current CUDA context from the CPU thread and passes back the - * old context handle in \p *pctx. That context may then be made current - * to a different CPU thread by calling ::cuCtxPushCurrent(). - * - * If a context was current to the CPU thread before ::cuCtxCreate() or - * ::cuCtxPushCurrent() was called, this function makes that context current to - * the CPU thread again. - * - * \param pctx - Returned popped context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - -/** - * \brief Binds the specified CUDA context to the calling CPU thread - * - * Binds the specified CUDA context to the calling CPU thread. - * If \p ctx is NULL then the CUDA context previously bound to the - * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. - * - * If there exists a CUDA context stack on the calling CPU thread, this - * will replace the top of that stack with \p ctx. - * If \p ctx is NULL then this will be equivalent to popping the top - * of the calling CPU thread's CUDA context stack (or a no-op if the - * calling CPU thread's CUDA context stack is empty). - * - * \param ctx - Context to bind to the calling CPU thread - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa - * ::cuCtxGetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaSetDevice - */ -CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); - -/** - * \brief Returns the CUDA context bound to the calling CPU thread. 
- * - * Returns in \p *pctx the CUDA context bound to the calling CPU thread. - * If no context is bound to the calling CPU thread then \p *pctx is - * set to NULL and ::CUDA_SUCCESS is returned. - * - * \param pctx - Returned context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * \notefnerr - * - * \sa - * ::cuCtxSetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); - -/** - * \brief Returns the device ID for the current context - * - * Returns in \p *device the ordinal of the current context's device. - * - * \param device - Returned device ID for the current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); - -/** - * \brief Returns the flags for the current context - * - * Returns in \p *flags the flags of the current context. See ::cuCtxCreate - * for flag values. - * - * \param flags - Pointer to store flags of current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetCurrent, - * ::cuCtxGetDevice, - * ::cuCtxGetLimit, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxGetStreamPriorityRange, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); - -/** - * \brief Block for a context's tasks to complete - * - * Blocks until the device has completed all preceding requested tasks. - * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the - * CPU thread will block until the GPU context has finished its work. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cudaDeviceSynchronize - */ -CUresult CUDAAPI cuCtxSynchronize(void); - -/** - * \brief Set resource limits - * - * Setting \p limit to \p value is a request by the application to update - * the current limit maintained by the context. The driver is free to - * modify the requested value to meet h/w requirements (this could be - * clamping to minimum or maximum values, rounding up to nearest element - * size, etc). The application can use ::cuCtxGetLimit() to find out exactly - * what the limit has been set to. - * - * Setting each ::CUlimit has its own specific restrictions, so each is - * discussed here. - * - * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. - * The driver automatically increases the per-thread stack size - * for each kernel launch as needed. 
This size isn't reset back to the
- * original value after each launch. Setting this value will take effect
- * immediately, and if necessary, the device will block until all preceding
- * requested tasks are complete.
- *
- * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
- * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
- * must be performed before launching any kernel that uses the ::printf()
- * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
- * by the ::malloc() and ::free() device system calls. Setting
- * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
- * that uses the ::malloc() or ::free() device system calls, otherwise
- * ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
- * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
- * this limit must be performed before any launch of a kernel that uses the
- * device runtime and calls ::cudaDeviceSynchronize() above the default sync
- * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
- * with error code ::cudaErrorSyncDepthExceeded if the limitation is
- * violated. This limit can be set smaller than the default or up to the maximum
- * launch depth of 24. When setting this limit, keep in mind that additional
- * levels of sync depth require the driver to reserve large amounts of device
- * memory which can no longer be used for user allocations. If these
- * reservations of device memory fail, ::cuCtxSetLimit() will return
- * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- * This limit is only applicable to devices of compute capability 3.5 and
- * higher. Attempting to set this limit on devices of compute capability less
- * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- * returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
- * outstanding device runtime launches that can be made from the current
- * context. A grid is outstanding from the point of launch up until the grid
- * is known to have been completed. Device runtime launches which violate
- * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
- * ::cudaGetLastError() is called after launch. If more pending launches than
- * the default (2048 launches) are needed for a module using the device
- * runtime, this limit can be increased. Keep in mind that being able to
- * sustain additional pending launches will require the driver to reserve
- * larger amounts of device memory upfront which can no longer be used for
- * allocations. If these reservations fail, ::cuCtxSetLimit() will return
- * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- * This limit is only applicable to devices of compute capability 3.5 and
- * higher. Attempting to set this limit on devices of compute capability less
- * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- * returned.
- *
- * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
- * Values can range from 0B to 128B. This is purely a performance hint and
- * it can be ignored or clamped depending on the platform.
- *
- * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls the size in bytes available for
- * persisting L2 cache.
This is purely a performance hint and it can be - * ignored or clamped depending on the platform. - * - * \param limit - Limit to set - * \param value - Size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSynchronize, - * ::cudaDeviceSetLimit - */ -CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); - -/** - * \brief Returns resource limits - * - * Returns in \p *pvalue the current size of \p limit. The supported - * ::CUlimit values are: - * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. - * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the - * ::printf() device system call. - * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the - * ::malloc() and ::free() device system calls. - * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread - * can issue the device runtime call ::cudaDeviceSynchronize() to wait on - * child grid launches to complete. - * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding - * device runtime launches that can be made from this context. - * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. - * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes - * - * \param limit - Limit to query - * \param pvalue - Returned size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetLimit - */ -CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); - -/** - * \brief Returns the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this function returns through \p pconfig the preferred cache configuration - * for the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute functions. - * - * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices - * where the size of the L1 cache and shared memory are fixed. 
- * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param pconfig - Returned cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetCacheConfig - */ -CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); - -/** - * \brief Sets the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute the function. Any function preference - * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide - * setting. Setting the context-wide cache configuration to - * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer - * to not change the cache configuration unless required to launch the kernel. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. - * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetCacheConfig - */ -CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); - -/** - * \brief Returns the current shared memory configuration for the current context. - * - * This function will return in \p pConfig the current size of shared memory banks - * in the current context. On devices with configurable shared memory banks, - * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all - * subsequent kernel launches will by default use the new bank size. 
When - * ::cuCtxGetSharedMemConfig is called on devices without configurable shared - * memory, it will return the fixed bank size of the hardware. - * - * The returned bank configurations can be either: - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is - * four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will - * eight bytes. - * - * \param pConfig - returned shared memory configuration - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetSharedMemConfig - */ -CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); - -/** - * \brief Sets the shared memory configuration for the current context. - * - * On devices with configurable shared memory banks, this function will set - * the context's shared memory bank size which is used for subsequent kernel - * launches. - * - * Changed the shared memory configuration between launches may insert a device - * side synchronization point between those launches. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. - * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial - * setting (currently, four bytes). - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes. - * - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetSharedMemConfig - */ -CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); - -/** - * \brief Gets the context's API version. - * - * Returns a version number in \p version corresponding to the capabilities of - * the context (e.g. 3010 or 3020), which library developers can use to direct - * callers to a specific API version. If \p ctx is NULL, returns the API version - * used to create the currently bound context. - * - * Note that new API versions are only introduced when context capabilities are - * changed that break binary compatibility, so the API version and driver version - * may be different. 
For example, it is valid for the API version to be 3020 while - * the driver version is 4020. - * - * \param ctx - Context to check - * \param version - Pointer to version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); - -/** - * \brief Returns numerical values that correspond to the least and - * greatest stream priorities. - * - * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond - * to the least and greatest stream priorities respectively. Stream priorities - * follow a convention where lower numbers imply greater priorities. The range of - * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. - * If the user attempts to create a stream with a priority value that is - * outside the meaningful range as specified by this API, the priority is - * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority - * respectively. See ::cuStreamCreateWithPriority for details on creating a - * priority stream. - * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value - * is not desired. - * - * This function will return '0' in both \p *leastPriority and \p *greatestPriority if - * the current context's device does not support stream priorities - * (see ::cuDeviceGetAttribute). - * - * \param leastPriority - Pointer to an int in which the numerical value for least - * stream priority is returned - * \param greatestPriority - Pointer to an int in which the numerical value for greatest - * stream priority is returned - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetStreamPriorityRange - */ -CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); - -/** - * \brief Resets all persisting lines in cache to normal status. - * - * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal - * status. Takes effect on function return. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); - -/** - * \brief Returns the execution affinity setting for the current context. - * - * Returns in \p *pExecAffinity the current value of \p type. The supported - * ::CUexecAffinityType values are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. 
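- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * querying the SM-count affinity of the current context. A current context is
- * assumed; on configurations without execution affinity support the call
- * simply returns an error, which the sketch leaves unhandled.
- * \code
-   CUexecAffinityParam affinity;
-   if (cuCtxGetExecAffinity(&affinity, CU_EXEC_AFFINITY_TYPE_SM_COUNT) == CUDA_SUCCESS) {
-       unsigned int smCount = affinity.param.smCount.val;
-       /* smCount is the number of SMs this context may use (Volta+ MPS). */
-   }
- * \endcode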
- * - * \param type - Execution affinity type to query - * \param pExecAffinity - Returned execution affinity - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY - * \notefnerr - * - * \sa - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); - - -/** @} */ /* END CUDA_CTX */ - -/** - * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated context management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated context management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Increment a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Increments the usage count of the context and passes back a context handle - * in \p *pctx that must be passed to ::cuCtxDetach() when the application is - * done with the context. ::cuCtxAttach() fails if there is no context current - * to the thread. - * - * Currently, the \p flags parameter must be 0. - * - * \param pctx - Returned context handle of the current context - * \param flags - Context attach flags (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxDetach, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); - -/** - * \brief Decrement a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Decrements the usage count of the context \p ctx, and destroys the context - * if the usage count goes to 0. The context must be a handle that was passed - * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the - * calling thread. - * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); - -/** @} */ /* END CUDA_CTX_DEPRECATED */ - - -/** - * \defgroup CUDA_MODULE Module Management - * - * ___MANBRIEF___ module management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the module management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Loads a compute module - * - * Takes a filename \p fname and loads the corresponding module \p module into - * the current context. 
The CUDA driver API does not attempt to lazily - * allocate the resources needed by a module; if the memory for functions and - * data (constant and global) needed by the module cannot be allocated, - * ::cuModuleLoad() fails. The file should be a \e cubin file as output by - * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or - * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. - * - * \param module - Returned module - * \param fname - Filename of module to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_FILE_NOT_FOUND, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); - -/** - * \brief Load a module's data - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. - * - * \param module - Returned module - * \param image - Module data to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); - -/** - * \brief Load a module's data with options - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. Options are passed as - * an array via \p options and any corresponding parameters are passed in - * \p optionValues. The number of total options is supplied via \p numOptions. - * Any outputs will be returned via \p optionValues. 
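- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * loading a NULL-terminated PTX string with a JIT error-log buffer attached.
- * The PTX text 'ptx' is an assumption of the sketch; other errors are ignored.
- * \code
-   char errorLog[8192];
-   CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
-   void *optVals[]     = { errorLog, (void *)(size_t)sizeof(errorLog) };
-   CUmodule mod;
-   if (cuModuleLoadDataEx(&mod, ptx, 2, opts, optVals) != CUDA_SUCCESS) {
-       /* errorLog now contains the JIT compiler's diagnostics. */
-   }
- * \endcode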
- * - * \param module - Returned module - * \param image - Module data to load - * \param numOptions - Number of options - * \param options - Options for JIT - * \param optionValues - Option values for JIT - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Load a module's data - * - * Takes a pointer \p fatCubin and loads the corresponding module \p module - * into the current context. The pointer represents a fat binary object, - * which is a collection of different \e cubin and/or \e PTX files, all - * representing the same device code, but compiled and optimized for different - * architectures. - * - * Prior to CUDA 4.0, there was no documented API for constructing and using - * fat binary objects by programmers. Starting with CUDA 4.0, fat binary - * objects can be constructed by providing the -fatbin option to \b nvcc. - * More information can be found in the \b nvcc document. - * - * \param module - Returned module - * \param fatCubin - Fat binary to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); - -/** - * \brief Unloads a module - * - * Unloads a module \p hmod from the current context. - * - * \param hmod - Module to unload - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_destroy_ub - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary - */ -CUresult CUDAAPI cuModuleUnload(CUmodule hmod); - -/** - * \brief Returns a function handle - * - * Returns in \p *hfunc the handle of the function of name \p name located in - * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() - * returns ::CUDA_ERROR_NOT_FOUND. 
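- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * looking up a kernel in a loaded module. The file name and the kernel name
- * are placeholders; error checking other than the NOT_FOUND case is omitted.
- * \code
-   CUmodule mod;
-   CUfunction kernel;
-   cuModuleLoad(&mod, "kernels.cubin");
-   if (cuModuleGetFunction(&kernel, mod, "my_kernel") == CUDA_ERROR_NOT_FOUND) {
-       /* The module exports no function named "my_kernel". */
-   }
- * \endcode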
- * - * \param hfunc - Returned function handle - * \param hmod - Module to retrieve function from - * \param name - Name of function to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); - -/** - * \brief Returns a global pointer from a module - * - * Returns in \p *dptr and \p *bytes the base pointer and size of the - * global of name \p name located in module \p hmod. If no variable of that name - * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both - * parameters \p dptr and \p bytes are optional. If one of them is - * NULL, it is ignored. - * - * \param dptr - Returned global device pointer - * \param bytes - Returned global size in bytes - * \param hmod - Module to retrieve global from - * \param name - Name of global to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSymbolAddress, - * ::cudaGetSymbolSize - */ -CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a texture reference - * - * Returns in \p *pTexRef the handle of the texture reference of name \p name - * in the module \p hmod. If no texture reference of that name exists, - * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference - * handle should not be destroyed, since it will be destroyed when the module - * is unloaded. - * - * \param pTexRef - Returned texture reference - * \param hmod - Module to retrieve texture reference from - * \param name - Name of texture reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetSurfRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetTextureReference - */ -CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a surface reference - * - * Returns in \p *pSurfRef the handle of the surface reference of name \p name - * in the module \p hmod. If no surface reference of that name exists, - * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. 
- * - * \param pSurfRef - Returned surface reference - * \param hmod - Module to retrieve surface reference from - * \param name - Name of surface reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSurfaceReference - */ -CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); - -/** - * \brief Creates a pending JIT linker invocation. - * - * If the call is successful, the caller owns the returned CUlinkState, which - * should eventually be destroyed with ::cuLinkDestroy. The - * device code machine size (32 or 64 bit) will match the calling application. - * - * Both linker and compiler options may be specified. Compiler options will - * be applied to inputs to this linker action which must be compiled from PTX. - * The options ::CU_JIT_WALL_TIME, - * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES - * will accumulate data until the CUlinkState is destroyed. - * - * \p optionValues must remain valid for the life of the CUlinkState if output - * options are used. No other references to inputs are maintained after this - * call returns. - * - * \param numOptions Size of options arrays - * \param options Array of linker and compiler options - * \param optionValues Array of option values, each cast to void * - * \param stateOut On success, this will contain a CUlinkState to specify - * and complete this action - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - -/** - * \brief Add an input to a pending linker invocation - * - * Ownership of \p data is retained by the caller. No reference is retained to any - * inputs after this call returns. - * - * This method accepts only compiler options, which are used if the data must - * be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * \param state A pending linker action. - * \param type The type of the input data. - * \param data The input data. PTX must be NULL-terminated. - * \param size The length of the input data. - * \param name An optional name for this input in log messages. - * \param numOptions Size of options. - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). - * \param optionValues Array of option values, each cast to void *. 
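- *
- * Illustrative sketch (editorial addition, not part of the original header) of
- * the overall linker flow, i.e. ::cuLinkCreate, ::cuLinkAddData,
- * ::cuLinkComplete and ::cuLinkDestroy used together. The PTX string 'ptx' and
- * its size 'ptxSize' (including the trailing NUL) are assumptions; error
- * checking is omitted.
- * \code
-   CUlinkState link;
-   void *cubin;
-   size_t cubinSize;
-   CUmodule mod;
-   cuLinkCreate(0, NULL, NULL, &link);
-   cuLinkAddData(link, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "my_ptx",
-                 0, NULL, NULL);
-   cuLinkComplete(link, &cubin, &cubinSize);
-   cuModuleLoadData(&mod, cubin);  /* load before destroying the link state */
-   cuLinkDestroy(link);
- * \endcode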
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Add a file input to a pending linker invocation - * - * No reference is retained to any inputs after this call returns. - * - * This method accepts only compiler options, which are used if the input - * must be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * This method is equivalent to invoking ::cuLinkAddData on the contents - * of the file. - * - * \param state A pending linker action - * \param type The type of the input data - * \param path Path to the input file - * \param numOptions Size of options - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) - * \param optionValues Array of option values, each cast to void * - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_FILE_NOT_FOUND - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Complete a pending linker invocation - * - * Completes the pending linker action and returns the cubin image for the linked - * device code, which can be used with ::cuModuleLoadData. The cubin is owned by - * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. - * This call does not destroy \p state. - * - * \param state A pending linker invocation - * \param cubinOut On success, this will point to the output image - * \param sizeOut Optional parameter to receive the size of the generated image - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkDestroy, - * ::cuModuleLoadData - */ -CUresult CUDAAPI -cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); - -/** - * \brief Destroys state for a JIT linker invocation. - * - * \param state State object for the linker invocation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \sa ::cuLinkCreate - */ -CUresult CUDAAPI -cuLinkDestroy(CUlinkState state); - -/** @} */ /* END CUDA_MODULE */ - - -/** - * \defgroup CUDA_MEM Memory Management - * - * ___MANBRIEF___ memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the memory management functions of the low-level CUDA - * driver application programming interface. 
- *
- * @{
- */
-
-/**
- * \brief Gets free and total memory
- *
- * Returns in \p *total the total amount of memory available to the current context.
- * Returns in \p *free the amount of memory on the device that is free according to the OS.
- * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
- * In a multi-tenant situation, the free estimate returned is prone to a race condition:
- * a new allocation or free performed by a different process, or by a different thread in
- * the same process, between the time when free memory was estimated and the time it is
- * reported can cause the reported value to deviate from the memory that is actually free.
- *
- * The integrated GPU on Tegra shares memory with the CPU and other components
- * of the SoC. The free and total values returned by the API exclude
- * the SWAP memory space maintained by the OS on some platforms.
- * The OS may move some of the memory pages into the swap area as the GPU or
- * CPU allocate or access memory. See the Tegra app note on how to calculate
- * total and free memory on Tegra.
- *
- * \param free - Returned free memory in bytes
- * \param total - Returned total memory in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemGetInfo
- */
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
-
-/**
- * \brief Allocates device memory
- *
- * Allocates \p bytesize bytes of linear memory on the device and returns in
- * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
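- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * allocating a device buffer, copying data in and out, and freeing it. A
- * current context and a host array 'hostData' of 'n' floats are assumed;
- * error checking is omitted.
- * \code
-   CUdeviceptr dptr;
-   cuMemAlloc(&dptr, n * sizeof(float));
-   cuMemcpyHtoD(dptr, hostData, n * sizeof(float));
-   /* ... launch kernels that read or write dptr ... */
-   cuMemcpyDtoH(hostData, dptr, n * sizeof(float));
-   cuMemFree(dptr);
- * \endcode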
- * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc - */ -CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); - -/** - * \brief Allocates pitched device memory - * - * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on - * the device and returns in \p *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. \p ElementSizeBytes - * specifies the size of the largest reads and writes that will be performed - * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced - * memory transactions are not possible on other data sizes). If - * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, - * the kernel will run correctly, but possibly at reduced speed. The pitch - * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the - * allocation. The intended usage of pitch is as a separate parameter of the - * allocation, used to compute addresses within the 2D array. Given the row - * and column of an array element of type \b T, the address is computed as: - * \code - T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; - * \endcode - * - * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with - * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is - * recommended that programmers consider performing pitch allocations using - * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is - * especially true if the application will be performing 2D memory copies - * between different regions of device memory (whether linear memory or CUDA - * arrays). - * - * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed - * to match or exceed the alignment requirement for texture binding with - * ::cuTexRefSetAddress2D(). 
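- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * allocating a pitched 2D buffer of 'width' x 'height' floats and clearing it
- * row by row with a 2D memset. 'width' and 'height' are assumptions; error
- * checking is omitted.
- * \code
-   CUdeviceptr dptr;
-   size_t pitch;
-   cuMemAllocPitch(&dptr, &pitch, width * sizeof(float), height, sizeof(float));
-   cuMemsetD2D8(dptr, pitch, 0, width * sizeof(float), height);
-   /* ... use the buffer; rows start at dptr + row * pitch ... */
-   cuMemFree(dptr);
- * \endcode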
- * - * \param dptr - Returned device pointer - * \param pPitch - Returned pitch of allocation in bytes - * \param WidthInBytes - Requested allocation width in bytes - * \param Height - Requested allocation height in rows - * \param ElementSizeBytes - Size of largest reads/writes for range - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocPitch - */ -CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); - -/** - * \brief Frees device memory - * - * Frees the memory space pointed to by \p dptr, which must have been returned - * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). - * - * \param dptr - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFree - */ -CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); - -/** - * \brief Get information on memory allocations - * - * Returns the base address in \p *pbase and size in \p *psize of the - * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input - * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one - * of them is NULL, it is ignored. 
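A sketch of recovering the containing allocation from an interior device address (assuming a current context; error checking omitted):

\code
    CUdeviceptr dptr = 0, base = 0;
    size_t size = 0;
    cuMemAlloc(&dptr, 4096);
    cuMemGetAddressRange(&base, &size, dptr + 100);  /* any address inside the allocation */
    /* base now equals dptr and size equals 4096 */
    cuMemFree(dptr);
\endcode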
- * - * \param pbase - Returned base address - * \param psize - Returned size of device memory allocation - * \param dptr - Device pointer to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 - */ -CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as ::cuMemcpy(). Since the memory can be accessed directly by - * the device, it can be read or written with much higher bandwidth than - * pageable memory obtained with functions such as ::malloc(). Allocating - * excessive amounts of memory with ::cuMemAllocHost() may degrade system - * performance, since it reduces the amount of memory available to the system - * for paging. As a result, this function is best used sparingly to allocate - * staging areas for data exchange between host and device. - * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * The device pointer that may be used to access this host memory from those - * contexts is always equal to the returned host pointer \p *pp. - * See \ref CUDA_UNIFIED for additional details. 
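A sketch of using a page-locked allocation as a staging buffer for a host-to-device transfer (assuming a current context; error checking omitted):

\code
    void *staging = NULL;
    CUdeviceptr dev = 0;
    cuMemAllocHost(&staging, 1 << 20);    /* pinned host staging buffer */
    cuMemAlloc(&dev, 1 << 20);
    /* ... fill staging on the CPU ... */
    cuMemcpyHtoD(dev, staging, 1 << 20);  /* copy benefits from the pinned source */
    cuMemFree(dev);
    cuMemFreeHost(staging);
\endcode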
- * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocHost - */ -CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); - -/** - * \brief Frees page-locked host memory - * - * Frees the memory space pointed to by \p p, which must have been returned by - * a previous call to ::cuMemAllocHost(). - * - * \param p - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeHost - */ -CUresult CUDAAPI cuMemFreeHost(void *p); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, - * it can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * pinned memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to allocate staging areas for data exchange between - * host and device. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address - * space. 
The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). - * - * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system configurations, but cannot be read efficiently by - * most CPUs. WC memory is a good option for buffers that will be written by - * the CPU and read by the GPU via mapped pinned memory or host->device - * transfers. - * - * All of these flags are orthogonal to one another: a developer may allocate - * memory that is portable, mapped and/or write-combined with no restrictions. - * - * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. - * - * The memory allocated by this function must be freed with ::cuMemFreeHost(). - * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer - * that may be used to access this host memory from those contexts is always equal - * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED - * is specified, then the function ::cuMemHostGetDevicePointer() must be used - * to query the device pointer, even if the context supports unified addressing. - * See \ref CUDA_UNIFIED for additional details. - * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * \param Flags - Flags for allocation request - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaHostAlloc - */ -CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); - -/** - * \brief Passes back device pointer of mapped pinned memory - * - * Passes back the device pointer \p pdptr corresponding to the mapped, pinned - * host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP - * flag was not specified at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. 
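A sketch of allocating mapped pinned memory and retrieving the corresponding device pointer (assuming a current context on a device that supports mapped pinned memory; error checking omitted):

\code
    void *host = NULL;
    CUdeviceptr dev = 0;
    cuMemHostAlloc(&host, 4096, CU_MEMHOSTALLOC_DEVICEMAP);
    cuMemHostGetDevicePointer(&dev, host, 0);  /* Flags must be 0 */
    /* ... kernels may now access the buffer through dev ... */
    cuMemFreeHost(host);
\endcode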
- * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. - * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not - * match the original host pointer \p p and depends on the devices visible to the - * application. If all devices visible to the application have a non-zero value for the - * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() - * will match the original pointer \p p. If any device visible to the application - * has a zero value for the device attribute, the device pointer returned by - * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, - * but it will be suitable for use on all devices provided Unified Virtual Addressing - * is enabled. In such systems, it is valid to access the memory using either pointer - * on devices that have a non-zero value for the device attribute. Note however that - * such devices should access the memory using only one of the two pointers and not both. - * - * \p Flags provides for future releases. For now, it must be set to 0. - * - * \param pdptr - Returned device pointer - * \param p - Host pointer - * \param Flags - Options (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaHostGetDevicePointer - */ -CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); - -/** - * \brief Passes back flags that were used for a pinned allocation - * - * Passes back the flags \p pFlags that were specified when allocating - * the pinned host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetFlags() will fail if the pointer does not reside in - * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). - * - * \param pFlags - Returned flags word - * \param p - Host pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuMemAllocHost, - * ::cuMemHostAlloc, - * ::cudaHostGetFlags - */ -CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); - -/** - * \brief Allocates memory that will be automatically managed by the Unified Memory system - * - * Allocates \p bytesize bytes of managed memory on the device and returns in - * \p *dptr a pointer to the allocated memory. If the device doesn't support - * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. 
Support
- * for managed memory can be queried using the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
- * is valid on the CPU and on all GPUs in the system that support managed memory.
- * All accesses to this pointer must obey the Unified Memory programming model.
- *
- * \p flags specifies the default stream association for this allocation.
- * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
- * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
- * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
- * allocation should not be accessed from devices that have a zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
- * ::cuStreamAttachMemAsync will be required to enable access on such devices.
- *
- * If the association is later changed via ::cuStreamAttachMemAsync to
- * a single stream, the default association as specified during ::cuMemAllocManaged
- * is restored when that stream is destroyed. For __managed__ variables, the
- * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
- * stream is an asynchronous operation, and as a result, the change to default
- * association won't happen until all work in the stream has completed.
- *
- * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
- *
- * Device memory oversubscription is possible for GPUs that have a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
- * such GPUs may be evicted from device memory to host memory at any time by the Unified
- * Memory driver in order to make room for other allocations.
- *
- * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
- * API returns and instead may be populated on access. In such systems, managed memory can
- * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
- * maintain data locality and prevent excessive page faults to the extent possible. The application
- * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
- * can also explicitly migrate memory to a desired processor's memory via
- * ::cuMemPrefetchAsync.
- *
- * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
- * with each other, the physical storage for managed memory is created on the GPU which is active
- * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
- * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
- * memory among such GPUs.
- *
- * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
- * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
- * is zero for at least one of those GPUs, the location chosen for physical storage of managed
- * memory is system-dependent.
- * - On Linux, the location chosen will be device memory as long as the current set of active
- * contexts are on devices that either have peer-to-peer support with each other or have a
- * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- * If there is an active context on a GPU that does not have a non-zero value for that device
- * attribute and it does not have peer-to-peer support with the other devices that have active
- * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
- * Note that this means that managed memory that is located in device memory is migrated to
- * host memory if a new context is created on a GPU that doesn't have a non-zero value for
- * the device attribute and does not support peer-to-peer with at least one of the other devices
- * that has an active context. This in turn implies that context creation may fail if there is
- * insufficient host memory to migrate all managed allocations.
- * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
- * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
- * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
- * restrict CUDA to only use those GPUs that have peer-to-peer support.
- * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
- * non-zero value to force the driver to always use device memory for physical storage.
- * When this environment variable is set to a non-zero value, all contexts created in
- * that process on devices that support managed memory have to be peer-to-peer compatible
- * with each other. Context creation will fail if a context is created on a device that
- * supports managed memory and is not peer-to-peer compatible with any of the other
- * managed memory supporting devices on which contexts were previously created, even if
- * those contexts have been destroyed. These environment variables are described
- * in the CUDA programming guide under the "CUDA environment variables" section.
- * - On ARM, managed memory is not available on discrete GPUs with Drive PX-2.
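A sketch of a globally attached managed allocation that is touched first from the host (assuming a current context on a device reporting ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY; the cast through uintptr_t and the omitted error checking are editorial simplifications):

\code
    CUdeviceptr managed = 0;
    cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL);
    float *p = (float *)(uintptr_t)managed;  /* the same address is valid on the CPU */
    p[0] = 1.0f;                             /* populate from the host */
    /* ... launch kernels that read and write through the same pointer ... */
    cuMemFree(managed);
\endcode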
- * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, - * ::cudaMallocManaged - */ -CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given a PCI bus ID string. - * - * \param dev - Returned device handle - * - * \param pciBusId - String in one of the following forms: - * [domain]:[bus]:[device].[function] - * [domain]:[bus]:[device] - * [bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetPCIBusId, - * ::cudaDeviceGetByPCIBusId - */ -CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); - -/** - * \brief Returns a PCI Bus Id string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p pciBusId. \p len specifies the maximum length of the - * string that may be returned. - * - * \param pciBusId - Returned identifier string for the device in the following format - * [domain]:[bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. - * pciBusId should be large enough to store 13 characters including the NULL-terminator. - * - * \param len - Maximum length of string to store in \p name - * - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetByPCIBusId, - * ::cudaDeviceGetPCIBusId - */ -CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); - -/** - * \brief Gets an interprocess handle for a previously allocated event - * - * Takes as input a previously allocated event. This event must have been - * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING - * flags set. 
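A sketch of the PCI bus ID round trip provided by ::cuDeviceGetPCIBusId and ::cuDeviceGetByPCIBusId above (error checking omitted):

\code
    CUdevice dev = 0, sameDev = 0;
    char busId[16];                                        /* >= 13 bytes including the NUL */
    cuDeviceGet(&dev, 0);
    cuDeviceGetPCIBusId(busId, (int)sizeof(busId), dev);   /* e.g. "0000:65:00.0" */
    cuDeviceGetByPCIBusId(&sameDev, busId);                /* resolves back to the same device */
\endcode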
This opaque handle may be copied into other processes and - * opened with ::cuIpcOpenEventHandle to allow efficient hardware - * synchronization between GPU work in different processes. - * - * After the event has been opened in the importing process, - * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and - * ::cuEventQuery may be used in either process. Performing operations - * on the imported event after the exported event has been freed - * with ::cuEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to a user allocated CUipcEventHandle - * in which to return the opaque event handle - * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and - * ::CU_EVENT_DISABLE_TIMING flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetEventHandle - */ -CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); - -/** - * \brief Opens an interprocess event handle for use in the current process - * - * Opens an interprocess event handle exported from another process with - * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like - * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. - * This event must be freed with ::cuEventDestroy. - * - * Performing operations on the imported event after the exported event has - * been freed with ::cuEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param phEvent - Returns the imported event - * \param handle - Interprocess handle to open - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcGetEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcOpenEventHandle - */ -CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); - -/** - * \brief Gets an interprocess memory handle for an existing device memory - * allocation - * - * Takes a pointer to the base of an existing device memory allocation created - * with ::cuMemAlloc and exports it for use in another process. This is a - * lightweight operation and may be called multiple times on an allocation - * without adverse effects. - * - * If a region of memory is freed with ::cuMemFree and a subsequent call - * to ::cuMemAlloc returns memory with the same device address, - * ::cuIpcGetMemHandle will return a unique handle for the - * new memory. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. 
- * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return - * the handle in. - * \param dptr - Base pointer to previously allocated device memory - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetMemHandle - */ -CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); - -/** - * \brief Opens an interprocess memory handle exported from another process - * and returns a device pointer usable in the local process. - * - * Maps memory exported from another process with ::cuIpcGetMemHandle into - * the current device address space. For contexts on different devices - * ::cuIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is - * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. - * ::cuDeviceCanAccessPeer can determine if a mapping is possible. - * - * Contexts that may open ::CUipcMemHandles are restricted in the following way. - * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened - * by one ::CUcontext per ::CUdevice per other process. - * - * If the memory handle has already been opened by the current context, the - * reference count on the handle is incremented by 1 and the existing device pointer - * is returned. - * - * Memory returned from ::cuIpcOpenMemHandle must be freed with - * ::cuIpcCloseMemHandle. - * - * Calling ::cuMemFree on an exported memory region before calling - * ::cuIpcCloseMemHandle in the importing context will result in undefined - * behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pdptr - Returned device pointer - * \param handle - ::CUipcMemHandle to open - * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \note No guarantees are made about the address returned in \p *pdptr. - * In particular, multiple processes may not receive the same address for the same \p handle. - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcCloseMemHandle, - * ::cuCtxEnablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaIpcOpenMemHandle - */ -CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - -/** - * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle - * - * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. - * When the reference count reaches 0, this API unmaps the memory. The original allocation - * in the exporting process as well as imported mappings in other processes - * will be unaffected. - * - * Any resources used to enable peer access will be freed if this is the - * last mapping using them. 
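A sketch of the two sides of a memory-handle exchange using the IPC functions above; how the handle bytes travel between the processes (pipe, socket, shared file, ...) is outside the driver API and only indicated by comments:

\code
    /* Exporting process: */
    CUdeviceptr src = 0;
    CUipcMemHandle handle;
    cuMemAlloc(&src, 1 << 20);
    cuIpcGetMemHandle(&handle, src);
    /* ... send the bytes of handle to the importing process ... */

    /* Importing process (after receiving handle): */
    CUdeviceptr mapped = 0;
    cuIpcOpenMemHandle(&mapped, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    /* ... use mapped ... */
    cuIpcCloseMemHandle(mapped);
\endcode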
- * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cudaIpcCloseMemHandle - */ -CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); - -/** - * \brief Registers an existing host memory range for use by CUDA - * - * Page-locks the memory range specified by \p p and \p bytesize and maps it - * for the device(s) as specified by \p Flags. This memory range also is added - * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher bandwidth - * than pageable memory that has not been registered. Page-locking excessive - * amounts of memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to register staging areas for data exchange between - * host and device. - * - * This function has limited support on Mac OS X. OS 10.7 or higher is required. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address - * space. The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). - * - * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some - * I/O memory space, e.g. the PCI Express resource of a 3rd party device. - * - * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory - * that is considered read-only by the device. On platforms without - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is - * required in order to register memory mapped to the CPU as read-only. Support - * for the use of this flag can be queried from the device attribute - * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with - * a current context associated with a device that does not have this attribute - * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. - * - * All of these flags are orthogonal to one another: a developer may page-lock - * memory that is portable or mapped with no restrictions. - * - * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. - * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. 
- * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
- * match the original host pointer \p p and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
- * will match the original pointer \p p. If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only one of the two pointers and not both.
- *
- * The memory page-locked by this function must be unregistered with
- * ::cuMemHostUnregister().
- *
- * \param p - Host pointer to memory to page-lock
- * \param bytesize - Size in bytes of the address range to page-lock
- * \param Flags - Flags for allocation request
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- * ::CUDA_ERROR_NOT_PERMITTED,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa
- * ::cuMemHostUnregister,
- * ::cuMemHostGetFlags,
- * ::cuMemHostGetDevicePointer,
- * ::cudaHostRegister
- */
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-
-/**
- * \brief Unregisters a memory range that was registered with cuMemHostRegister.
- *
- * Unmaps the memory range whose base address is specified by \p p, and makes
- * it pageable again.
- *
- * The base address must be the same one specified to ::cuMemHostRegister().
- *
- * \param p - Host pointer to memory to unregister
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
- * \notefnerr
- *
- * \sa
- * ::cuMemHostRegister,
- * ::cudaHostUnregister
- */
-CUresult CUDAAPI cuMemHostUnregister(void *p);
-
-/**
- * \brief Copies memory
- *
- * Copies data between two pointers.
- * \p dst and \p src are base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- * Note that this function infers the type of the transfer (host to host, host to
- * device, device to device, or device to host) from the pointer values. This
- * function is only allowed in contexts which support unified addressing.
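Relating to ::cuMemHostRegister and ::cuMemHostUnregister above, a sketch of temporarily pinning an ordinary malloc'ed buffer (assuming a current context; error checking omitted):

\code
    size_t bytes = 1 << 20;
    void *buf = malloc(bytes);                /* ordinary pageable allocation */
    cuMemHostRegister(buf, bytes, CU_MEMHOSTREGISTER_DEVICEMAP);
    /* ... transfers involving buf are now accelerated like pinned memory ... */
    cuMemHostUnregister(buf);
    free(buf);
\endcode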
- * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - -/** - * \brief Copies device memory between two contexts - * - * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeer - */ -CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol - */ -CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
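A sketch of the common ::cuMemcpyHtoD / ::cuMemcpyDtoH round trip (assuming a current context; error checking omitted):

\code
    float host_in[256] = {0}, host_out[256];
    CUdeviceptr dev = 0;
    cuMemAlloc(&dev, sizeof(host_in));
    cuMemcpyHtoD(dev, host_in, sizeof(host_in));    /* host -> device */
    /* ... launch a kernel that works on dev ... */
    cuMemcpyDtoH(host_out, dev, sizeof(host_out));  /* device -> host */
    cuMemFree(dev);
\endcode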
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Array - * - * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting index of the destination data. - * \p srcDevice specifies the base pointer of the source. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Array to Device - * - * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the CUDA - * array elements. \p srcArray and \p srcOffset specify the CUDA array handle - * and the offset in bytes into the array where the copy is to begin. - * \p ByteCount specifies the number of bytes to copy and must be evenly - * divisible by the array element size. 
- * - * \param dstDevice - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the destination - * data. \p pSrc specifies the base address of the source. \p ByteCount specifies - * the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. 
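A sketch of the 1D-array copies above, using ::cuArrayCreate to build a 256-element float array (assuming a current context; error checking omitted):

\code
    CUDA_ARRAY_DESCRIPTOR desc = {0};
    CUarray arr = NULL;
    float data[256] = {0};
    desc.Width = 256;
    desc.Height = 0;                             /* 0 height: a 1D CUDA array */
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    cuArrayCreate(&arr, &desc);
    cuMemcpyHtoA(arr, 0, data, sizeof(data));    /* host -> array */
    cuMemcpyAtoH(data, arr, 0, sizeof(data));    /* array -> host */
    cuArrayDestroy(arr);
\endcode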
- * - * \param dstHost - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Array to Array - * - * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray - * specify the handles of the destination and source CUDA arrays for the copy, - * respectively. \p dstOffset and \p srcOffset specify the destination and - * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of - * bytes to be copied. The size of the elements in the CUDA arrays need not be - * the same format, but the elements must be the same size; and count must be - * evenly divisible by that size. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyArrayToArray - */ -CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. 
- * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
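/*
 * Illustrative sketch: copying a tightly packed host image into a pitched
 * device allocation with cuMemcpy2D. Assumes a current context; `width` and
 * `height` are in 8-bit elements, and all names are placeholders. Zeroing the
 * unused CUDA_MEMCPY2D fields leaves every offset at 0.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_pitched(const unsigned char *host_img,
                               size_t width, size_t height) {
    CUdeviceptr d_img;
    size_t d_pitch;
    CUDA_MEMCPY2D cpy;
    CUresult err;

    /* cuMemAllocPitch returns a pitch that cuMemcpy2D always accepts. */
    err = cuMemAllocPitch(&d_img, &d_pitch, width, height, 4);
    if (err != CUDA_SUCCESS) return err;

    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = host_img;
    cpy.srcPitch      = width;        /* packed rows on the host   */
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_img;
    cpy.dstPitch      = d_pitch;      /* padded rows on the device */
    cpy.WidthInBytes  = width;
    cpy.Height        = height;

    err = cuMemcpy2D(&cpy);
    cuMemFree(d_img);                 /* freed immediately: demo only */
    return err;
}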
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. 
- * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. 
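/*
 * Illustrative sketch: cuMemcpy2DUnaligned takes the same CUDA_MEMCPY2D
 * descriptor as cuMemcpy2D, but also accepts device pitches that did not come
 * from cuMemAllocPitch, at a potential performance cost. The device pointers
 * and pitches below are assumed to exist already.
 */
#include <cuda.h>
#include <string.h>

static CUresult copy_subrect_unaligned(CUdeviceptr dst, size_t dst_pitch,
                                       CUdeviceptr src, size_t src_pitch,
                                       size_t width_bytes, size_t height) {
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.srcDevice     = src;
    cpy.srcPitch      = src_pitch;   /* arbitrary, possibly unaligned pitch */
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = dst;
    cpy.dstPitch      = dst_pitch;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    return cuMemcpy2DUnaligned(&cpy);
}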
The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. 
- * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy3D - */ -CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); - -/** - * \brief Copies memory between contexts - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeer - */ -CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - -/** - * \brief Copies memory asynchronously - * - * Copies data between two pointers. 
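/*
 * Illustrative sketch: copying a packed host volume into linear device memory
 * with cuMemcpy3D. srcPitch/srcHeight describe the host layout and dstPitch/
 * dstHeight the device layout; zero-initializing the struct keeps srcLOD and
 * dstLOD at 0 as required. All parameter names are placeholders.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_volume(CUdeviceptr d_vol, size_t d_pitch,
                              const unsigned char *h_vol,
                              size_t width_bytes, size_t height, size_t depth) {
    CUDA_MEMCPY3D cpy;
    memset(&cpy, 0, sizeof(cpy));        /* offsets and LODs stay 0 */

    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = h_vol;
    cpy.srcPitch      = width_bytes;     /* packed rows    */
    cpy.srcHeight     = height;          /* rows per slice */

    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_vol;
    cpy.dstPitch      = d_pitch;
    cpy.dstHeight     = height;

    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    cpy.Depth         = depth;
    return cuMemcpy3D(&cpy);
}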
- * \p dst and \p src are base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. - * Note that this function infers the type of the transfer (host to host, host to - * device, device to device, or device to host) from the pointer values. This - * function is only allowed in contexts which support unified addressing. - * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies device memory between two contexts asynchronously. - * - * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeerAsync - */ -CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
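/*
 * Illustrative sketch: cuMemcpyAsync infers the copy direction from its two
 * unified-virtual-address pointers, so a host pointer is simply cast to
 * CUdeviceptr. Assumes unified addressing is supported in the current context
 * and that `h_src` is page-locked so the copy can actually overlap; the caller
 * must synchronize `stream` before reusing the buffers.
 */
#include <cuda.h>
#include <stdint.h>

static CUresult async_upload(CUdeviceptr d_dst, const void *h_src,
                             size_t bytes, CUstream stream) {
    /* Under UVA, host and device pointers share one address space. */
    return cuMemcpyAsync(d_dst, (CUdeviceptr)(uintptr_t)h_src, bytes, stream);
}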
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync - */ -CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
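/*
 * Illustrative sketch: a pinned staging buffer (cuMemAllocHost) combined with
 * cuMemcpyHtoDAsync/cuMemcpyDtoHAsync on one stream. The copies are only
 * guaranteed complete after cuStreamSynchronize. Assumes a current context and
 * an existing device buffer `d_buf` of at least `bytes` bytes.
 */
#include <cuda.h>
#include <string.h>

static CUresult staged_roundtrip(CUdeviceptr d_buf, size_t bytes) {
    void *h_staging = NULL;
    CUstream stream;
    CUresult err;

    err = cuMemAllocHost(&h_staging, bytes);      /* page-locked host memory */
    if (err != CUDA_SUCCESS) return err;
    err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
    if (err != CUDA_SUCCESS) { cuMemFreeHost(h_staging); return err; }

    memset(h_staging, 0xAB, bytes);               /* something to upload */
    err = cuMemcpyHtoDAsync(d_buf, h_staging, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuMemcpyDtoHAsync(h_staging, d_buf, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuStreamSynchronize(stream);        /* h_staging valid after this */

    cuStreamDestroy(stream);
    cuMemFreeHost(h_staging);
    return err;
}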
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the - * destination data. \p srcHost specifies the base address of the source. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyToArrayAsync - */ -CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. 
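/*
 * Illustrative sketch: queueing a device-to-device copy with cuMemcpyDtoDAsync.
 * Both buffers are assumed to be allocated in the current context; the copy is
 * ordered after work already enqueued in `stream`.
 */
#include <cuda.h>

static CUresult snapshot_on_stream(CUdeviceptr d_dst, CUdeviceptr d_src,
                                   size_t bytes, CUstream stream) {
    /* Returns immediately; synchronize the stream before reading d_dst
     * from the host. */
    return cuMemcpyDtoDAsync(d_dst, d_src, bytes, stream);
}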
\p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstHost - Destination pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyFromArrayAsync - */ -CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. - * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. 
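/*
 * Illustrative sketch: asynchronous traffic to and from a 1D CUDA array with
 * cuMemcpyHtoAAsync/cuMemcpyAtoHAsync. For the copies to overlap, `h_in` and
 * `h_out` should be page-locked (for example, from cuMemAllocHost). The array
 * and stream are assumed to exist already; offsets are in bytes.
 */
#include <cuda.h>

static CUresult array_roundtrip_async(CUarray arr, const void *h_in,
                                      void *h_out, size_t bytes,
                                      CUstream stream) {
    CUresult err = cuMemcpyHtoAAsync(arr, 0, h_in, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuMemcpyAtoHAsync(h_out, arr, 0, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuStreamSynchronize(stream);   /* h_out is valid after this */
    return err;
}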
::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy2DAsync, - * ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpy2DFromArrayAsync - */ -CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. 
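/*
 * Illustrative sketch: the asynchronous variant of the pitched upload shown
 * for cuMemcpy2D, issued on a stream. The host rows should be page-locked for
 * the transfer to overlap with other work; all names are placeholders.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_pitched_async(CUdeviceptr d_img, size_t d_pitch,
                                     const unsigned char *h_img,
                                     size_t width_bytes, size_t height,
                                     CUstream stream) {
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = h_img;
    cpy.srcPitch      = width_bytes;
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_img;
    cpy.dstPitch      = d_pitch;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    return cuMemcpy2DAsync(&cpy, stream);
}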
- * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy3DAsync - */ -CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - -/** - * \brief Copies memory between contexts asynchronously. - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. - * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeerAsync - */ -CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. 
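/*
 * Illustrative sketch: streaming a linear device volume into a 3D CUDA array
 * with cuMemcpy3DAsync. When the destination is an array, dstPitch and
 * dstHeight are ignored and only dstArray is consulted. The array is assumed
 * to have been created elsewhere (for example with cuArray3DCreate) with
 * matching extents.
 */
#include <cuda.h>
#include <string.h>

static CUresult device_to_array_async(CUarray dst_arr,
                                      CUdeviceptr d_src, size_t src_pitch,
                                      size_t width_bytes, size_t height,
                                      size_t depth, CUstream stream) {
    CUDA_MEMCPY3D cpy;
    memset(&cpy, 0, sizeof(cpy));          /* offsets and LODs must be 0 */
    cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.srcDevice     = d_src;
    cpy.srcPitch      = src_pitch;
    cpy.srcHeight     = height;
    cpy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cpy.dstArray      = dst_arr;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    cpy.Depth         = depth;
    return cuMemcpy3DAsync(&cpy, stream);
}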
- * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. - * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
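/*
 * Illustrative sketch: blocking memsets on linear device memory. Note that N
 * counts elements, not bytes, and that cuMemsetD16 requires a 2-byte-aligned
 * pointer. The buffer is freed immediately because this is only a demo.
 */
#include <cuda.h>

static CUresult memset_demo(size_t count) {
    CUdeviceptr d_buf;
    CUresult err = cuMemAlloc(&d_buf, count * sizeof(unsigned short));
    if (err != CUDA_SUCCESS) return err;

    err = cuMemsetD8(d_buf, 0x00, count * sizeof(unsigned short)); /* N bytes */
    if (err == CUDA_SUCCESS)
        err = cuMemsetD16(d_buf, 0xFFFF, count);   /* N 16-bit elements */

    cuMemFree(d_buf);
    return err;
}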
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
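/*
 * Illustrative sketch: clearing a pitched 8-bit image with cuMemsetD2D8.
 * Width is in elements (which equal bytes for 8-bit data) and Height in rows;
 * using the pitch returned by cuMemAllocPitch keeps the memset on its fast
 * path. Names are placeholders and error handling is minimal.
 */
#include <cuda.h>

static CUresult clear_image(size_t width, size_t height) {
    CUdeviceptr d_img;
    size_t pitch;
    CUresult err = cuMemAllocPitch(&d_img, &pitch, width, height, 4);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemsetD2D8(d_img, pitch, 0, width, height);

    cuMemFree(d_img);
    return err;
}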
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. 
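/*
 * Illustrative sketch: cuMemsetD2D32 writes a 32-bit pattern, so it can also
 * initialize a pitched float image to a non-zero value by passing the IEEE-754
 * bit pattern (0x3F800000 is 1.0f). Width counts 32-bit elements per row, and
 * both the base pointer and the pitch must be 4-byte aligned. The pitched
 * allocation is assumed to exist already.
 */
#include <cuda.h>

static CUresult fill_ones(CUdeviceptr d_img, size_t pitch_bytes,
                          size_t width_elems, size_t height) {
    return cuMemsetD2D32(d_img, pitch_bytes, 0x3F800000u, width_elems, height);
}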
- * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. - * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
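/*
 * Illustrative sketch: an asynchronous clear queued ahead of later work on the
 * same stream with cuMemsetD8Async. Like the asynchronous copies, it may still
 * be executing when the call returns, so host code must synchronize the stream
 * before depending on the result.
 */
#include <cuda.h>

static CUresult clear_async(CUdeviceptr d_buf, size_t bytes, CUstream stream) {
    return cuMemsetD8Async(d_buf, 0, bytes, stream);
}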
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. 
This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
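- *
- * A minimal sketch of a pitched fill (assuming a current context and a stream
- * \p hStream created by the caller; error checking omitted):
- * \code
-     CUdeviceptr dptr;
-     size_t pitch;                     // in bytes, chosen by the driver
-     size_t width = 640, height = 480; // width in elements
-     cuMemAllocPitch(&dptr, &pitch, width * sizeof(unsigned int), height, sizeof(unsigned int));
-     cuMemsetD2D32Async(dptr, pitch, 0, width, height, hStream);  // zero the whole 2D region
- * \endcode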
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Creates a 1D or 2D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. - * The ::CUDA_ARRAY_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - CUarray_format Format; - unsigned int NumChannels; - } CUDA_ARRAY_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, and \p Height are the width, and height of the CUDA array (in - * elements); the CUDA array is one-dimensional if height is 0, two-dimensional - * otherwise; - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 1; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - * \endcode - * - * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit - * float16's: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - * \endcode - * - * Description for a \p width x \p height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * \code - 
CUDA_ARRAY_DESCRIPTOR arrayDesc; - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - desc.NumChannels = 2; - desc.Width = width; - desc.Height = height; - * \endcode - * - * \param pHandle - Returned array - * \param pAllocateArray - Array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocArray - */ -CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 1D or 2D CUDA array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. - * - * \param pArrayDescriptor - Returned array descriptor - * \param hArray - Array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Returns the layout properties of a sparse CUDA array - * - * Returns the layout properties of a sparse CUDA array in \p sparseProperties - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. - * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. - * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. 
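- *
- * For reference, a minimal ::cuArrayCreate sketch (assuming a current context;
- * error checking omitted) that creates a 64 x 64 array of floats and reads its
- * descriptor back:
- * \code
-     CUDA_ARRAY_DESCRIPTOR desc;
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 1;
-     desc.Width       = 64;
-     desc.Height      = 64;
-     CUarray array;
-     cuArrayCreate(&array, &desc);
-     CUDA_ARRAY_DESCRIPTOR queried;
-     cuArrayGetDescriptor(&queried, array);   // queried.Width == 64, queried.Height == 64
-     cuArrayDestroy(array);
- * \endcode
- *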
For CUDA arrays obtained - * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties - * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] array - CUDA array to get the sparse properties of - * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); - -/** - * \brief Returns the layout properties of a sparse CUDA mipmapped array - * - * Returns the sparse array layout properties in \p sparseProperties - * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the - * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth - * is less than that of the tile. - * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. - * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. - * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of - * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); - - -/** - * \brief Returns the memory requirements of a CUDA array - * - * Returns the memory requirements of a CUDA array in \p memoryRequirements - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA array. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] array - CUDA array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device); - -/** - * \brief Returns the memory requirements of a CUDA mipmapped array - * - * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements - * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. 
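- *
- * A minimal query sketch for the sparse-property case described above (here
- * \p sparseArray is assumed to be a CUDA array created with the ::CUDA_ARRAY3D_SPARSE flag):
- * \code
-     CUDA_ARRAY_SPARSE_PROPERTIES props;
-     if (cuArrayGetSparseProperties(&props, sparseArray) == CUDA_SUCCESS) {
-         // props.tileExtent holds the tile dimensions and props.miptailSize the mip tail size.
-     }
- * \endcode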
- * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA mipmapped array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA mipmapped - * array. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); - - -/** - * \brief Gets a CUDA array plane from a CUDA array - * - * Returns in \p pPlaneArray a CUDA array that represents a single format plane - * of the CUDA array \p hArray. - * - * If \p planeIdx is greater than the maximum number of planes in this array or if the array does - * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns - * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width - * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * - * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx - * \param hArray - Multiplanar CUDA array - * \param planeIdx - Plane index - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuArrayCreate, - * ::cudaGetArrayPlane - */ -CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); - -/** - * \brief Destroys a CUDA array - * - * Destroys the CUDA array \p hArray. - * - * \param hArray - Array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeArray - */ -CUresult CUDAAPI cuArrayDestroy(CUarray hArray); - -/** - * \brief Creates a 3D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
- * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D array is allocated if only \p Depth extent is zero. - * - A 3D array is allocated if all three extents are non-zero. - * - A 1D layered CUDA array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. - * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array - * to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. 
Note that for brevity's sake, the full name of the device attribute
- * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
- *
- * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
- * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
- * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
- *
- * For each CUDA array type, the extents below are given as
- * {(width range in elements), (height range), (depth range)}. The first entry lists the valid
- * extents that must always be met; the second lists the valid extents with ::CUDA_ARRAY3D_SURFACE_LDST set.
- *
- * 1D:
- *   always:            { (1,TEXTURE1D_WIDTH), 0, 0 }
- *   with SURFACE_LDST: { (1,SURFACE1D_WIDTH), 0, 0 }
- * 2D:
- *   always:            { (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }
- *   with SURFACE_LDST: { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
- * 3D:
- *   always:            { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- *                      OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
- *   with SURFACE_LDST: { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
- * 1D Layered:
- *   always:            { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
- * 2D Layered:
- *   always:            { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
- * Cubemap:
- *   always:            { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
- * Cubemap Layered:
- *   always:            { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
- * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 0; - desc.Depth = 0; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - desc.Depth = 0; - * \endcode - * - * Description for a \p width x \p height x \p depth CUDA array of 64-bit, - * 4x16-bit float16's: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - desc.Depth = depth; - * \endcode - * - * \param pHandle - Returned array - * \param pAllocateArray - 3D array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc3DArray - */ -CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 3D CUDA array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the \p Height - * and/or \p Depth members of the descriptor struct will be set to 0. 
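- *
- * A minimal sketch (assuming a current context; error checking omitted) that
- * creates a 2D layered array with ::cuArray3DCreate and queries it:
- * \code
-     CUDA_ARRAY3D_DESCRIPTOR desc = {0};
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 1;
-     desc.Width       = 256;
-     desc.Height      = 256;
-     desc.Depth       = 8;                    // number of layers, because of the flag below
-     desc.Flags       = CUDA_ARRAY3D_LAYERED;
-     CUarray layered;
-     cuArray3DCreate(&layered, &desc);
-     CUDA_ARRAY3D_DESCRIPTOR queried;
-     cuArray3DGetDescriptor(&queried, layered);
- * \endcode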
- * - * \param pArrayDescriptor - Returned 3D array descriptor - * \param hArray - 3D array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Creates a CUDA mipmapped array - * - * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. - * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is - * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. - * - * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D mipmapped array is allocated if only \p Depth extent is zero. - * - A 3D mipmapped array is allocated if all three extents are non-zero. - * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. 
The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of - * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to - * bind a mipmap level of the CUDA mipmapped array to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA mipmapped arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
For each CUDA array type, the extents below are given as
- * {(width range in elements), (height range), (depth range)}. The first entry lists the valid
- * extents that must always be met; the second lists the valid extents with ::CUDA_ARRAY3D_SURFACE_LDST set.
- *
- * 1D:
- *   always:            { (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }
- *   with SURFACE_LDST: { (1,SURFACE1D_WIDTH), 0, 0 }
- * 2D:
- *   always:            { (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }
- *   with SURFACE_LDST: { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
- * 3D:
- *   always:            { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- *                      OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
- *   with SURFACE_LDST: { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
- * 1D Layered:
- *   always:            { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
- * 2D Layered:
- *   always:            { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
- * Cubemap:
- *   always:            { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
- * Cubemap Layered:
- *   always:            { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
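- *
- * A minimal sketch (assuming a current context; error checking omitted) that
- * allocates a fully mipmapped 1024 x 1024 array and fetches its base level:
- * \code
-     CUDA_ARRAY3D_DESCRIPTOR desc = {0};
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 4;
-     desc.Width       = 1024;
-     desc.Height      = 1024;
-     desc.Depth       = 0;                        // plain 2D mipmapped array
-     CUmipmappedArray mipmap;
-     cuMipmappedArrayCreate(&mipmap, &desc, 11);  // 11 == 1 + floor(log2(1024))
-     CUarray level0;
-     cuMipmappedArrayGetLevel(&level0, mipmap, 0);
- * \endcode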
- * - * - * \param pHandle - Returned mipmapped array - * \param pMipmappedArrayDesc - mipmapped array descriptor - * \param numMipmapLevels - Number of mipmap levels - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuMipmappedArrayDestroy, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaMallocMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); - -/** - * \brief Gets a mipmap level of a CUDA mipmapped array - * - * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level - * of the CUDA mipmapped array \p hMipmappedArray. - * - * If \p level is greater than the maximum number of levels in this mipmapped array, - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param pLevelArray - Returned mipmap level CUDA array - * \param hMipmappedArray - CUDA mipmapped array - * \param level - Mipmap level - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayDestroy, - * ::cuArrayCreate, - * ::cudaGetMipmappedArrayLevel - */ -CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); - -/** - * \brief Destroys a CUDA mipmapped array - * - * Destroys the CUDA mipmapped array \p hMipmappedArray. - * - * \param hMipmappedArray - Mipmapped array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaFreeMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); - -/** @} */ /* END CUDA_MEM */ - -/** - * \defgroup CUDA_VA Virtual Memory Management - * - * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the virtual memory management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** -* \brief Allocate an address range reservation. -* -* Reserves a virtual address range based on the given parameters, giving -* the starting address of the range in \p ptr. This API requires a system that -* supports UVA. The size and address parameters must be a multiple of the -* host page size and the alignment must be a power of two or zero for default -* alignment. 
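-*
-* A minimal sketch (error checking omitted; the size is assumed to be a multiple
-* of the host page size):
-* \code
-     CUdeviceptr va = 0;
-     size_t size = 64ULL << 20;
-     cuMemAddressReserve(&va, size, 0, 0, 0);  // default alignment, no fixed address
-     // ... map physical allocations into [va, va + size) with cuMemMap ...
-     cuMemAddressFree(va, size);
-* \endcode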
-* -* \param[out] ptr - Resulting pointer to start of virtual address range allocated -* \param[in] size - Size of the reserved virtual address range requested -* \param[in] alignment - Alignment of the reserved virtual address range requested -* \param[in] addr - Fixed starting address range requested -* \param[in] flags - Currently unused, must be zero -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressFree -*/ -CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); - -/** -* \brief Free an address range reservation. -* -* Frees a virtual address range reserved by cuMemAddressReserve. The size -* must match what was given to memAddressReserve and the ptr given must -* match what was returned from memAddressReserve. -* -* \param[in] ptr - Starting address of the virtual address range to free -* \param[in] size - Size of the virtual address region to free -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); - -/** -* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties -* -* This creates a memory allocation on the target device specified through the -* \p prop strcuture. The created allocation will not have any device or host -* mappings. The generic memory \p handle for the allocation can be -* mapped to the address space of calling process via ::cuMemMap. This handle -* cannot be transmitted directly to other processes (see -* ::cuMemExportToShareableHandle). On Windows, the caller must also pass -* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which -* limits or allows access to this handle for a recepient process (see -* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this -* allocation must be a multiple of the the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM -* flag. -* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then -* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays -* and sparse CUDA mipmapped arrays. -* (see ::cuMemMapArrayAsync). -* -* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. -* \param[in] size - Size of the allocation requested -* \param[in] prop - Properties of the allocation to create. -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); - -/** -* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. 
-* -* Frees the memory that was allocated on a device through cuMemCreate. -* -* The memory allocation will be freed when all outstanding mappings to the memory -* are unmapped and when all outstanding references to the handle (including it's -* shareable counterparts) are also released. The generic memory handle can be -* freed when there are still outstanding mappings made with this handle. Each -* time a recepient process imports a shareable handle, it needs to pair it with -* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle -* the behavior is undefined. -* -* \param[in] handle Value of handle which was returned previously by cuMemCreate. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemCreate -*/ -CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); - -/** -* \brief Maps an allocation handle to a reserved virtual address range. -* -* Maps bytes of memory represented by \p handle starting from byte \p offset to -* \p size to address range [\p addr, \p addr + \p size]. This range must be an -* address reservation previously reserved with ::cuMemAddressReserve, and -* \p offset + \p size must be less than the size of the memory allocation. -* Both \p ptr, \p size, and \p offset must be a multiple of the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. -* -* Please note calling ::cuMemMap does not make the address accessible, -* the caller needs to update accessibility of a contiguous mapped VA -* range by calling ::cuMemSetAccess. -* -* Once a recipient process obtains a shareable memory handle -* from ::cuMemImportFromShareableHandle, the process must -* use ::cuMemMap to map the memory into its address ranges before -* setting accessibility with ::cuMemSetAccess. -* -* ::cuMemMap can only create mappings on VA range reservations -* that are not currently mapped. -* -* \param[in] ptr - Address where memory will be mapped. -* \param[in] size - Size of the memory mapping. -* \param[in] offset - Offset into the memory represented by -* - \p handle from which to start mapping -* - Note: currently must be zero. -* \param[in] handle - Handle to a shareable memory -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); - -/** - * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays - * - * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. - * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. 
- * The structure ::CUarrayMapInfo is defined as follow: - \code - typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; - union { - struct { - unsigned int level; - unsigned int layer; - unsigned int offsetX; - unsigned int offsetY; - unsigned int offsetZ; - unsigned int extentWidth; - unsigned int extentHeight; - unsigned int extentDepth; - } sparseLevel; - struct { - unsigned int layer; - unsigned long long offset; - unsigned long long size; - } miptail; - } subresource; - - CUmemOperationType memOperationType; - - CUmemHandleType memHandleType; - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; - unsigned int deviceBitMask; - unsigned int flags; - unsigned int reserved[2]; - } CUarrayMapInfo; - \endcode - * - * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. - * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then - * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. - * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using - * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - - * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. - * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY - * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. - * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been - * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - - * - * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. - * ::CUarraySparseSubresourceType_enum is defined as: - \code - typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 - } CUarraySparseSubresourceType; - \endcode - * - * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a - * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which - * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by - * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL - * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. - * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY - * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. - * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight - * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. - * These offsets and extents must be aligned to the corresponding tile dimension. 
- * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, - * must be zero. - * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, - * must be zero. - * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth - * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. - * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL - * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in - * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. - * Both, mip tail offset and mip tail size must be aligned to the tile size. - * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags - * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. - * Otherwise, must be zero. - * - - * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING - * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored. - * - - * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: - \code - typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 - } CUmemOperationType; - \endcode - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource - * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. - * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, - * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. - * - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation - * is performed. ::CUarrayMapInfo::memHandle must be NULL. - * - * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. - * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match - * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. - * - * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. 
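- *
- * A minimal map-operation sketch. Here \p sparseArray, \p tilePool (an allocation
- * created with ::CU_MEM_CREATE_USAGE_TILE_POOL), \p tileWidth, \p tileHeight,
- * \p deviceOrdinal and \p hStream are assumed to be provided by the caller:
- \code
-     CUarrayMapInfo info = {0};
-     info.resourceType    = CU_RESOURCE_TYPE_ARRAY;
-     info.resource.array  = sparseArray;
-     info.subresourceType = CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL;
-     info.subresource.sparseLevel.level        = 0;
-     info.subresource.sparseLevel.extentWidth  = tileWidth;   // one tile, taken from the
-     info.subresource.sparseLevel.extentHeight = tileHeight;  // array's sparse properties
-     info.subresource.sparseLevel.extentDepth  = 1;           // 2D array
-     info.memOperationType    = CU_MEM_OPERATION_TYPE_MAP;
-     info.memHandleType       = CU_MEM_HANDLE_TYPE_GENERIC;
-     info.memHandle.memHandle = tilePool;
-     info.offset              = 0;                            // offset into the tile pool
-     info.deviceBitMask       = 1 << deviceOrdinal;           // must match the stream's device
-     cuMemMapArrayAsync(&info, 1, hStream);
- \endcode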
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \param[in] mapInfoList - List of ::CUarrayMapInfo - * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList - * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations - * - * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties - */ -CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - -/** -* \brief Unmap the backing memory of a given address range. -* -* The range must be the entire contiguous address range that was mapped to. In -* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped -* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed -* if there are no existing mappings and there are no unreleased memory handles. -* -* When ::cuMemUnmap returns successfully the address range is converted to an -* address reservation and can be used for a future calls to ::cuMemMap. Any new -* mapping to this virtual address will need to have access granted through -* ::cuMemSetAccess, as all mappings start with no accessibility setup. -* -* \param[in] ptr - Starting address for the virtual address range to unmap -* \param[in] size - Size of the virtual address range to unmap -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemCreate, ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); - -/** -* \brief Set the access flags for each location specified in \p desc for the given virtual address range -* -* Given the virtual address range via \p ptr and \p size, and the locations -* in the array given by \p desc and \p count, set the access flags for the -* target locations. The range must be a fully mapped address range -* containing all allocations created by ::cuMemMap / ::cuMemCreate. 
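-*
-* A minimal end-to-end sketch of the virtual memory workflow (device ordinal 0
-* assumed; error checking omitted):
-* \code
-     CUmemAllocationProp prop = {0};
-     prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
-     prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-     prop.location.id   = 0;                                 // device ordinal
-     size_t gran = 0;
-     cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
-     size_t size = 2 * gran;                                 // multiple of the granularity
-     CUmemGenericAllocationHandle handle;
-     cuMemCreate(&handle, size, &prop, 0);
-     CUdeviceptr va;
-     cuMemAddressReserve(&va, size, 0, 0, 0);
-     cuMemMap(va, size, 0, handle, 0);
-     CUmemAccessDesc access = {0};
-     access.location = prop.location;
-     access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-     cuMemSetAccess(va, size, &access, 1);                   // range is now usable on device 0
-     // ... use the memory ...
-     cuMemUnmap(va, size);
-     cuMemRelease(handle);
-     cuMemAddressFree(va, size);
-* \endcode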
-* -* \param[in] ptr - Starting address for the virtual address range -* \param[in] size - Length of the virtual address range -* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the -* - mapping for each location specified -* \param[in] count - Number of ::CUmemAccessDesc in \p desc -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap -*/ -CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); - -/** -* \brief Get the access \p flags set for the given \p location and \p ptr -* -* \param[out] flags - Flags set for this location -* \param[in] location - Location in which to check the flags for -* \param[in] ptr - Address in which to check the access flags for -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemSetAccess -*/ -CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); - -/** -* \brief Exports an allocation to a requested shareable handle type -* -* Given a CUDA memory handle, create a shareable memory -* allocation handle that can be used to share the memory with other -* processes. The recipient process can convert the shareable handle back into a -* CUDA memory handle using ::cuMemImportFromShareableHandle and map -* it with ::cuMemMap. The implementation of what this handle is and how it -* can be transferred is defined by the requested handle type in \p handleType -* -* Once all shareable handles are closed and the allocation is released, the allocated -* memory referenced will be released back to the OS and uses of the CUDA handle afterward -* will lead to undefined behavior. -* -* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) -* that support importing memory from the shareable type -* -* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type -* \param[in] handle - CUDA handle for the memory allocation -* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) -* \param[in] flags - Reserved, must be zero -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** -* \brief Imports an allocation from a requested shareable handle type. -* -* If the current process cannot support the memory described by this shareable -* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. -* -* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) -* created on devices under an SLI group may not be supported, and thus this API will -* return CUDA_ERROR_NOT_SUPPORTED. -* There is no guarantee that the contents of \p handle will be the same CUDA memory handle -* for the same given OS shareable handle, or the same underlying allocation. -* -* \param[out] handle - CUDA Memory handle for the memory allocation. 
-* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. -* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease -*/ -CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); - -/** -* \brief Calculates either the minimal or recommended granularity -* -* Calculates either the minimal or recommended granularity -* for a given allocation specification and returns it in granularity. This -* granularity can be used as a multiple for alignment, size, or address mapping. -* -* \param[out] granularity Returned granularity. -* \param[in] prop Property for which to determine the granularity for -* \param[in] option Determines which granularity to return -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemMap -*/ -CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); - -/** -* \brief Retrieve the contents of the property structure defining properties for this handle -* -* \param[out] prop - Pointer to a properties structure which will hold the information about this handle -* \param[in] handle - Handle which to perform the query on -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); - -/** -* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. -* -* The handle is guaranteed to be the same handle value used to map the memory. If the address -* requested is not mapped, the function will fail. The returned handle must be released with -* corresponding number of calls to ::cuMemRelease. -* -* \note The address \p addr, can be any address in a range previously mapped -* by ::cuMemMap, and not necessarily the start address. -* -* \param[out] handle CUDA Memory handle for the backing memory allocation. -* \param[in] addr Memory address to query, that has been mapped previously. -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap -*/ -CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); - -/** @} */ /* END CUDA_VA */ - -/** - * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator - * - * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. - * Functions for controlling the behavior of the underlying allocator. - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream ordered memory allocator exposed by the - * low-level CUDA driver application programming interface. 
- * - * @{ - * - * \section CUDA_MALLOC_ASYNC_overview overview - * - * The asynchronous allocator allows the user to allocate and free in stream order. - * All asynchronous accesses of the allocation must happen between - * the stream executions of the allocation and the free. If the memory is accessed - * outside of the promised stream order, a use before allocation / use after free error - * will cause undefined behavior. - * - * The allocator is free to reallocate the memory as long as it can guarantee - * that compliant memory accesses will not overlap temporally. - * The allocator may refer to internal stream ordering as well as inter-stream dependencies - * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. - * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. - * - * \section CUDA_MALLOC_ASYNC_support Supported Platforms - * - * Whether or not a device supports the integrated stream ordered memory allocator - * may be queried by calling ::cuDeviceGetAttribute() with the device attribute - * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED - */ - -/** - * \brief Frees memory with stream ordered semantics - * - * Inserts a free operation into \p hStream. - * The allocation must not be accessed after stream execution reaches the free. - * After this API returns, accessing the memory from any subsequent work launched on the GPU - * or querying its pointer attributes results in undefined behavior. - * - * \note During stream capture, this function results in the creation of a free node and - * must therefore be passed the address of a graph allocation. - * - * \param dptr - memory to free - * \param hStream - The stream establishing the stream ordering contract. - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED - */ -CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - -/** - * \brief Allocates memory with stream ordered semantics - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the memory pool current to the stream's device. - * - * \note The default memory pool of a device contains device memory from that device. - * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. - * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. The memory pool's properties - * are used to set the node's creation parameters. 
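- *
- * A minimal stream-ordered sketch (assuming a caller-created stream \p hStream
- * on a device that reports ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED):
- * \code
-     CUdeviceptr dptr;
-     cuMemAllocAsync(&dptr, 1 << 20, hStream);    // usable by later work in hStream
-     cuMemsetD8Async(dptr, 0, 1 << 20, hStream);  // ordered after the allocation
-     cuMemFreeAsync(dptr, hStream);               // no accesses may be ordered after the free
- * \endcode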
- * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - -/** - * \brief Tries to release memory back to the OS - * - * Releases memory back to the OS until the pool contains fewer than minBytesToKeep - * reserved bytes, or there is no more memory that the allocator can safely release. - * The allocator cannot release OS allocations that back outstanding asynchronous allocations. - * The OS allocations may happen at different granularity from the user allocations. - * - * \note: Allocations that have not been freed count as outstanding. - * \note: Allocations that have been asynchronously freed but whose completion has - * not been observed on the host (eg. by a synchronize) can count as outstanding. - * - * \param[in] pool - The memory pool to trim - * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, - * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have - * at least minBytesToKeep bytes reserved after the operation. - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); - -/** - * \brief Sets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of backing memory that was - * allocated for the memory pool. 
It is illegal to set this attribute to a non-zero value. - * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of used memory that was - * allocated for the memory pool. - * - * \param[in] pool - The memory pool to modify - * \param[in] attr - The attribute to modify - * \param[in] value - Pointer to the value to assign - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Gets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. - * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application. - * - * \param[in] pool - The memory pool to get attributes of - * \param[in] attr - The attribute to get - * \param[out] value - Retrieved value - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Controls visibility of pools between devices - * - * \param[in] pool - The pool being modified - * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. - * \param[in] count - Number of descriptors in the map array. 
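An illustrative sketch of granting a peer device access to another device's default pool with ::cuMemPoolSetAccess as documented above (the function name is an assumption; both CUdevices are assumed valid and peer access supported, error checking omitted):

#include <cuda.h>

/* Sketch: let `peerDev` read and write allocations from `dev`'s default pool. */
void grant_pool_access(CUdevice dev, CUdevice peerDev)
{
    CUmemoryPool pool;
    cuDeviceGetDefaultMemPool(&pool, dev);

    CUmemAccessDesc desc = {0};
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = peerDev;
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemPoolSetAccess(pool, &desc, 1);

    /* The current accessibility can be read back with cuMemPoolGetAccess. */
    CUmemAccess_flags flags;
    CUmemLocation loc = desc.location;
    cuMemPoolGetAccess(&flags, pool, &loc);
}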
- * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); - -/** - * \brief Returns the accessibility of a pool from a device - * - * Returns the accessibility of the pool's memory from the specified location. - * - * \param[out] flags - the accessibility of the pool from the specified location - * \param[in] memPool - the pool being queried - * \param[in] location - the location accessing the pool - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); - -/** - * \brief Creates a memory pool - * - * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines - * the properties of the pool such as the backing device and IPC capabilities. - * - * By default, the pool's memory will be accessible from the device it is allocated on. - * - * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NOT_SUPPORTED - * - * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, - * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle - */ -CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); - -/** - * \brief Destroys the specified memory pool - * - * If any pointers obtained from this pool haven't been freed or - * the pool has free operations that haven't completed - * when ::cuMemPoolDestroy is invoked, the function will return immediately and the - * resources associated with the pool will be released automatically - * once there are no more outstanding allocations. - * - * Destroying the current mempool of a device sets the default mempool of - * that device as the current mempool for that device. - * - * \note A device's default memory pool cannot be destroyed. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); - -/** - * \brief Allocates memory from a specified pool with stream ordered semantics. - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the specified memory pool. - * - * \note - * - The specified memory pool may be from a device different than that of the specified \p hStream. - * - * - Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. - * - * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. 
The memory pool's properties - * are used to set the node's creation parameters. - * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] pool - The pool to allocate from - * \param[in] hStream - The stream establishing the stream ordering semantic - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, - * ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - -/** - * \brief Exports a memory pool to the requested handle type. - * - * Given an IPC capable mempool, create an OS handle to share the pool with another process. - * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. - * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. - * The implementation of what the shareable handle is and how it can be transferred is defined by the requested - * handle type. - * - * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. - * - * \param[out] handle_out - Returned OS handle - * \param[in] pool - pool to export - * \param[in] handleType - the type of handle to create - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, - * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** - * \brief imports a memory pool from a shared handle. - * - * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. - * - * \note Imported memory pools do not support creating new allocations. - * As such imported memory pools may not be used in cuDeviceSetMemPool - * or ::cuMemAllocFromPoolAsync calls. - * - * \param[out] pool_out - Returned memory pool - * \param[in] handle - OS handle of the pool to open - * \param[in] handleType - The type of handle being imported - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolImportFromShareableHandle( - CUmemoryPool *pool_out, - void *handle, - CUmemAllocationHandleType handleType, - unsigned long long flags); - -/** - * \brief Export data to share a memory pool allocation between processes. - * - * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. - * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. 
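Taken together, the pool sharing entry points above compose roughly as follows. This is a hedged sketch only: the transport of the file descriptor and export data between processes (for example over a Unix-domain socket) is outside the CUDA API and only indicated by comments, the function names are illustrative, and error checking is omitted.

#include <cuda.h>
#include <stdint.h>

/* Exporting process: create an IPC-capable pool, allocate from it, and export
 * both the pool and a specific allocation. */
void exporter(CUdevice dev, CUstream stream)
{
    CUmemPoolProps props = {0};
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = dev;

    CUmemoryPool pool;
    cuMemPoolCreate(&pool, &props);

    int fd;  /* POSIX file descriptor form of the shareable handle */
    cuMemPoolExportToShareableHandle(&fd, pool, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);

    CUdeviceptr ptr;
    cuMemAllocFromPoolAsync(&ptr, 1 << 20, pool, stream);

    CUmemPoolPtrExportData shareData;
    cuMemPoolExportPointer(&shareData, ptr);
    /* ... send `fd` and `shareData` to the importing process ... */
}

/* Importing process: rebuild the pool from the received fd, then map the
 * exported allocation into this process. */
void importer(int fd, CUmemPoolPtrExportData *shareData)
{
    CUmemoryPool pool;
    cuMemPoolImportFromShareableHandle(&pool, (void *)(uintptr_t)fd,
                                       CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);

    CUdeviceptr ptr;
    cuMemPoolImportPointer(&ptr, pool, shareData);
    /* ... use ptr; free it in every importer before the exporter frees it ... */
}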
- * The data is not a handle and may be shared through any IPC mechanism. - * - * \param[out] shareData_out - Returned export data - * \param[in] ptr - pointer to memory being exported - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); - -/** - * \brief Import a memory pool allocation from another process. - * - * Returns in \p ptr_out a pointer to the imported memory. - * The imported memory must not be accessed before the allocation operation completes - * in the exporting process. The imported memory must be freed from all importing processes before - * being freed in the exporting process. The pointer may be freed with cuMemFree - * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed - * on the importing process before the free operation on the exporting process. - * - * \note The cuMemFreeAsync api may be used in the exporting process before - * the cuMemFreeAsync operation completes in its stream as long as the - * cuMemFreeAsync in the exporting process specifies a stream with - * a stream dependency on the importing process's cuMemFreeAsync. - * - * \param[out] ptr_out - pointer to imported memory - * \param[in] pool - pool from which to import - * \param[in] shareData - data specifying the memory to import - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer - */ -CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); - -/** @} */ /* END CUDA_MALLOC_ASYNC */ - -/** - * \defgroup CUDA_UNIFIED Unified Addressing - * - * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the unified addressing functions of the - * low-level CUDA driver application programming interface. - * - * @{ - * - * \section CUDA_UNIFIED_overview Overview - * - * CUDA devices can share a unified address space with the host. - * For these devices there is no distinction between a device - * pointer and a host pointer -- the same pointer value may be - * used to access memory from the host program and from a kernel - * running on the device (with exceptions enumerated below). - * - * \section CUDA_UNIFIED_support Supported Platforms - * - * Whether or not a device supports unified addressing may be - * queried by calling ::cuDeviceGetAttribute() with the device - * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. - * - * Unified addressing is automatically enabled in 64-bit processes - * - * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values - * - * It is possible to look up information about the memory which backs a - * pointer value. For instance, one may want to know if a pointer points - * to host or device memory. As another example, in the case of device - * memory, one may want to know on which CUDA device the memory - * resides. 
These properties may be queried using the function - * ::cuPointerGetAttribute() - * - * Since pointers are unique, it is not necessary to specify information - * about the pointers specified to the various copy functions in the - * CUDA API. The function ::cuMemcpy() may be used to perform a copy - * between two pointers, ignoring whether they point to host or device - * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() - * unnecessary for devices supporting unified addressing). For - * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be - * used to specify that the CUDA driver should infer the location of the - * pointer from its value. - * - * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory - * - * All host memory allocated in all contexts using ::cuMemAllocHost() and - * ::cuMemHostAlloc() is always directly accessible from all contexts on - * all devices that support unified addressing. This is the case regardless - * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and - * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. - * - * The pointer value through which allocated host memory may be accessed - * in kernels on all devices that support unified addressing is the same - * as the pointer value through which that memory is accessed on the host, - * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device - * pointer for these allocations. - * - * Note that this is not the case for memory allocated using the flag - * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. - * - * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory - * - * Upon enabling direct access from a context that supports unified addressing - * to another peer context that supports unified addressing using - * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using - * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible - * by the current context. The device pointer value through - * which any peer memory may be accessed in the current context - * is the same pointer value through which that memory may be - * accessed in the peer context. - * - * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing - * - * Not all memory may be accessed on devices through the same pointer - * value through which they are accessed on the host. These exceptions - * are host memory registered using ::cuMemHostRegister() and host memory - * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these - * exceptions, there exists a distinct host and device address for the - * memory. The device address is guaranteed to not overlap any valid host - * pointer range and is guaranteed to have the same value across all - * contexts that support unified addressing. - * - * This device address may be queried using ::cuMemHostGetDevicePointer() - * when a context using unified addressing is current. Either the host - * or the unified device pointer value may be used to refer to this memory - * through ::cuMemcpy() and similar functions using the - * ::CU_MEMORYTYPE_UNIFIED memory type. - * - */ - -/** - * \brief Returns information about a pointer - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT: - * - * Returns in \p *data the ::CUcontext in which \p ptr was allocated or - * registered. - * The type of \p data must be ::CUcontext *. 
- * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: - * - * Returns in \p *data the physical memory type of the memory that - * \p ptr addresses as a ::CUmemorytype enumerated value. - * The type of \p data must be unsigned int. - * - * If \p ptr addresses device memory then \p *data is set to - * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the - * memory resides is the ::CUdevice of the ::CUcontext returned by the - * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. - * - * If \p ptr addresses host memory then \p *data is set to - * ::CU_MEMORYTYPE_HOST. - * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If the current ::CUcontext does not support unified virtual - * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: - * - * Returns in \p *data the device pointer value through which - * \p ptr may be accessed by kernels running in the current - * ::CUcontext. - * The type of \p data must be CUdeviceptr *. - * - * If there exists no device pointer value through which - * kernels running in the current ::CUcontext may access - * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If there is no current ::CUcontext then - * ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: - * - * Returns in \p *data the host pointer value through which - * \p ptr may be accessed by by the host program. - * The type of \p data must be void **. - * If there exists no host pointer value through which - * the host program may directly access \p ptr then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: - * - * Returns in \p *data two tokens for use with the nv-p2p.h Linux - * kernel interface. \p data must be a struct of type - * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. - * - * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). - * Note that p2pToken and vaSpaceToken are only valid for the - * lifetime of the source allocation. A subsequent allocation at - * the same address may return completely different tokens. - * Querying this attribute has a side effect of setting the attribute - * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that - * \p ptr points to. - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute which when set, ensures that synchronous memory operations - * initiated on the region of memory that \p ptr points to will always synchronize. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: - * - * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. - * \p data must point to an unsigned long long. - * - * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. 
- * Every memory allocation from any of the CUDA memory allocation APIs will - * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs - * from previous freed allocations. IDs are only unique within a single process. - * - * - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: - * - * Returns in \p *data a boolean that indicates whether the pointer points to - * managed memory or not. - * - * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: - * - * Returns in \p *data an integer representing a device ordinal of a device against - * which the memory was allocated or registered. - * - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: - * - * Returns in \p *data a boolean that indicates if this pointer maps to - * an allocation that is suitable for ::cudaIpcGetMemHandle. - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: - * - * Returns in \p *data the starting address for the allocation referenced - * by the device pointer \p ptr. Note that this is not necessarily the - * address of the mapped region, but the address of the mappable address - * range \p ptr references (e.g. from ::cuMemAddressReserve). - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: - * - * Returns in \p *data the size for the allocation referenced by the device - * pointer \p ptr. Note that this is not necessarily the size of the mapped - * region, but the size of the mappable address range \p ptr references - * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped - * region, see ::cuMemGetAddressRange - * - * - ::CU_POINTER_ATTRIBUTE_MAPPED: - * - * Returns in \p *data a boolean that indicates if this pointer is in a - * valid address range that is mapped to a backing allocation. - * - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: - * - * Returns a bitmask of the allowed handle types for an allocation that may - * be passed to ::cuMemExportToShareableHandle. - * - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: - * - * Returns in \p *data the handle to the mempool that the allocation was obtained from. - * - * \par - * - * Note that for most allocations in the unified virtual address space - * the host and device pointer for accessing the allocation will be the - * same. The exceptions to this are - * - user memory registered using ::cuMemHostRegister - * - host memory allocated using ::cuMemHostAlloc with the - * ::CU_MEMHOSTALLOC_WRITECOMBINED flag - * For these types of allocation there will exist separate, disjoint host - * and device addresses for accessing the allocation. In particular - * - The host address will correspond to an invalid unmapped device address - * (which will result in an exception if accessed from the device) - * - The device address will correspond to an invalid unmapped host address - * (which will result in an exception if accessed from the host). - * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host - * and device addresses from either address. 
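A short, illustrative sketch of querying a few of the attributes listed above with ::cuPointerGetAttribute (`ptr` is assumed to come from a CUDA allocation API; error checking omitted):

#include <cuda.h>
#include <stdio.h>

/* Sketch: classify a pointer via individual attribute queries. */
void describe_pointer(CUdeviceptr ptr)
{
    unsigned int memType = 0;            /* CU_MEMORYTYPE_DEVICE or CU_MEMORYTYPE_HOST */
    cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);

    int isManaged = 0;                   /* boolean: managed memory or not */
    cuPointerGetAttribute(&isManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, ptr);

    unsigned long long bufferId = 0;     /* unique within the process */
    cuPointerGetAttribute(&bufferId, CU_POINTER_ATTRIBUTE_BUFFER_ID, ptr);

    printf("type=%u managed=%d id=%llu\n", memType, isManaged, bufferId);
}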
- * - * \param data - Returned pointer attribute value - * \param attribute - Pointer attribute to query - * \param ptr - Pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerSetAttribute, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Prefetches memory to the specified destination device - * - * Prefetches memory to the specified destination device. \p devPtr is the - * base device pointer of the memory to be prefetched and \p dstDevice is the - * destination device. \p count specifies the number of bytes to copy. \p hStream - * is the stream in which the operation is enqueued. The memory range must refer - * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. - * - * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If - * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - * must be non-zero. Additionally, \p hStream must be associated with a device that has a - * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * - * The start address and end address of the memory range will be rounded down and rounded up - * respectively to be aligned to CPU page size before the prefetch operation is enqueued - * in the stream. - * - * If no physical memory has been allocated for this region, then this memory region - * will be populated and mapped on the destination device. If there's insufficient - * memory to prefetch the desired region, the Unified Memory driver may evict pages from other - * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory - * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. - * - * By default, any mappings to the previous location of the migrated pages are removed and - * mappings for the new location are only setup on \p dstDevice. The exact behavior however - * also depends on the settings applied to this memory range via ::cuMemAdvise as described - * below: - * - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, - * then that subset will create a read-only copy of the pages on \p dstDevice. - * - * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory - * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the - * preferred location of any pages in the memory range. - * - * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, - * then mappings to those pages from all the appropriate processors are updated to - * refer to the new location if establishing such a mapping is possible. Otherwise, - * those mappings are cleared. - * - * Note that this API is not required for functionality and only serves to improve performance - * by allowing the application to migrate data to a suitable location before it is accessed. - * Memory accesses to this range are always coherent and are allowed even when the data is - * actively being migrated. 
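An illustrative staging sketch combining ::cuMemAdvise and ::cuMemPrefetchAsync as described above (assumes `dev` reports a non-zero ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and `stream` is valid; the function name is illustrative and error checking is omitted):

#include <cuda.h>

/* Sketch: advise and prefetch a managed range before device access,
 * then bring it back to host memory. */
void stage_managed_range(CUdevice dev, CUstream stream, size_t bytes)
{
    CUdeviceptr managed;
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
    cuMemPrefetchAsync(managed, bytes, dev, stream);            /* migrate toward the GPU */
    /* ... launch work in `stream` that reads/writes `managed` ... */
    cuMemPrefetchAsync(managed, bytes, CU_DEVICE_CPU, stream);  /* back to host memory */
    cuStreamSynchronize(stream);

    cuMemFree(managed);
}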
- * - * Note that this function is asynchronous with respect to the host and all work - * on other devices. - * - * \param devPtr - Pointer to be prefetched - * \param count - Size in bytes - * \param dstDevice - Destination device to prefetch to - * \param hStream - Stream to enqueue prefetch operation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, - * ::cudaMemPrefetchAsync - */ -CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - -/** - * \brief Advise about the usage of a given memory range - * - * Advise the Unified Memory subsystem about the usage pattern for the memory range - * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory - * range will be rounded down and rounded up respectively to be aligned to CPU page size before the - * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged - * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable - * memory provided it represents a valid, host-accessible region of memory and all additional constraints - * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable - * memory range results in an error being returned. - * - * The \p advice parameter can take the following values: - * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - * from and only occasionally written to. Any read accesses from any processor to this region will create a - * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync - * is called on this region, it will create a read-only copy of the data on the destination processor. - * If any processor writes to this region, all copies of the corresponding page will be invalidated - * except for the one where the write occurred. The \p device argument is ignored for this advice. - * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU - * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * Also, if a context is created on a device that does not have the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until - * all such contexts are destroyed. - * If the memory region refers to valid system-allocated pageable memory, then the accessing device must - * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only - * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice - * will not create a read-only copy when that device accesses this memory region. - * - * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the - * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated - * copies of the data will be collapsed into a single copy. 
The location for the collapsed - * copy will be the preferred location if the page has a preferred location and one of the read-duplicated - * copies was resident at that location. Otherwise, the location chosen is arbitrary. - * - * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location - * does not cause data to migrate to that location immediately. Instead, it guides the migration policy - * when a fault occurs on that memory region. If the data is already in its preferred location and the - * faulting processor can establish a mapping without requiring the data to be migrated, then - * data migration will be avoided. On the other hand, if the data is not in its preferred location - * or if a direct mapping cannot be established, then it will be migrated to the processor accessing - * it. It is important to note that setting the preferred location does not prevent data prefetching - * done using ::cuMemPrefetchAsync. - * Having a preferred location can override the page thrash detection and resolution logic in the Unified - * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device - * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But - * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice, unless read accesses from - * \p device will not result in a read-only copy being created on that device as outlined in description for - * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. Note however that this behavior may change in the future. - * - * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - * and changes the preferred location to none. - * - * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then - * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. - * This advice does not cause data migration and has no impact on the location of the data per se. Instead, - * it causes the data to always be mapped in the specified processor's page tables, as long as the - * location of the data permits a mapping to be established. If the data gets migrated for any reason, - * the mappings are updated accordingly. - * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. 
- * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data - * over to the other GPUs is not as important because the accesses are infrequent and the overhead of - * migration may be too high. But preventing faults can still help improve performance, and so having - * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the - * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - * page in host memory. - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice. Additionally, if the - * preferred location of this memory region or any subset of it is also \p device, then the policies - * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. - * - * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to - * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. - * - * \param devPtr - Pointer to memory to set the advice for - * \param count - Size in bytes of the memory range - * \param advice - Advice to be applied for the specified memory range - * \param device - Device to apply the advice for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, - * ::cudaMemAdvise - */ -CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); - -/** - * \brief Query an attribute of a given memory range - * - * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. - * - * The \p attribute parameter can take the following values: - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted - * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given - * memory range have read-duplication enabled, or 0 otherwise. - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. 
The result returned will be a GPU device - * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU - * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID - * if either all the pages don't have the same preferred location or some of the pages don't have a - * preferred location at all. Note that the actual location of the pages in the memory range at the time of - * the query may be different from the preferred location. - * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted - * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned - * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. - * If any device does not have that advice set for the entire memory range, that device will not be included. - * If \p data is larger than the number of devices that have that advice set for that memory range, - * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 - * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be - * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have - * that advice set, then only as many devices will be returned as can fit in the array. There is no - * guarantee on which specific devices will be returned, however. - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location - * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be - * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU - * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not - * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the - * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to - * whether the prefetch operation to that location has completed or even begun. - * - * \param data - A pointers to a memory location where the result - * of each attribute query will be written to. - * \param dataSize - Array containing the size of data - * \param attribute - The attribute to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, - * ::cuMemAdvise, - * ::cudaMemRangeGetAttribute - */ -CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); - -/** - * \brief Query attributes of a given memory range. - * - * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes - * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. 
- * The results of the query will be stored in \p data. - * - * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for - * attribute descriptions and restrictions. - * - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION - * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION - * - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. - * \param dataSizes - Array containing the sizes of each result - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param numAttributes - Number of attributes to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, - * ::cuMemPrefetchAsync, - * ::cudaMemRangeGetAttributes - */ -CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); - -/** - * \brief Set attributes on a previously allocated memory region - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute that can either be set (1) or unset (0). When set, - * the region of memory that \p ptr points to is guaranteed to always synchronize - * memory operations that are synchronous. If there are some previously initiated - * synchronous memory operations that are pending when this attribute is set, the - * function does not return until those memory operations are complete. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. - * - * \param value - Pointer to memory containing the value to be set - * \param attribute - Pointer attribute to set - * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuPointerGetAttribute, - * ::cuPointerGetAttributes, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister - */ -CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Returns information about a pointer. 
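A short usage sketch of the ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS attribute described above (illustrative; `ptr` is assumed to come from a CUDA allocation API):

#include <cuda.h>

/* Sketch: force synchronous memory operations on a region to always synchronize. */
void force_sync_memops(CUdeviceptr ptr)
{
    unsigned int enable = 1;  /* 1 = set, 0 = unset */
    cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
}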
- * - * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE - * - ::CU_POINTER_ATTRIBUTE_MAPPED - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE - * - * \param numAttributes - Number of attributes to query - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. - * \param ptr - Pointer to query - * - * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr - * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values - * and CUDA_SUCCESS is returned. - * - * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA - * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerGetAttribute, - * ::cuPointerSetAttribute, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); - -/** @} */ /* END CUDA_UNIFIED */ - -/** - * \defgroup CUDA_STREAM Stream Management - * - * ___MANBRIEF___ stream management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Create a stream - * - * Creates a stream and returns a handle in \p phStream. The \p Flags argument - * determines behaviors of the stream. - * - * Valid values for \p Flags are: - * - ::CU_STREAM_DEFAULT: Default stream creation flag. - * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created - * stream may run concurrently with work in stream 0 (the NULL stream), and that - * the created stream should perform no implicit synchronization with stream 0. 
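An illustrative sketch of creating and using a non-blocking stream with the flags listed above (`dst` is assumed to be a valid device allocation; error checking omitted):

#include <cuda.h>

/* Sketch: create a non-blocking stream, use it, and destroy it. */
void run_in_private_stream(CUdeviceptr dst, size_t bytes)
{
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);  /* no implicit sync with the NULL stream */
    cuMemsetD8Async(dst, 0xff, bytes, stream);
    cuStreamSynchronize(stream);
    cuStreamDestroy(stream);
}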
- * - * \param phStream - Returned newly created stream - * \param Flags - Parameters for stream creation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); - -/** - * \brief Create a stream with the given priority - * - * Creates a stream with the specified priority and returns a handle in \p phStream. - * This API alters the scheduler priority of work in the stream. Work in a higher - * priority stream may preempt work already executing in a low priority stream. - * - * \p priority follows a convention where lower numbers represent higher priorities. - * '0' represents default priority. The range of meaningful numerical priorities can - * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is - * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * it will automatically be clamped to the lowest or the highest number in the range. - * - * \param phStream - Returned newly created stream - * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of - * valid flags - * \param priority - Stream priority. Lower numbers represent higher priorities. - * See ::cuCtxGetStreamPriorityRange for more information about - * meaningful stream priorities that can be passed. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \note Stream priorities are supported only on GPUs - * with compute capability 3.5 or higher. - * - * \note In the current implementation, only compute kernels launched in - * priority streams are affected by the stream's priority. Stream priorities have - * no effect on host-to-device and device-to-host memory operations. - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreateWithPriority - */ -CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); - - -/** - * \brief Query the priority of a given stream - * - * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the priority in \p priority. Note that if the stream was created with a - * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * this function returns the clamped priority. - * See ::cuStreamCreateWithPriority for details about priority clamping. 
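A sketch of the priority workflow described above: query the meaningful range, create a stream at the highest priority, and read the value back (assumes a current context; error checking omitted):

#include <cuda.h>

/* Sketch: highest-priority stream creation. Lower numbers mean higher priority. */
CUstream create_high_priority_stream(void)
{
    int least, greatest;
    cuCtxGetStreamPriorityRange(&least, &greatest);

    CUstream stream;
    cuStreamCreateWithPriority(&stream, CU_STREAM_NON_BLOCKING, greatest);

    int actual;
    cuStreamGetPriority(stream, &actual);  /* reads back the (possibly clamped) value */
    return stream;
}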
- * - * \param hStream - Handle to the stream to be queried - * \param priority - Pointer to a signed integer in which the stream's priority is returned - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamCreateWithPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cudaStreamGetPriority - */ -CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - -/** - * \brief Query the flags of a given stream - * - * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the flags in \p flags. - * - * \param hStream - Handle to the stream to be queried - * \param flags - Pointer to an unsigned integer in which the stream's flags are returned - * The value returned in \p flags is a logical 'OR' of all flags that - * were used while creating this stream. See ::cuStreamCreate for the list - * of valid flags - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cudaStreamGetFlags - */ -CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - -/** - * \brief Query the context associated with a stream - * - * Returns the CUDA context that the stream is associated with. - * - * The stream handle \p hStream can refer to any of the following: - *
- * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
- *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
- *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
- *   The returned context is the context that was active in the calling thread when the
- *   stream was created. Passing an invalid handle will result in undefined behavior.
- * - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
- *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
- *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
- *   Specifying any of the special handles will return the context current to the
- *   calling thread. If no context is current to the calling thread,
- *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
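An illustrative sketch covering both kinds of handles listed above (assumes a current context; error checking omitted):

#include <cuda.h>

/* Sketch: query the context behind an explicitly created stream and behind
 * the legacy default stream. */
void inspect_stream_contexts(void)
{
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_DEFAULT);

    CUcontext ctxOfStream, ctxOfLegacy;
    cuStreamGetCtx(stream, &ctxOfStream);            /* context active when the stream was created */
    cuStreamGetCtx(CU_STREAM_LEGACY, &ctxOfLegacy);  /* context current to the calling thread */

    cuStreamDestroy(stream);
}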
- * - * \param hStream - Handle to the stream to be queried - * \param pctx - Returned context associated with the stream - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - -/** - * \brief Make a compute stream wait on an event - * - * Makes all future work submitted to \p hStream wait for all work captured in - * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. - * The synchronization will be performed efficiently on the device when applicable. - * \p hEvent may be from a different context or device than \p hStream. - * - * flags include: - * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. - * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture. - * - * \param hStream - Stream to wait - * \param hEvent - Event to wait on (may not be NULL) - * \param Flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuEventRecord, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cuStreamDestroy, - * ::cudaStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - -/** - * \brief Add a callback to a compute stream - * - * \note This function is slated for eventual deprecation and removal. If - * you do not require the callback to execute in case of a device error, - * consider using ::cuLaunchHostFunc. Additionally, this function is not - * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike - * ::cuLaunchHostFunc. - * - * Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each - * cuStreamAddCallback call, the callback will be executed exactly once. - * The callback will block later work in the stream until it is finished. - * - * The callback may be passed ::CUDA_SUCCESS or an error code. In the event - * of a device error, all subsequently executed callbacks will receive an - * appropriate ::CUresult. - * - * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API - * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any - * synchronization that may depend on outstanding device work or other callbacks - * that are not mandated to run earlier. Callbacks without a mandated order - * (in independent streams) execute in undefined order and may be serialized. - * - * For the purposes of Unified Memory, callback execution makes a number of - * guarantees: - *
- *
- *  - The callback stream is considered idle for the duration of the
- *    callback. Thus, for example, a callback may always use memory attached
- *    to the callback stream.
- *
- *  - The start of execution of a callback has the same effect as
- *    synchronizing an event recorded in the same stream immediately prior to
- *    the callback. It thus synchronizes streams which have been "joined"
- *    prior to the callback.
- *
- *  - Adding device work to any stream does not have the effect of making
- *    the stream active until all preceding host functions and stream callbacks
- *    have executed. Thus, for example, a callback might use global attached
- *    memory even if work has been added to another stream, if the work has
- *    been ordered behind the callback with an event.
- *
- *  - Completion of a callback does not cause a stream to become
- *    active except as described above. The callback stream will remain idle
- *    if no device work follows the callback, and will remain idle across
- *    consecutive callbacks without device work in between. Thus, for example,
- *    stream synchronization can be done by signaling from a callback at the
- *    end of the stream.
- *
- * - * \param hStream - Stream to add callback to - * \param callback - The function to call once preceding stream operations are complete - * \param userData - User specified data to be passed to the callback function - * \param flags - Reserved for future use, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuStreamLaunchHostFunc, - * ::cudaStreamAddCallback - */ -CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - -/** - * \brief Begins graph capture on a stream - * - * Begin graph capture on \p hStream. When a stream is in capture mode, all operations - * pushed into the stream will not be executed, but will instead be captured into - * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated - * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which - * it was initiated, and it may only be initiated if the stream is not already in capture - * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id - * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. - * - * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be - * called on this stream from the same thread. - * - * \param hStream - Stream in which to initiate capture - * \param mode - Controls the interaction of this capture sequence with other API - * calls that are potentially unsafe. For more details see - * ::cuThreadExchangeStreamCaptureMode. - * - * \note Kernels captured using this API must not use texture and surface references. - * Reading or writing through any texture or surface reference is undefined - * behavior. This restriction does not apply to texture and surface objects. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamIsCapturing, - * ::cuStreamEndCapture, - * ::cuThreadExchangeStreamCaptureMode - */ -CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); - -/** - * \brief Swaps the stream capture interaction mode for a thread - * - * Sets the calling thread's stream capture interaction mode to the value contained - * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To - * facilitate deterministic behavior across function or module boundaries, callers - * are encouraged to use this API in a push-pop fashion: \code - CUstreamCaptureMode mode = desiredMode; - cuThreadExchangeStreamCaptureMode(&mode); - ... - cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode - * \endcode - * - * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call - * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is - * not enqueued asynchronously to a stream, and is not observed by stream capture. - * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture - * depended on the allocation being replayed whenever the graph is launched, the - * captured graph would be invalid. 
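[Editor's note] For orientation, a minimal sketch of the ::cuStreamAddCallback usage described above; the names onStreamDone and enqueueNotification are illustrative and not part of the header, error handling is omitted, and the header itself recommends ::cuLaunchHostFunc for new code.

#include <cuda.h>
#include <stdio.h>

/* Runs on a driver thread once all previously enqueued work in hStream has
 * completed. Callbacks must not call back into the CUDA API. */
static void CUDA_CB onStreamDone(CUstream hStream, CUresult status, void *userData) {
    printf("stream finished, status=%d, tag=%s\n", (int)status, (const char *)userData);
}

static void enqueueNotification(CUstream hStream) {
    /* flags is reserved and must be 0; userData is passed through unchanged. */
    cuStreamAddCallback(hStream, onStreamDone, (void *)"batch-42", 0);
}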
- * - * Therefore, stream capture places restrictions on API calls that can be made within - * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This - * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. - * - * A thread's mode is one of the following: - * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has - * an ongoing capture sequence that was not initiated with - * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread - * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, - * this thread is prohibited from potentially unsafe API calls. - * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture - * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited - * from potentially unsafe API calls. Concurrent capture sequences in other threads - * are ignored. - * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially - * unsafe API calls. Note that the thread is still prohibited from API calls which - * necessarily conflict with stream capture, for example, attempting ::cuEventQuery - * on an event that was last recorded inside a capture sequence. - * - * \param mode - Pointer to mode value to swap with the current mode - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamBeginCapture - */ -CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); - -/** - * \brief Ends capture on a stream, returning the captured graph - * - * End capture on \p hStream, returning the captured graph via \p phGraph. - * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. - * If capture was invalidated, due to a violation of the rules of stream capture, then - * a NULL graph will be returned. - * - * If the \p mode argument to ::cuStreamBeginCapture was not - * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as - * ::cuStreamBeginCapture. - * - * \param hStream - Stream to query - * \param phGraph - The captured graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - -/** - * \brief Returns a stream's capture status - * - * Return the capture status of \p hStream via \p captureStatus. After a successful - * call, \p *captureStatus will contain one of the following: - * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. - * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. - * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error - * has invalidated the capture sequence. The capture sequence must be terminated - * with ::cuStreamEndCapture on the stream where it was initiated in order to - * continue using \p hStream. - * - * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while - * a blocking stream in the same context is capturing, it will return - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified - * after the call. The blocking stream capture is not invalidated. 
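[Editor's note] A rough usage sketch of the begin/end capture pair documented above, assuming a non-default stream and an already-loaded CUfunction; error checking omitted and captureOneLaunch is an illustrative name.

#include <cuda.h>

/* Record one kernel launch into a graph instead of executing it. */
static CUgraph captureOneLaunch(CUstream hStream, CUfunction f) {
    CUgraph graph = NULL;
    cuStreamBeginCapture(hStream, CU_STREAM_CAPTURE_MODE_GLOBAL);
    /* While capture is active, launches are recorded into the graph, not run. */
    cuLaunchKernel(f, /*grid*/ 1, 1, 1, /*block*/ 32, 1, 1,
                   /*sharedMemBytes*/ 0, hStream, /*kernelParams*/ NULL, /*extra*/ NULL);
    /* Must be ended on the stream that began the capture (and, for non-relaxed
     * modes, from the same thread); graph is NULL if capture was invalidated. */
    cuStreamEndCapture(hStream, &graph);
    return graph;
}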
- * - * When a blocking stream is capturing, the legacy stream is in an - * unusable state until the blocking stream capture is terminated. The legacy - * stream is not supported for stream capture, but attempted use would have an - * implicit dependency on the capturing stream(s). - * - * \param hStream - Stream to query - * \param captureStatus - Returns the stream's capture status - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamEndCapture - */ -CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - -/** - * \brief Query capture status of a stream - * - * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Query the capture status of a stream and and get an id for - * the capture sequence, which is unique over the lifetime of the process. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. - * - * A valid id is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo_v2, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - -/** - * \brief Query a stream's capture state (11.3+) - * - * Query stream state related to stream capture. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. - * - * Valid data (other than capture status) is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the - * previous version in 12.0. Developers requiring compatibility across minor versions to - * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback - * path. - * - * \param hStream - The stream to query - * \param captureStatus_out - Location to return the capture status of the stream; required - * \param id_out - Optional location to return an id for the capture sequence, which is - * unique over the lifetime of the process - * \param graph_out - Optional location to return the graph being captured into. All - * operations other than destroy and node removal are permitted on the graph - * while the capture sequence is in progress. This API does not transfer - * ownership of the graph, which is transferred or destroyed at - * ::cuStreamEndCapture. Note that the graph handle may be invalidated before - * end of capture for certain errors. Nodes that are or become - * unreachable from the original stream at ::cuStreamEndCapture due to direct - * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. - * \param dependencies_out - Optional location to store a pointer to an array of nodes. 
- * The next node to be captured in the stream will depend on this set of nodes, - * absent operations such as event wait which modify this set. The array pointer - * is valid until the next API call which operates on the stream or until end of - * capture. The node handles may be copied out and are valid until they or the - * graph is destroyed. The driver-owned array may also be passed directly to - * APIs that operate on the graph (not the stream) without copying. - * \param numDependencies_out - Optional location to store the size of the array - * returned in dependencies_out. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing, - * ::cuStreamUpdateCaptureDependencies - */ -CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, - cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - -/** - * \brief Update the set of dependencies in a capturing stream (11.3+) - * - * Modifies the dependency set of a capturing stream. The dependency set is the set - * of nodes that the next captured node in the stream will depend on. - * - * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and - * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to - * the API is added to the existing set or replaces it. A flags value of 0 defaults - * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. - * - * Nodes that are removed from the dependency set via this API do not result in - * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at - * ::cuStreamEndCapture. - * - * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. - * - * This API is new in CUDA 11.3. Developers requiring compatibility across minor - * versions to CUDA 11.0 should not use this API or provide a fallback. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_ILLEGAL_STATE - * - * \sa - * ::cuStreamBeginCapture, - * ::cuStreamGetCaptureInfo, - * ::cuStreamGetCaptureInfo_v2 - */ -CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); - -/** - * \brief Attach memory to a stream asynchronously - * - * Enqueues an operation in \p hStream to specify stream association of - * \p length bytes of memory starting from \p dptr. This function is a - * stream-ordered operation, meaning that it is dependent on, and will - * only take effect when, previous work in stream has completed. Any - * previous association is automatically replaced. - * - * \p dptr must point to one of the following types of memories: - * - managed memory declared using the __managed__ keyword or allocated with - * ::cuMemAllocManaged. - * - a valid host-accessible region of system-allocated pageable memory. This - * type of memory may only be specified if the device associated with the - * stream reports a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. - * - * For managed allocations, \p length must be either zero or the entire - * allocation's size. Both indicate that the entire allocation's stream - * association is being changed. Currently, it is not possible to change stream - * association for a portion of a managed allocation. 
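[Editor's note] A small sketch of the 11.3+ capture query described above; isActivelyCapturing is an illustrative helper, and the outputs other than the status are only meaningful when the call succeeds and the status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE.

#include <cuda.h>
#include <stddef.h>

/* Returns 1 if hStream is actively capturing, 0 otherwise. */
static int isActivelyCapturing(CUstream hStream) {
    CUstreamCaptureStatus status;
    cuuint64_t id;
    CUgraph graph;
    const CUgraphNode *deps;
    size_t numDeps;
    if (cuStreamGetCaptureInfo_v2(hStream, &status, &id, &graph,
                                  &deps, &numDeps) != CUDA_SUCCESS)
        return 0;
    return status == CU_STREAM_CAPTURE_STATUS_ACTIVE;
}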
- * - * For pageable host allocations, \p length must be non-zero. - * - * The stream association is specified using \p flags which must be - * one of ::CUmemAttach_flags. - * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed - * by any stream on any device. - * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee - * that it won't access the memory on the device from any stream on a device that - * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with - * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, - * the program makes a guarantee that it will only access the memory on the device - * from \p hStream. It is illegal to attach singly to the NULL stream, because the - * NULL stream is a virtual global stream and not a specific stream. An error will - * be returned in this case. - * - * When memory is associated with a single stream, the Unified Memory system will - * allow CPU access to this memory region so long as all operations in \p hStream - * have completed, regardless of whether other streams are active. In effect, - * this constrains exclusive ownership of the managed memory region by - * an active GPU to per-stream activity instead of whole-GPU activity. - * - * Accessing memory on the device from streams that are not associated with - * it will produce undefined results. No error checking is performed by the - * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. - * - * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync - * via events, synchronization or other means to ensure legal access to memory - * at all times. Data visibility and coherency will be changed appropriately - * for all kernels which follow a stream-association change. - * - * If \p hStream is destroyed while data is associated with it, the association is - * removed and the association reverts to the default visibility of the allocation - * as specified at ::cuMemAllocManaged. For __managed__ variables, the default - * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an - * asynchronous operation, and as a result, the change to default association won't - * happen until all work in the stream has completed. - * - * \param hStream - Stream in which to enqueue the attach operation - * \param dptr - Pointer to memory (must be a pointer to managed memory or - * to a valid host-accessible region of system-allocated - * pageable memory) - * \param length - Length of memory - * \param flags - Must be one of ::CUmemAttach_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cudaStreamAttachMemAsync - */ -CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - -/** - * \brief Determine status of a compute stream - * - * Returns ::CUDA_SUCCESS if all operations in the stream specified by - * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. 
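[Editor's note] A minimal sketch of the managed-memory attach described above; allocForStream is an illustrative name and errors are ignored for brevity.

#include <cuda.h>
#include <stddef.h>

/* Allocate managed memory and restrict its stream association to hStream so
 * the CPU may touch it whenever that one stream is idle. */
static CUdeviceptr allocForStream(CUstream hStream, size_t bytes) {
    CUdeviceptr dptr = 0;
    cuMemAllocManaged(&dptr, bytes, CU_MEM_ATTACH_GLOBAL);
    /* length 0 means the entire allocation; this replaces any prior association. */
    cuStreamAttachMemAsync(hStream, dptr, 0, CU_MEM_ATTACH_SINGLE);
    return dptr;
}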
- * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuStreamSynchronize(). - * - * \param hStream - Stream to query status of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamQuery - */ -CUresult CUDAAPI cuStreamQuery(CUstream hStream); - -/** - * \brief Wait until a stream's tasks are completed - * - * Waits until the device has completed all operations in the stream specified - * by \p hStream. If the context was created with the - * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the - * stream is finished with all of its tasks. - * - * \param hStream - Stream to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamDestroy, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamAddCallback, - * ::cudaStreamSynchronize - */ -CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - -/** - * \brief Destroys a stream - * - * Destroys the stream specified by \p hStream. - * - * In case the device is still doing work in the stream \p hStream - * when ::cuStreamDestroy() is called, the function will return immediately - * and the resources associated with \p hStream will be released automatically - * once the device has completed all work in \p hStream. - * - * \param hStream - Stream to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamDestroy - */ -CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - -/** - * \brief Copies attributes from source stream to destination stream. - * - * Copies attributes from source stream \p src to destination stream \p dst. - * Both streams must have the same context. - * - * \param[out] dst Destination stream - * \param[in] src Source stream - * For list of attributes see ::CUstreamAttrID - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); - -/** - * \brief Queries stream attribute. - * - * Queries attribute \p attr from \p hStream and stores it in corresponding - * member of \p value_out. - * - * \param[in] hStream - * \param[in] attr - * \param[out] value_out - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, - CUstreamAttrValue *value_out); - -/** - * \brief Sets stream attribute. - * - * Sets attribute \p attr on \p hStream from corresponding attribute of - * \p value. The updated attribute will be applied to subsequent work - * submitted to the stream. It will not affect previously submitted work. 
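[Editor's note] A tiny illustrative helper combining the ::cuStreamQuery / ::cuStreamSynchronize pair above: poll first, block only if the stream is still busy. drainStream is not part of the header.

#include <cuda.h>

static void drainStream(CUstream hStream) {
    /* cuStreamQuery never blocks; CUDA_ERROR_NOT_READY just means "still busy". */
    if (cuStreamQuery(hStream) == CUDA_ERROR_NOT_READY) {
        cuStreamSynchronize(hStream);  /* block the calling thread until done */
    }
}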
- * - * \param[out] hStream - * \param[in] attr - * \param[in] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, - const CUstreamAttrValue *value); - -/** @} */ /* END CUDA_STREAM */ - - -/** - * \defgroup CUDA_EVENT Event Management - * - * ___MANBRIEF___ event management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the event management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates an event - * - * Creates an event *phEvent for the current context with the flags specified via - * \p Flags. Valid flags include: - * - ::CU_EVENT_DEFAULT: Default event creation flag. - * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking - * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on - * an event created with this flag will block until the event has actually - * been recorded. - * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need - * to record timing data. Events created with this flag specified and - * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best - * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). - * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an - * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must - * be specified along with ::CU_EVENT_DISABLE_TIMING. - * - * \param phEvent - Returns newly created event - * \param Flags - Event creation flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventCreate, - * ::cudaEventCreateWithFlags - */ -CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecord() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. 
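[Editor's note] A sketch of the event-based cross-stream ordering that ::cuEventCreate, ::cuEventRecord and ::cuStreamWaitEvent enable; joinStreams is an illustrative name and error handling is omitted.

#include <cuda.h>

/* Make streamB wait for everything currently enqueued in streamA, without
 * blocking the host. Timing is disabled since the event is only for ordering. */
static void joinStreams(CUstream streamA, CUstream streamB) {
    CUevent ev;
    cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
    cuEventRecord(ev, streamA);          /* snapshot streamA's current work */
    cuStreamWaitEvent(streamB, ev, 0);   /* 0 == CU_EVENT_WAIT_DEFAULT */
    cuEventDestroy(ev);                  /* released once the event completes */
}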
- * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventRecord, - * ::cuEventRecordWithFlags - */ -CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecordWithFlags() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. - * - * flags include: - * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. - * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture. - * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * \param flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cuEventRecord, - * ::cudaEventRecord - */ -CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - -/** - * \brief Queries an event's status - * - * Queries the status of all work currently captured by \p hEvent. See - * ::cuEventRecord() for details on what is captured by an event. - * - * Returns ::CUDA_SUCCESS if all captured work has been completed, or - * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. - * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuEventSynchronize(). - * - * \param hEvent - Event to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventQuery - */ -CUresult CUDAAPI cuEventQuery(CUevent hEvent); - -/** - * \brief Waits for an event to complete - * - * Waits until the completion of all work currently captured in \p hEvent. 
- * See ::cuEventRecord() for details on what is captured by an event. - * - * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC - * flag will cause the calling CPU thread to block until the event has - * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has - * not been set, then the CPU thread will busy-wait until the event has - * been completed by the device. - * - * \param hEvent - Event to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventSynchronize - */ -CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); - -/** - * \brief Destroys an event - * - * Destroys the event specified by \p hEvent. - * - * An event may be destroyed before it is complete (i.e., while - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the - * call does not block on completion of the event, and any associated - * resources will automatically be released asynchronously at completion. - * - * \param hEvent - Event to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventElapsedTime, - * ::cudaEventDestroy - */ -CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - -/** - * \brief Computes the elapsed time between two events - * - * Computes the elapsed time between two events (in milliseconds with a - * resolution of around 0.5 microseconds). - * - * If either event was last recorded in a non-NULL stream, the resulting time - * may be greater than expected (even if both used the same stream handle). This - * happens because the ::cuEventRecord() operation takes place asynchronously - * and there is no guarantee that the measured latency is actually just between - * the two events. Any number of other different stream operations could execute - * in between the two measured events, thus altering the timing in a significant - * way. - * - * If ::cuEventRecord() has not been called on either event then - * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called - * on both events but one or both of them has not yet been completed (that is, - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the - * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with - * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return - * ::CUDA_ERROR_INVALID_HANDLE. 
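[Editor's note] A sketch of GPU-side timing with ::cuEventElapsedTime as described above; timeRegion and enqueueWork are illustrative, both events must keep timing enabled, and errors are ignored for brevity.

#include <cuda.h>
#include <stdio.h>

/* Measure how long the work enqueued between the two records takes on the GPU. */
static void timeRegion(CUstream hStream, void (*enqueueWork)(CUstream)) {
    CUevent start, stop;
    float ms = 0.0f;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);
    cuEventRecord(start, hStream);
    enqueueWork(hStream);
    cuEventRecord(stop, hStream);
    cuEventSynchronize(stop);            /* both events must have completed */
    cuEventElapsedTime(&ms, start, stop);
    printf("region took %.3f ms\n", ms);
    cuEventDestroy(start);
    cuEventDestroy(stop);
}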
- * - * \param pMilliseconds - Time between \p hStart and \p hEnd in ms - * \param hStart - Starting event - * \param hEnd - Ending event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cudaEventElapsedTime - */ -CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); - -/** @} */ /* END CUDA_EVENT */ - -/** - * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability - * - * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the external resource interoperability functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - - /** - * \brief Imports an external memory object - * - * Imports an externally allocated memory object and returns - * a handle to that in \p extMem_out. - * - * The properties of the handle being imported must be described in - * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure - * is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - CUexternalMemoryHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void *nvSciBufObject; - } handle; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type - * of handle being imported. ::CUexternalMemoryHandleType is - * defined as: - * - * \code - typedef enum CUexternalMemoryHandleType_enum { - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 - } CUexternalMemoryHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a memory object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a memory object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a memory object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * memory object are destroyed. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Heap object. This handle holds a reference to the underlying - * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Heap object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Resource object. This handle holds a reference to the - * underlying object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared NT handle that is returned by - * IDXGIResource1::CreateSharedHandle when referring to a - * ID3D11Resource object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D11Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared KMT handle that is returned by - * IDXGIResource::GetSharedHandle when referring to a - * ID3D11Resource object and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL - * and reference a valid NvSciBuf object. 
- * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the - * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync - * as appropriate barriers to maintain coherence between CUDA and the other drivers. - * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * for memory synchronization. - * - * - * The size of the memory object must be specified in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. - * - * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the - * resource is a dedicated resource. The definition of what a - * dedicated resource is outside the scope of this extension. - * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type - * is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * - * \param extMem_out - Returned handle to an external memory object - * \param memHandleDesc - Memory import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the - * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges - * as well as appropriate Vulkan pipeline barriers to maintain coherence between - * CPU and GPU. For more information on these APIs, please refer to "Synchronization - * and Cache Control" chapter from Vulkan specification. - * - * \sa ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); - -/** - * \brief Maps a buffer onto an imported memory object - * - * Maps a buffer onto an imported memory object and returns a device - * pointer in \p devPtr. - * - * The properties of the buffer being mapped must be described in - * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - unsigned long long offset; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in - * the memory object where the buffer's base address is. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. - * - * The offset and size have to be suitably aligned to match the - * requirements of the external API. Mapping two buffers whose ranges - * overlap may or may not result in the same virtual address being - * returned for the overlapped portion. In such cases, the application - * must ensure that all accesses to that region from the GPU are - * volatile. Otherwise writes made via one address are not guaranteed - * to be visible via the other address, even if they're issued by the - * same thread. It is recommended that applications map the combined - * range instead of mapping separate buffers and then apply the - * appropriate offsets to the returned pointer to derive the - * individual buffers. - * - * The returned pointer \p devPtr must be freed using ::cuMemFree. 
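[Editor's note] A sketch of the import-then-map flow for the OPAQUE_FD case described above; importAndMap is an illustrative name, the fd and size are assumed to come from the exporting API, and error handling is omitted.

#include <cuda.h>
#include <string.h>

/* Import a memory object exported as a POSIX fd and map its whole range as a
 * device pointer. Ownership of fd passes to the driver on successful import. */
static CUdeviceptr importAndMap(int fd, unsigned long long size, CUexternalMemory *extMemOut) {
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
    CUdeviceptr dptr = 0;

    memset(&memDesc, 0, sizeof(memDesc));
    memDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    memDesc.handle.fd = fd;
    memDesc.size = size;
    cuImportExternalMemory(extMemOut, &memDesc);

    memset(&bufDesc, 0, sizeof(bufDesc));
    bufDesc.offset = 0;        /* map from the start ... */
    bufDesc.size = size;       /* ... through the whole object; flags stay 0 */
    cuExternalMemoryGetMappedBuffer(&dptr, *extMemOut, &bufDesc);
    return dptr;               /* free with cuMemFree, then cuDestroyExternalMemory */
}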
- * - * \param devPtr - Returned device pointer to buffer - * \param extMem - Handle to external memory object - * \param bufferDesc - Buffer descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); - -/** - * \brief Maps a CUDA mipmapped array onto an external memory object - * - * Maps a CUDA mipmapped array onto an external object and returns a - * handle to it in \p mipmap. - * - * The properties of the CUDA mipmapped array being mapped must be - * described in \p mipmapDesc. The structure - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - unsigned long long offset; - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - unsigned int numLevels; - } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the - * offset in the memory object where the base level of the mipmap - * chain is. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes - * the format, dimensions and type of the base level of the mipmap - * chain. For further details on these parameters, please refer to the - * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped - * array is bound as a color target in the graphics API, then the flag - * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies - * the total number of levels in the mipmap chain. - * - * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. - * - * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. - * - * \param mipmap - Returned CUDA mipmapped array - * \param extMem - Handle to external memory object - * \param mipmapDesc - CUDA array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer - */ -CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); - -/** - * \brief Destroys an external memory object. - * - * Destroys the specified external memory object. Any existing buffers - * and CUDA mipmapped arrays mapped onto this object must no longer be - * used and must be explicitly freed using ::cuMemFree and - * ::cuMipmappedArrayDestroy respectively. 
- * - * \param extMem - External memory object to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); - -/** - * \brief Imports an external semaphore - * - * Imports an externally allocated synchronization object and returns - * a handle to that in \p extSem_out. - * - * The properties of the handle being imported must be described in - * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - CUexternalSemaphoreHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void* NvSciSyncObj; - } handle; - unsigned int flags; - } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of - * handle being imported. ::CUexternalSemaphoreHandleType is defined - * as: - * - * \code - typedef enum CUexternalSemaphoreHandleType_enum { - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 - } CUexternalSemaphoreHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. 
This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * synchronization object are destroyed. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Fence object. This handle holds a reference to the underlying - * object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D12Fence object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that is returned by - * ID3D11Fence::CreateSharedHandle. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D11Fence object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj - * represents a valid NvSciSyncObj. - * - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that - * is returned by IDXGIResource1::CreateSharedHandle when referring to - * a IDXGIKeyedMutex object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid IDXGIKeyedMutex object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared KMT handle that - * is returned by IDXGIResource::GetSharedHandle when referring to - * a IDXGIKeyedMutex object and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. 
Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * \param extSem_out - Returned handle to an external semaphore - * \param semHandleDesc - Semaphore import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); - -/** - * \brief Signals a set of external semaphore objects - * - * Enqueues a signal operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of signaling a semaphore depends on the type of - * the object. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then signaling the semaphore will set it to the signaled state. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then the semaphore will be set to the value specified in - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence - * to a value that can be used by subsequent waiters of the same NvSciSync object - * to order operations with those currently submitted in \p stream. Such an update - * will overwrite previous contents of - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, - * signaling such an external semaphore object causes appropriate memory synchronization - * operations to be performed over all external memory objects that are imported as - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses - * made by other importers of the same set of NvSciBuf memory object(s) are coherent. - * These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. - * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return - * CUDA_ERROR_NOT_SUPPORTED. 
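[Editor's note] A sketch of signaling and waiting on an imported semaphore across two streams, assuming an opaque (binary) semaphore type so the zeroed parameter structs suffice; handOff is an illustrative name and errors are ignored.

#include <cuda.h>
#include <string.h>

/* Signal an imported binary semaphore from streamA and make streamB wait for
 * it. For fence/timeline types, params::fence::value would be set instead of
 * being left zero. */
static void handOff(CUexternalSemaphore sem, CUstream streamA, CUstream streamB) {
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wait;
    memset(&sig, 0, sizeof(sig));
    memset(&wait, 0, sizeof(wait));
    cuSignalExternalSemaphoresAsync(&sem, &sig, 1, streamA);
    cuWaitExternalSemaphoresAsync(&sem, &wait, 1, streamB);
}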
- * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be released with the key specified in - * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. - * - * \param extSemArray - Set of external semaphores to be signaled - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to signal - * \param stream - Stream to enqueue the signal operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Waits on a set of external semaphore objects - * - * Enqueues a wait operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of waiting on a semaphore depends on the type - * of the object. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then waiting on the semaphore will wait until the semaphore reaches - * the signaled state. The semaphore will then be reset to the - * unsignaled state. Therefore for every signal operation, there can - * only be one wait operation. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then waiting on the semaphore will wait until the value of the - * semaphore is greater than or equal to - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * then, waiting on the semaphore will wait until the - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the - * signaler of the NvSciSyncObj that was associated with this semaphore object. - * By default, waiting on such an external semaphore object causes appropriate - * memory synchronization operations to be performed over all external memory objects - * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that - * any subsequent accesses made by other importers of the same set of NvSciBuf memory - * object(s) are coherent. These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. 
- * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return - * CUDA_ERROR_NOT_SUPPORTED. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be acquired when it is released with the key - * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key - * or until the timeout specified by - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs - * has lapsed. The timeout interval can either be a finite value - * specified in milliseconds or an infinite value. In case an infinite - * value is specified the timeout never elapses. The windows INFINITE - * macro must be used to specify infinite timeout. - * - * \param extSemArray - External semaphores to be waited on - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to wait on - * \param stream - Stream to enqueue the wait operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_TIMEOUT - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync - */ -CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Destroys an external semaphore - * - * Destroys an external semaphore object and releases any references - * to the underlying resource. Any outstanding signals or waits must - * have completed before the semaphore is destroyed. - * - * \param extSem - External semaphore to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); - -/** @} */ /* END CUDA_EXTRES_INTEROP */ - -/** - * \defgroup CUDA_MEMOP Stream memory operations - * - * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream memory operations of the low-level CUDA - * driver application programming interface. - * - * The whole set of operations is disabled by default. Users are required - * to explicitly enable them, e.g. on Linux by passing the kernel module - * parameter shown below: - * modprobe nvidia NVreg_EnableStreamMemOPs=1 - * There is currently no way to enable these operations on other operating - * systems. - * - * Users can programmatically query whether the device supports these - * operations with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. 
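As a concrete illustration of the query mechanism just described, a minimal sketch that reads the two attributes mentioned so far (device ordinal 0 and a prior ::cuInit call are assumptions):

\code
#include <cuda.h>
#include <stdio.h>

/* Sketch: report whether stream memory operations, and the NOR wait
 * condition, are usable on device 0. */
static void query_memop_support(void)
{
    CUdevice dev;
    int memOps = 0, norWait = 0;
    cuDeviceGet(&dev, 0);
    cuDeviceGetAttribute(&memOps, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
    cuDeviceGetAttribute(&norWait, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR, dev);
    printf("stream mem ops: %d, CU_STREAM_WAIT_VALUE_NOR: %d\n", memOps, norWait);
}
\endcode

The same pattern applies to the 64-bit and flush-related attributes listed next.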
- * - * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() - * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and - * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and - * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform - * hardware features and can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. - * - * Note that all memory pointers passed as parameters to these operations - * are device pointers. Where necessary a device pointer should be - * obtained, for example with ::cuMemHostGetDevicePointer(). - * - * None of the operations accepts pointers to managed memory buffers - * (::cuMemAllocManaged). - * - * @{ - */ - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue64, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue32, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Batch operations to synchronize the stream via memory operations - * - * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). - * Batching operations may avoid some performance overhead in both the API call - * and the device execution versus adding them to the stream in separate API - * calls. The operations are enqueued in the order they appear in the array. - * - * See ::CUstreamBatchMemOpType for the full set of supported operations, and - * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), - * and ::cuStreamWriteValue64() for details of specific operations. - * - * Basic support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. 
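For concreteness, a minimal sketch of a two-operation batch (wait on one 32-bit word, then write another). The device pointers are assumed to have been obtained beforehand, for example via ::cuMemHostGetDevicePointer(), and the field names assume the ::CUstreamBatchMemOpParams union exposes waitValue/writeValue members as in recent cuda.h versions:

\code
#include <cuda.h>
#include <string.h>

/* Sketch: in one batch, wait until *flagAddr >= 1, then write 1 to *doneAddr. */
static CUresult enqueue_wait_then_write(CUstream stream,
                                        CUdeviceptr flagAddr,
                                        CUdeviceptr doneAddr)
{
    CUstreamBatchMemOpParams ops[2];
    memset(ops, 0, sizeof(ops));

    ops[0].operation         = CU_STREAM_MEM_OP_WAIT_VALUE_32;
    ops[0].waitValue.address = flagAddr;
    ops[0].waitValue.value   = 1;
    ops[0].waitValue.flags   = CU_STREAM_WAIT_VALUE_GEQ;  /* default comparison */

    ops[1].operation          = CU_STREAM_MEM_OP_WRITE_VALUE_32;
    ops[1].writeValue.address = doneAddr;
    ops[1].writeValue.value   = 1;

    return cuStreamBatchMemOp(stream, 2, ops, 0 /* flags: reserved, must be 0 */);
}
\endcode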
See related APIs for details - * on querying support for specific operations. - * - * \param stream The stream to enqueue the operations in. - * \param count The number of operations in the array. Must be less than 256. - * \param paramArray The types and parameters of the individual operations. - * \param flags Reserved for future expansion; must be 0. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuMemHostRegister - */ -CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - -/** @} */ /* END CUDA_MEMOP */ - -/** - * \defgroup CUDA_EXEC Execution Control - * - * ___MANBRIEF___ execution control functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the execution control functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns information about a function - * - * Returns in \p *pi the integer value of the attribute \p attrib on the kernel - * given by \p hfunc. The supported attributes are: - * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads - * per block, beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the function is - * currently loaded. - * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory per block required by this function. - * This does not include dynamically-allocated shared memory requested by - * the user at runtime. - * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated - * constant memory required by this function. - * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory - * used by each thread of this function. - * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread - * of this function. - * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for - * which the function was compiled. This value is the major PTX version * 10 - * + the minor PTX version, so a PTX version 1.3 function would return the - * value 13. Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for - * which the function was compiled. This value is the major binary - * version * 10 + the minor binary version, so a binary version 1.3 function - * would return the value 13. Note that this will return a value of 10 for - * legacy cubins that do not have a properly-encoded binary architecture - * version. - * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has - * been compiled with user specified option "-Xptxas --dlcm=ca" set . - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of - * dynamically-allocated shared memory. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 - * cache split ratio in percent of total shared memory. 
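A minimal sketch of querying a few of the attributes listed above for an already-loaded kernel handle (the attribute selection is arbitrary and error checking is omitted):

\code
#include <cuda.h>
#include <stdio.h>

/* Sketch: print a few limits/costs of a loaded kernel. */
static void print_func_limits(CUfunction hfunc)
{
    int maxThreads = 0, numRegs = 0, staticSmem = 0;
    cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
    cuFuncGetAttribute(&numRegs,    CU_FUNC_ATTRIBUTE_NUM_REGS,              hfunc);
    cuFuncGetAttribute(&staticSmem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,     hfunc);
    printf("max threads/block=%d, regs/thread=%d, static smem=%d bytes\n",
           maxThreads, numRegs, staticSmem);
}
\endcode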
- * - * \param pi - Returned attribute value - * \param attrib - Attribute requested - * \param hfunc - Function to query attribute of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); - -/** - * \brief Sets information about a function - * - * This call sets the value of a specified attribute \p attrib on the kernel given - * by \p hfunc to an integer value specified by \p val - * This function returns CUDA_SUCCESS if the new value of the attribute could be - * successfully set. If the set fails, this call will return an error. - * Not all attributes can have values set. Attempting to set a value on a read-only - * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) - * - * Supported attributes for the cuFuncSetAttribute call are: - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of - * dynamically-allocated shared memory. The value should contain the requested - * maximum size of dynamically-allocated shared memory. The sum of this value and - * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the - * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. - * The maximal size of requestable dynamic shared memory may differ by GPU - * architecture. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 - * cache and shared memory use the same hardware resources, this sets the shared memory - * carveout preference, in percent of the total shared memory. - * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR - * This is only a hint, and the driver can choose a different ratio if required to execute the function. - * - * \param hfunc - Function to query attribute of - * \param attrib - Attribute requested - * \param value - The value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); - -/** - * \brief Sets the preferred cache configuration for a device function - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the device function \p hfunc. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute \p hfunc. Any context-wide preference - * set via ::cuCtxSetCacheConfig() will be overridden by this per-function - * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In - * that case, the current context-wide setting will be used. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. 
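Returning to the attribute setter above, a minimal sketch of the usual opt-in pattern for large dynamic shared memory (the 64 KiB request is an arbitrary example; error checking is omitted):

\code
#include <cuda.h>

/* Sketch: request 64 KiB of dynamic shared memory for `hfunc`, clamped to the
 * device opt-in limit CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. */
static CUresult enable_large_dynamic_smem(CUfunction hfunc, CUdevice dev)
{
    int optin = 0;
    int request = 64 * 1024;
    cuDeviceGetAttribute(&optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
    if (request > optin)
        request = optin;
    return cuFuncSetAttribute(hfunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, request);
}
\endcode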
- * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param hfunc - Kernel to configure cache for - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetCacheConfig - */ -CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); - -/** - * \brief Sets the shared memory configuration for a device function. - * - * On devices with configurable shared memory banks, this function will - * force all subsequent launches of the specified device function to have - * the given shared memory bank size configuration. On any given launch of the - * function, the shared memory configuration of the device will be temporarily - * changed if needed to suit the function's preferred configuration. Changes in - * shared memory configuration between subsequent launches of functions, - * may introduce a device side synchronization point. - * - * Any per-function setting of shared memory bank size set via - * ::cuFuncSetSharedMemConfig will override the context wide setting set with - * ::cuCtxSetSharedMemConfig. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. - * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory - * configuration when launching this function. - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes when launching this function. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes when launching this function. - * - * \param hfunc - kernel to be given a shared memory config - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxSetSharedMemConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetSharedMemConfig - */ -CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); - -/** - * \brief Returns a module handle - * - * Returns in \p *hmod the handle of the module that function \p hfunc - * is located in. 
The lifetime of the module corresponds to the lifetime of - * the context it was loaded in or until the module is explicitly unloaded. - * - * The CUDA runtime manages its own modules loaded into the primary context. - * If the handle returned by this API refers to a module loaded by the CUDA runtime, - * calling ::cuModuleUnload() on that module will result in undefined behavior. - * - * \param hmod - Returned module handle - * \param hfunc - Function to retrieve module for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - */ -CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); - -/** - * \brief Launches a CUDA function - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p f can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. - * - * 2) Kernel parameters can also be packaged by the application into - * a single buffer that is passed in via the \p extra parameter. - * This places the burden on the application of knowing each kernel - * parameter's size and alignment/padding within the buffer. Here is - * an example of using the \p extra parameter in this manner: - * \code - size_t argBufferSize; - char argBuffer[256]; - - // populate argBuffer and argBufferSize - - void *config[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, - CU_LAUNCH_PARAM_END - }; - status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); - * \endcode - * - * The \p extra parameter exists to allow ::cuLaunchKernel to take - * additional less commonly used arguments. \p extra specifies a list of - * names of extra settings and their corresponding values. Each extra - * setting name is immediately followed by the corresponding value. The - * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer containing all - * the kernel parameters for launching kernel \p f; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t containing the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel - * parameters are specified with both \p kernelParams and \p extra - * (i.e. both \p kernelParams and \p extra are non-NULL). 
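The buffer-packing \p extra path is illustrated above; for comparison, a minimal sketch of the more common \p kernelParams path (the kernel signature `k(float *out, int n)`, the block size, and the stream are assumptions):

\code
#include <cuda.h>

/* Sketch: launch k(float *out, int n) over n elements via kernelParams.
 * Each params[] entry points at the storage holding one argument. */
static CUresult launch_simple(CUfunction k, CUdeviceptr out, int n, CUstream stream)
{
    void *params[] = { &out, &n };
    unsigned int block = 256;
    unsigned int grid  = ((unsigned int)n + block - 1) / block;
    return cuLaunchKernel(k, grid, 1, 1,   /* grid dimensions  */
                          block, 1, 1,     /* block dimensions */
                          0,               /* dynamic shared memory bytes */
                          stream, params,
                          NULL);           /* no extra */
}
\endcode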
- * - * Calling ::cuLaunchKernel() invalidates the persistent function state - * set through the following deprecated APIs: - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), - * ::cuParamSetv(). - * - * Note that to use ::cuLaunchKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * \param extra - Extra options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cudaLaunchKernel - */ -CUresult CUDAAPI cuLaunchKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams, - void **extra); - - - - - - - - -/** - * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * The device on which this kernel is invoked must have a non-zero value for - * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. - * - * The total number of blocks launched cannot exceed the maximum number of blocks per - * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. - * - * The kernel cannot make use of CUDA dynamic parallelism. - * - * Kernel parameters must be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. 
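A minimal sketch of honoring the occupancy bound described above when sizing the grid (block size 256 and a single-pointer kernel signature are assumptions; error checking is omitted):

\code
#include <cuda.h>

/* Sketch: cap the cooperative grid at blocksPerSM * SM count, then launch. */
static CUresult launch_cooperative(CUfunction f, CUdevice dev,
                                   CUdeviceptr data, CUstream stream)
{
    int blocksPerSM = 0, smCount = 0;
    int block = 256;
    cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, f, block, 0 /* dyn smem */);
    cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    void *params[] = { &data };
    return cuLaunchCooperativeKernel(f,
                                     (unsigned int)(blocksPerSM * smCount), 1, 1,
                                     (unsigned int)block, 1, 1,
                                     0, stream, params);
}
\endcode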
- * - * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API - * - * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous - * block shape, shared size and parameter info associated with \p f - * is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernelMultiDevice, - * ::cudaLaunchCooperativeKernel - */ -CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams); - -/** - * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute - * - * \deprecated This function is deprecated as of CUDA 11.3. - * - * Invokes kernels as specified in the \p launchParamsList array where each element - * of the array specifies all the parameters required to perform a single kernel launch. - * These kernels can cooperate and synchronize as they execute. The size of the array is - * specified by \p numDevices. - * - * No two kernels can be launched on the same device. All the devices targeted by this - * multi-device launch must be identical. All devices must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. - * - * All kernels launched must be identical with respect to the compiled code. Note that - * any __device__, __constant__ or __managed__ variables present in the module that owns - * the kernel launched on each device, are independently instantiated on every device. - * It is the application's responsiblity to ensure these variables are initialized and - * used appropriately. - * - * The size of the grids as specified in blocks, the size of the blocks themselves - * and the amount of shared memory used by each thread block must also match across - * all launched kernels. 
- * - * The streams used to launch these kernels must have been created via either ::cuStreamCreate - * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD - * cannot be used. - * - * The total number of blocks launched per kernel cannot exceed the maximum number of blocks - * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the - * total number of blocks launched per device has to match across all devices, the maximum - * number of blocks that can be launched per device will be limited by the device with the - * least number of multiprocessors. - * - * The kernels cannot make use of CUDA dynamic parallelism. - * - * The ::CUDA_LAUNCH_PARAMS structure is defined as: - * \code - typedef struct CUDA_LAUNCH_PARAMS_st - { - CUfunction function; - unsigned int gridDimX; - unsigned int gridDimY; - unsigned int gridDimZ; - unsigned int blockDimX; - unsigned int blockDimY; - unsigned int blockDimZ; - unsigned int sharedMemBytes; - CUstream hStream; - void **kernelParams; - } CUDA_LAUNCH_PARAMS; - * \endcode - * where: - * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must - * be identical with respect to the compiled code. - * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. - * This must match across all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot - * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated - * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. - * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If - * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams - * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through - * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual - * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes - * do not need to be specified as that information is retrieved directly from the kernel's image. - * - * By default, the kernel won't begin execution on any GPU until all prior work in all the specified - * streams has completed. This behavior can be overridden by specifying the flag - * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. 
When this flag is specified, each kernel - * will only wait for prior work in the stream corresponding to that GPU to complete before it begins - * execution. - * - * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin - * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying - * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, - * any subsequent work pushed in any of the specified streams will only wait for the kernel launched - * on the GPU corresponding to that stream to complete before it begins execution. - * - * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API when called individually for each - * element in \p launchParamsList. - * - * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous - * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function - * in \p launchParamsList is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param launchParamsList - List of launch parameters, one per device - * \param numDevices - Size of the \p launchParamsList array - * \param flags - Flags to control launch behavior - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernel, - * ::cudaLaunchCooperativeKernelMultiDevice - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); - -/** - * \brief Enqueues a host function call in a stream - * - * Enqueues a host function to run in a stream. The function will be called - * after currently enqueued work and will block work added after it. - * - * The host function must not make any CUDA API calls. Attempting to use a - * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. - * The host function must not perform any synchronization that may depend on - * outstanding CUDA work not mandated to run earlier. Host functions without a - * mandated order (such as in independent streams) execute in undefined order - * and may be serialized. - * - * For the purposes of Unified Memory, execution makes a number of guarantees: - *
- * - The stream is considered idle for the duration of the function's
- *   execution. Thus, for example, the function may always use memory attached
- *   to the stream it was enqueued in.
- * - The start of execution of the function has the same effect as
- *   synchronizing an event recorded in the same stream immediately prior to
- *   the function. It thus synchronizes streams which have been "joined"
- *   prior to the function.
- * - Adding device work to any stream does not have the effect of making
- *   the stream active until all preceding host functions and stream callbacks
- *   have executed. Thus, for
- *   example, a function might use global attached memory even if work has
- *   been added to another stream, if the work has been ordered behind the
- *   function call with an event.
- * - Completion of the function does not cause a stream to become
- *   active except as described above. The stream will remain idle
- *   if no device work follows the function, and will remain idle across
- *   consecutive host functions or stream callbacks without device work in
- *   between. Thus, for example,
- *   stream synchronization can be done by signaling from a host function at the
- *   end of the stream.
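A minimal sketch of the signaling idiom from the last guarantee: a host function that flips a plain flag the application can poll, with no CUDA calls made inside the callback (the flag type and the polling strategy are assumptions):

\code
#include <cuda.h>

/* Sketch: mark completion of all prior work in `stream` from the host.
 * The callback must not call back into the CUDA API. */
static void CUDA_CB mark_done(void *userData)
{
    *(volatile int *)userData = 1;
}

static CUresult enqueue_done_marker(CUstream stream, volatile int *doneFlag)
{
    *doneFlag = 0;
    return cuLaunchHostFunc(stream, mark_done, (void *)doneFlag);
}
\endcode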
- * - * Note that, in contrast to ::cuStreamAddCallback, the function will not be - * called in the event of an error in the CUDA context. - * - * \param hStream - Stream to enqueue function call in - * \param fn - The function to call once preceding stream operations are complete - * \param userData - User-specified data to be passed to the function - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuStreamAddCallback - */ -CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - -/** @} */ /* END CUDA_EXEC */ - -/** - * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] - * - * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated execution control functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the block-dimensions for the function - * - * \deprecated - * - * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are - * created when the kernel given by \p hfunc is launched. - * - * \param hfunc - Kernel to specify dimensions of - * \param x - X dimension - * \param y - Y dimension - * \param z - Z dimension - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetSharedSize, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); - -/** - * \brief Sets the dynamic shared-memory size for the function - * - * \deprecated - * - * Sets through \p bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by \p hfunc is launched. - * - * \param hfunc - Kernel to specify dynamic shared-memory size for - * \param bytes - Dynamic shared-memory size per thread in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); - -/** - * \brief Sets the parameter size for the function - * - * \deprecated - * - * Sets through \p numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to \p hfunc. 
- * - * \param hfunc - Kernel to set parameter size for - * \param numbytes - Size of parameter list in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); - -/** - * \brief Adds an integer parameter to the function's argument list - * - * \deprecated - * - * Sets an integer parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. - * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); - -/** - * \brief Adds a floating-point parameter to the function's argument list - * - * \deprecated - * - * Sets a floating-point parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. - * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); - -/** - * \brief Adds arbitrary data to the function's argument list - * - * \deprecated - * - * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr - * into the parameter space of the kernel corresponding to \p hfunc. \p offset - * is a byte offset. 
- * - * \param hfunc - Kernel to add data to - * \param offset - Offset to add data to argument list - * \param ptr - Pointer to arbitrary data - * \param numbytes - Size of data to copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block - * contains the number of threads specified by a previous call to - * ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. 
- * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * - * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), - * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by - * growing the per-thread stack as needed per launch and not shrinking it afterwards. - * - * \note_null_stream - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); - - -/** - * \brief Adds a texture-reference to the function's argument list - * - * \deprecated - * - * Makes the CUDA array or linear memory bound to the texture reference - * \p hTexRef available to a device program as a texture. In this version of - * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and - * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. 
- * - * \param hfunc - Kernel to add texture-reference to - * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) - * \param hTexRef - Texture-reference to add to argument list - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); -/** @} */ /* END CUDA_EXEC_DEPRECATED */ - -/** - * \defgroup CUDA_GRAPH Graph Management - * - * ___MANBRIEF___ graph management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graph management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates a graph - * - * Creates an empty graph, which is returned via \p phGraph. - * - * \param phGraph - Returns newly created graph - * \param flags - Graph creation flags, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphInstantiate, - * ::cuGraphDestroy, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); - -/** - * \brief Creates a kernel execution node and adds it to a graph - * - * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The CUDA_KERNEL_NODE_PARAMS structure is defined as: - * - * \code - * typedef struct CUDA_KERNEL_NODE_PARAMS_st { - * CUfunction func; - * unsigned int gridDimX; - * unsigned int gridDimY; - * unsigned int gridDimZ; - * unsigned int blockDimX; - * unsigned int blockDimY; - * unsigned int blockDimZ; - * unsigned int sharedMemBytes; - * void **kernelParams; - * void **extra; - * } CUDA_KERNEL_NODE_PARAMS; - * \endcode - * - * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x - * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains - * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p func can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N - * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, - * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual - * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need - * to be specified as that information is retrieved directly from the kernel's image. 
- * - * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single - * buffer that is passed in via \p extra. This places the burden on the application of knowing each - * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists - * to allow this function to take additional less commonly used arguments. \p extra specifies - * a list of names of extra settings and their corresponding values. Each extra setting name is - * immediately followed by the corresponding value. The list must be terminated with either NULL or - * CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer - * containing all the kernel parameters for launching kernel - * \p func; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t - * containing the size of the buffer specified with - * ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both - * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). - * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. - * - * The \p kernelParams or \p extra array, as well as the argument values it points to, - * are copied during this call. - * - * \note Kernels launched using graphs must not use texture and surface references. Reading or - * writing through any texture or surface reference is undefined behavior. - * This restriction does not apply to texture and surface objects. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the GPU execution node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuLaunchCooperativeKernel, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a kernel node's parameters - * - * Returns the parameters of kernel node \p hNode in \p nodeParams. - * The \p kernelParams or \p extra array returned in \p nodeParams, - * as well as the argument values it points to, are owned by the node. - * This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphKernelNodeSetParams to update the - * parameters of this node. - * - * The params will contain either \p kernelParams or \p extra, - * according to which of these was most recently set on the node. 
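Pulling the kernel-node pieces together, a minimal sketch that creates an empty graph and adds one root kernel node (the kernel signature `k(float *out, int n)` and the launch geometry are assumptions; graph instantiation and launch are omitted):

\code
#include <cuda.h>
#include <string.h>

/* Sketch: empty graph plus a single kernel node with no dependencies. */
static CUresult build_kernel_graph(CUfunction k, CUdeviceptr out, int n,
                                   CUgraph *graphOut, CUgraphNode *nodeOut)
{
    CUresult err = cuGraphCreate(graphOut, 0);
    if (err != CUDA_SUCCESS)
        return err;

    void *params[] = { &out, &n };   /* copied by cuGraphAddKernelNode */
    CUDA_KERNEL_NODE_PARAMS np;
    memset(&np, 0, sizeof(np));
    np.func = k;
    np.gridDimX = 64;   np.gridDimY = 1;  np.gridDimZ = 1;
    np.blockDimX = 256; np.blockDimY = 1; np.blockDimZ = 1;
    np.sharedMemBytes = 0;
    np.kernelParams = params;
    np.extra = NULL;

    return cuGraphAddKernelNode(nodeOut, *graphOut, NULL /* no deps */, 0, &np);
}
\endcode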
- * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams - */ -CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a kernel node's parameters - * - * Sets the parameters of kernel node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeGetParams - */ -CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a memcpy node and adds it to a graph - * - * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will perform the memcpy described by \p copyParams. - * See ::cuMemcpy3D() for a description of the structure and its restrictions. - * - * Memcpy nodes have some additional restrictions with regards to managed memory, if the - * system contains at least one device which has a zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer - * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed - * for those operand(s). The managed memory will be treated as residing on either the - * host or the device, depending on which memory type is specified. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param copyParams - Parameters for the memory copy - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Returns a memcpy node's parameters - * - * Returns the parameters of memcpy node \p hNode in \p nodeParams. 
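A minimal sketch of ::cuGraphAddMemcpyNode, assuming the usual ::CUDA_MEMCPY3D members (srcMemoryType, srcDevice, dstMemoryType, dstDevice, WidthInBytes, Height, Depth) declared earlier in this header; the graph, context, and device pointers are placeholders. A 1-D copy is expressed as a 3-D copy with height and depth of 1.

\code
#include <cuda.h>
#include <string.h>

/* Add a 1-D device-to-device copy of `nbytes` from `src` to `dst` as a graph node.
 * `ctx` is the context the copy should run in. */
static CUresult addCopyNode(CUgraph graph, CUcontext ctx,
                            CUdeviceptr dst, CUdeviceptr src, size_t nbytes,
                            CUgraphNode *nodeOut)
{
    CUDA_MEMCPY3D copy;
    memset(&copy, 0, sizeof(copy));                /* unused fields must be zero */
    copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.srcDevice     = src;
    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.dstDevice     = dst;
    copy.WidthInBytes  = nbytes;                   /* 1-D copy: height and depth are 1 */
    copy.Height        = 1;
    copy.Depth         = 1;

    return cuGraphAddMemcpyNode(nodeOut, graph, NULL, 0, &copy, ctx);
}
\endcode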
- * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Sets a memcpy node's parameters - * - * Sets the parameters of memcpy node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeGetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Creates a memset node and adds it to a graph - * - * Creates a new memset node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The element size must be 1, 2, or 4 bytes. - * When the graph is launched, the node will perform the memset described by \p memsetParams. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param memsetParams - Parameters for the memory set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode - */ -CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Returns a memset node's parameters - * - * Returns the parameters of memset node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a memset node's parameters - * - * Sets the parameters of memset node \p hNode to \p nodeParams. 
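A minimal sketch of ::cuGraphAddMemsetNode, assuming the usual ::CUDA_MEMSET_NODE_PARAMS members (dst, pitch, value, elementSize, width, height); the graph, context, and destination pointer are placeholders.

\code
#include <cuda.h>

/* Add a node that zero-fills `count` 32-bit words starting at `dst`. */
static CUresult addZeroFillNode(CUgraph graph, CUcontext ctx,
                                CUdeviceptr dst, size_t count,
                                CUgraphNode *nodeOut)
{
    CUDA_MEMSET_NODE_PARAMS p = {0};
    p.dst         = dst;
    p.value       = 0;          /* value written to every element */
    p.elementSize = 4;          /* must be 1, 2, or 4 bytes */
    p.width       = count;      /* elements per row */
    p.height      = 1;          /* single row, so pitch is ignored */
    p.pitch       = 0;

    return cuGraphAddMemsetNode(nodeOut, graph, NULL, 0, &p, ctx);
}
\endcode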
- * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeGetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a host execution node and adds it to a graph - * - * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will invoke the specified CPU function. - * Host nodes are not supported under MPS with pre-Volta GPUs. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the host node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a host node's parameters - * - * Returns the parameters of host node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams - */ -CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a host node's parameters - * - * Sets the parameters of host node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeGetParams - */ -CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a child graph node and adds it to a graph - * - * Creates a new node which executes an embedded graph, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. 
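A minimal sketch of ::cuGraphAddHostNode, assuming ::CUDA_HOST_NODE_PARAMS carries the usual fn/userData pair and that ::CUhostFn callbacks take a single void* argument; the dependency node is a placeholder.

\code
#include <cuda.h>
#include <stdio.h>

/* Host callback invoked when the node executes; runs on a CPU thread. */
static void CUDA_CB hostStep(void *userData)
{
    printf("graph reached host node: %s\n", (const char *)userData);
}

/* Add a host node that calls hostStep("checkpoint") after node `dep`. */
static CUresult addHostNode(CUgraph graph, CUgraphNode dep, CUgraphNode *nodeOut)
{
    CUDA_HOST_NODE_PARAMS p = {0};
    p.fn       = hostStep;
    p.userData = (void *)"checkpoint";

    return cuGraphAddHostNode(nodeOut, graph, &dep, 1, &p);
}
\endcode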
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * If \p hGraph contains allocation or free nodes, this call will return an error. - * - * The node executes an embedded child graph. The child graph is cloned in this call. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param childGraph - The graph to clone into this node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); - -/** - * \brief Gets a handle to the embedded graph of a child graph node - * - * Gets a handle to the embedded graph in a child graph node. This call - * does not clone the graph. Changes to the graph will be reflected in - * the node, and the node retains ownership of the graph. - * - * Allocation and free nodes cannot be added to the returned graph. - * Attempting to do so will return an error. - * - * \param hNode - Node to get the embedded graph for - * \param phGraph - Location to store a handle to the graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); - -/** - * \brief Creates an empty node and adds it to a graph - * - * Creates a new node which performs no operation, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * An empty node performs no operation during execution, but can be used for - * transitive ordering. For example, a phased execution graph with 2 groups of n - * nodes with a barrier between them can be represented using an empty node and - * 2*n dependency edges, rather than no empty node and n^2 dependency edges. 
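A minimal sketch of the barrier pattern described above, combined with ::cuGraphAddChildGraphNode; the parent graph, the n predecessor nodes, and the child graph are placeholders.

\code
#include <cuda.h>

/* Embed `childGraph` (cloned at this call) after the `n` nodes in `groupA`,
 * using one empty node as a barrier so only 2*n edges are needed instead of
 * one edge from every node in groupA to every node of the next phase. */
static CUresult addPhase(CUgraph graph, const CUgraphNode *groupA, size_t n,
                         CUgraph childGraph, CUgraphNode *childNodeOut)
{
    CUgraphNode barrier;
    CUresult err = cuGraphAddEmptyNode(&barrier, graph, groupA, n);
    if (err != CUDA_SUCCESS) return err;

    /* The child graph node depends only on the barrier node. */
    return cuGraphAddChildGraphNode(childNodeOut, graph, &barrier, 1, childGraph);
}
\endcode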
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); - -/** - * \brief Creates an event record node and adds it to a graph - * - * Creates a new event record node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * Each launch of the graph will record \p event to capture execution of the - * node's dependencies. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event record node - * - * Returns the event of event record node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event record node's event - * - * Sets the event of event record node \p hNode to \p event. 
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an event wait node and adds it to a graph - * - * Creates a new event wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The graph node will wait for all work captured in \p event. See ::cuEventRecord() - * for details on what is captured by an event. \p event may be from a different context - * or device than the launch stream. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event wait node - * - * Returns the event of event wait node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event wait node's event - * - * Sets the event of event wait node \p hNode to \p event. 
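A minimal sketch of ::cuGraphAddEventRecordNode and ::cuGraphAddEventWaitNode used to order two separately launched graphs through one event; all handles are placeholders and the event is assumed to have been created with ::cuEventCreate.

\code
#include <cuda.h>

/* Record `event` after node `producer` in `producerGraph`, and make a root of
 * `consumerGraph` wait on it. The two graphs can then be launched into
 * different streams and still be ordered through the event. */
static CUresult linkGraphsWithEvent(CUgraph producerGraph, CUgraphNode producer,
                                    CUgraph consumerGraph, CUevent event,
                                    CUgraphNode *recordOut, CUgraphNode *waitOut)
{
    CUresult err = cuGraphAddEventRecordNode(recordOut, producerGraph, &producer, 1, event);
    if (err != CUDA_SUCCESS) return err;

    /* No dependencies: the wait node is a root of the consumer graph. */
    return cuGraphAddEventWaitNode(waitOut, consumerGraph, NULL, 0, event);
}
\endcode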
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an external semaphore signal node and adds it to a graph - * - * Creates a new external semaphore signal node and adds it to \p hGraph with \p - * numDependencies dependencies specified via \p dependencies and arguments specified - * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the - * node will be placed at the root of the graph. \p dependencies may not have any - * duplicate entries. A handle to the new node will be returned in \p phGraphNode. - * - * Performs a signal operation on a set of externally allocated semaphore objects - * when the node is launched. The operation(s) will occur after all of the node's - * dependencies have completed. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresSignalNodeGetParams, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore signal node's parameters - * - * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the - * parameters of this node. 
- * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore signal node's parameters - * - * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an external semaphore wait node and adds it to a graph - * - * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * Performs a wait operation on a set of externally allocated semaphore objects - * when the node is launched. The node's dependencies will not be launched until - * the wait operation has completed. 
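A minimal sketch of ::cuGraphAddExternalSemaphoresSignalNode, assuming the extSemArray/paramsArray/numExtSems members named above and the usual params.fence.value field of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; the imported semaphore and the dependency node are placeholders.

\code
#include <cuda.h>
#include <string.h>

/* Signal one previously imported external semaphore (e.g. a timeline
 * semaphore) to `fenceValue` once node `dep` has completed. */
static CUresult addSignalNode(CUgraph graph, CUgraphNode dep,
                              CUexternalSemaphore sem, unsigned long long fenceValue,
                              CUgraphNode *nodeOut)
{
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams;
    memset(&sigParams, 0, sizeof(sigParams));
    sigParams.params.fence.value = fenceValue;

    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS nodeParams;
    memset(&nodeParams, 0, sizeof(nodeParams));
    nodeParams.extSemArray = &sem;                 /* one semaphore ... */
    nodeParams.paramsArray = &sigParams;           /* ... with one set of signal params */
    nodeParams.numExtSems  = 1;

    return cuGraphAddExternalSemaphoresSignalNode(nodeOut, graph, &dep, 1, &nodeParams);
}
\endcode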
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresWaitNodeGetParams, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore wait node's parameters - * - * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the - * parameters of this node. - * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore wait node's parameters - * - * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an allocation node and adds it to a graph - * - * Creates a new allocation node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in - * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. - * - * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, - * the allocation can be accessed by nodes ordered after the allocation node but before the free node. - * These allocations cannot be freed outside the owning graph, and they can only be freed once in the - * owning graph. - * - * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the - * graph which are ordered after the allocation node, but also by stream operations ordered after the - * graph's execution but before the allocation is freed. - * - * Allocations which are not freed in the same graph can be freed by: - * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; - * - launching a graph with a free node for that allocation; or - * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes - * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. - * - * It is not possible to free an allocation in both the owning graph and another graph. If the allocation - * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed - * in another graph, a free node can no longer be added to the owning graph. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. - * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a memory alloc node's parameters - * - * Returns the parameters of a memory alloc node \p hNode in \p params_out. - * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the - * node. 
This memory remains valid until the node is destroyed. The returned - * parameters must not be modified. - * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams - */ -CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); - -/** - * \brief Creates a memory free node and adds it to a graph - * - * Creates a new memory free node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param dptr - Address of memory to free - * - * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: - * - an allocation twice in the same graph. - * - an address that was not returned by an allocation node. - * - an invalid address. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. - * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); - -/** - * \brief Returns a memory free node's parameters - * - * Returns the address of a memory free node \p hNode in \p dptr_out. - * - * \param hNode - Node to get the parameters for - * \param dptr_out - Pointer to return the device address - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams - */ -CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); - -/** - * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. 
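A minimal sketch of ::cuGraphAddMemAllocNode and ::cuGraphAddMemFreeNode, assuming ::CUDA_MEM_ALLOC_NODE_PARAMS exposes the poolProps, bytesize, and dptr members referred to above and that ::CUmemPoolProps takes the usual allocType/location fields; the device and graph handles are placeholders.

\code
#include <cuda.h>
#include <string.h>

/* Add an allocation node for `bytes` of device memory on `device`, followed by
 * a node that frees it again. Nodes ordered between the two may use *dptrOut. */
static CUresult addScratchAllocation(CUgraph graph, CUdevice device, size_t bytes,
                                     CUgraphNode *allocOut, CUgraphNode *freeOut,
                                     CUdeviceptr *dptrOut)
{
    CUDA_MEM_ALLOC_NODE_PARAMS p;
    memset(&p, 0, sizeof(p));
    p.poolProps.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
    p.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    p.poolProps.location.id   = device;
    p.bytesize                = bytes;

    CUresult err = cuGraphAddMemAllocNode(allocOut, graph, NULL, 0, &p);
    if (err != CUDA_SUCCESS) return err;
    *dptrOut = p.dptr;                  /* address is fixed across instantiations and launches */

    /* Freeing in the same graph: the free node must be ordered after all users
     * of the allocation; here it depends directly on the allocation node. */
    return cuGraphAddMemFreeNode(freeOut, graph, allocOut, 1, p.dptr);
}
\endcode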
- * - * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are - * freed back to the operating system. - * - * \param device - The device for which cached memory should be freed. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode, - * ::cuDeviceSetGraphMemAttribute, - * ::cuDeviceGetGraphMemAttribute - */ -CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); - -/** - * \brief Query asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - * \param device - Specifies the scope of the query - * \param attr - attribute to get - * \param value - retrieved value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceSetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Set asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - * \param device - Specifies the scope of the attribute - * \param attr - attribute to set - * \param value - pointer to value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceGetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Clones a graph - * - * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. - * All parameters are copied into the cloned graph. The original graph may be modified - * after this call without affecting the clone. - * - * Child graph nodes in the original graph are recursively copied into the clone. - * - * \param phGraphClone - Returns newly created cloned graph - * \param originalGraph - Graph to clone - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); - -/** - * \brief Finds a cloned version of a node - * - * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode - * in the original graph. - * - * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone.
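A minimal sketch of ::cuDeviceGetGraphMemAttribute and ::cuDeviceGraphMemTrim, assuming the attribute values are reported as cuuint64_t; the device handle is a placeholder.

\code
#include <cuda.h>
#include <stdio.h>

/* Print how much memory the graph allocator currently holds for `device`,
 * then return unused cached blocks to the OS. */
static void printGraphMemUsage(CUdevice device)
{
    cuuint64_t used = 0, reserved = 0;
    cuDeviceGetGraphMemAttribute(device, CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, &used);
    cuDeviceGetGraphMemAttribute(device, CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, &reserved);
    printf("graph mem: %llu bytes used, %llu bytes reserved\n",
           (unsigned long long)used, (unsigned long long)reserved);

    cuDeviceGraphMemTrim(device);
}
\endcode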
- * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to - * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have - * been removed. The cloned node is then returned via \p phClonedNode. - * - * \param phNode - Returns handle to the cloned node - * \param hOriginalNode - Handle to the original node - * \param hClonedGraph - Cloned graph to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); - -/** - * \brief Returns a node's type - * - * Returns the node type of \p hNode in \p type. - * - * \param hNode - Node to query - * \param type - Pointer to return the node type - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); - -/** - * \brief Returns a graph's nodes - * - * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this - * function will return the number of nodes in \p numNodes. Otherwise, - * \p numNodes entries will be filled in. If \p numNodes is higher than the actual - * number of nodes, the remaining entries in \p nodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numNodes. - * - * \param hGraph - Graph to query - * \param nodes - Pointer to return the nodes - * \param numNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); - -/** - * \brief Returns a graph's root nodes - * - * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this - * function will return the number of root nodes in \p numRootNodes. Otherwise, - * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual - * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numRootNodes. 
- * - * \param hGraph - Graph to query - * \param rootNodes - Pointer to return the root nodes - * \param numRootNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); - -/** - * \brief Returns a graph's dependency edges - * - * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding - * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the - * node in \p from[i]. \p from and \p to may both be NULL, in which - * case this function only returns the number of edges in \p numEdges. Otherwise, - * \p numEdges entries will be filled in. If \p numEdges is higher than the actual - * number of edges, the remaining entries in \p from and \p to will be set to NULL, and - * the number of edges actually returned will be written to \p numEdges. - * - * \param hGraph - Graph to get the edges from - * \param from - Location to return edge endpoints - * \param to - Location to return edge endpoints - * \param numEdges - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); - -/** - * \brief Returns a node's dependencies - * - * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this - * function will return the number of dependencies in \p numDependencies. Otherwise, - * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual - * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numDependencies. - * - * \param hNode - Node to query - * \param dependencies - Pointer to return the dependencies - * \param numDependencies - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependentNodes, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); - -/** - * \brief Returns a node's dependent nodes - * - * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which - * case this function will return the number of dependent nodes in \p numDependentNodes. - * Otherwise, \p numDependentNodes entries will be filled in. 
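A minimal sketch of the query-then-fill pattern described above for ::cuGraphGetNodes; the same shape applies to ::cuGraphGetRootNodes, ::cuGraphGetEdges, and the dependency queries. The graph handle is a placeholder.

\code
#include <cuda.h>
#include <stdlib.h>

/* Enumerate the nodes of `graph` with the usual two-call pattern: first query
 * the count with a NULL array, then allocate and fetch the handles. */
static CUresult listNodes(CUgraph graph)
{
    size_t count = 0;
    CUresult err = cuGraphGetNodes(graph, NULL, &count);   /* query count only */
    if (err != CUDA_SUCCESS || count == 0) return err;

    CUgraphNode *nodes = (CUgraphNode *)malloc(count * sizeof(CUgraphNode));
    if (nodes == NULL) return CUDA_ERROR_OUT_OF_MEMORY;

    err = cuGraphGetNodes(graph, nodes, &count);           /* fill `count` entries */
    /* ... inspect nodes[0..count-1], e.g. with cuGraphNodeGetType ... */
    free(nodes);
    return err;
}
\endcode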
If \p numDependentNodes is - * higher than the actual number of dependent nodes, the remaining entries in - * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will - * be returned in \p numDependentNodes. - * - * \param hNode - Node to query - * \param dependentNodes - Pointer to return the dependent nodes - * \param numDependentNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependencies, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); - -/** - * \brief Adds dependency edges to a graph - * - * The number of dependencies to be added is defined by \p numDependencies - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. - * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying an existing dependency will return an error. - * - * \param hGraph - Graph to which dependencies are added - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be added - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphRemoveDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Removes dependency edges from a graph - * - * The number of \p dependencies to be removed is defined by \p numDependencies. - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. - * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying a non-existing dependency will return an error. - * - * Dependencies cannot be removed from graphs which contain allocation or free nodes. - * Any attempt to do so will return an error. - * - * \param hGraph - Graph from which to remove dependencies - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be removed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Remove a node from the graph - * - * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes - * on \p hNode and vice versa. - * - * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. - * Any attempt to do so will return an error. 
- * - * \param hNode - Node to remove - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. - * - * If there are any errors, diagnostic information may be returned in \p errorNode and - * \p logBuffer. This is the primary way to inspect instantiation errors. The output - * will be null terminated unless the diagnostics overflow - * the buffer. In this case, they will be truncated, and the last byte can be - * inspected to determine if truncation occurred. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param phErrorNode - In case of an instantiation error, this may be modified to - * indicate a node contributing to the error - * \param logBuffer - A character buffer to store diagnostic messages - * \param bufferSize - Size of the log buffer in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiateWithFlags, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. - * - * The \p flags parameter controls the behavior of instantiation and subsequent - * graph launches. Valid flags are: - * - * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a - * graph containing memory allocation nodes to automatically free any - * unfreed memory allocations before the graph is relaunched. - * - * If \p hGraph contains any allocation or free nodes, there can be at most one - * executable graph in existence for that graph at a time. - * - * An attempt to instantiate a second executable graph before destroying the first - * with ::cuGraphExecDestroy will result in an error. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); - -/** - * \brief Sets the parameters for a kernel node in the given graphExec - * - * Sets the parameters of a kernel node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. All \p nodeParams - * fields may change, but the following restrictions apply to \p func updates: - * - * - The owning context of the function cannot change. - * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated - * to a function which uses CDP - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - kernel node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for a memcpy node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p copyParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The source and destination memory in \p copyParams must be allocated from the same - * contexts as the original source and destination memory. Both the instantiation-time - * memory operands and the memory operands in \p copyParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or - * either the original or new memory operands are multidimensional. 
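A minimal sketch tying ::cuGraphInstantiate, ::cuGraphLaunch, and ::cuGraphExecKernelNodeSetParams together: instantiate once, launch, patch the kernel node's grid size in the executable graph only, and launch again. The kernel node, its original parameters, and the stream are placeholders.

\code
#include <cuda.h>
#include <stdio.h>

/* Instantiate `graph`, launch it once, then update the kernel node's grid size
 * in the executable graph and launch it again without re-instantiating. */
static CUresult runTwice(CUgraph graph, CUgraphNode kernelNode,
                         CUDA_KERNEL_NODE_PARAMS params, CUstream stream)
{
    char log[256] = {0};
    CUgraphNode errNode = NULL;
    CUgraphExec exec;
    CUresult err = cuGraphInstantiate(&exec, graph, &errNode, log, sizeof(log));
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "instantiation failed: %s\n", log);
        return err;
    }

    err = cuGraphLaunch(exec, stream);
    if (err == CUDA_SUCCESS) {
        params.gridDimX *= 2;                       /* only the executable copy changes */
        err = cuGraphExecKernelNodeSetParams(exec, kernelNode, &params);
        if (err == CUDA_SUCCESS) err = cuGraphLaunch(exec, stream);
    }
    if (err == CUDA_SUCCESS) err = cuStreamSynchronize(stream);

    cuGraphExecDestroy(exec);
    return err;
}
\endcode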
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memcpy node from the graph which was used to instantiate graphExec - * \param copyParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a memset node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p memsetParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The destination memory in \p memsetParams must be allocated from the same - * contexts as the original destination memory. Both the instantiation-time - * memory operand and the memory operand in \p memsetParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or - * either the original or new memory operand are multidimensional. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memset node from the graph which was used to instantiate graphExec - * \param memsetParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a host node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p nodeParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param nodeParams - The updated parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Updates node parameters in the child graph node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained - * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. - * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. - * Changed edges to and from \p hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. \p hNode is also - * not modified by this call. - * - * The topology of \p childGraph, as well as the node insertion order, must match that - * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions - * on what can be updated in an instantiated graph. The update is recursive, so child graph - * nodes contained within the top level child graph will also be updated. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param childGraph - The graph supplying the updated parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); - -/** - * \brief Sets the event for an event record node in the given graphExec - * - * Sets the event of an event record node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
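- *
- * A minimal usage sketch (assuming \p hGraphExec and the event record node \p hNode
- * already exist, and that ::cuEventCreate is available to create the replacement event):
- * \code
- *   CUevent newEvent;
- *   cuEventCreate(&newEvent, CU_EVENT_DEFAULT);
- *   CUresult res = cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, newEvent);
- * \endcode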
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event record node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the event for an event wait node in the given graphExec - * - * Sets the event of an event wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event wait node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the parameters for an external semaphore signal node in the given graphExec - * - * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore signal node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for an external semaphore wait node in the given graphExec - * - * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore wait node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Enables or disables the specified node in the given graphExec - * - * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent - * to empty nodes until they are reenabled. Existing node parameters are not affected by - * disabling/enabling the node. - * - * The node is identified by the corresponding node \p hNode in the non-executable - * graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
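- *
- * A minimal usage sketch (assuming \p hGraphExec and a kernel node \p hNode kept from
- * graph construction):
- * \code
- *   cuGraphNodeSetEnabled(hGraphExec, hNode, 0);   // subsequent launches skip the node
- *   // ... launch the graph one or more times without the node ...
- *   cuGraphNodeSetEnabled(hGraphExec, hNode, 1);   // restore the node's original work
- *
- *   unsigned int isEnabled;
- *   cuGraphNodeGetEnabled(hGraphExec, hNode, &isEnabled);   // isEnabled is now 1
- * \endcode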
- *
- * \note Currently only kernel nodes are supported.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode - Node from the graph from which graphExec was instantiated
- * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeGetEnabled,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
-
-/**
- * \brief Query whether a node in the given graphExec is enabled
- *
- * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
- *
- * The node is identified by the corresponding node \p hNode in the non-executable
- * graph, from which the executable graph was instantiated.
- *
- * \p hNode must not have been removed from the original graph.
- *
- * \note Currently only kernel nodes are supported.
- *
- * \param hGraphExec - The executable graph in which to query the specified node
- * \param hNode - Node from the graph from which graphExec was instantiated
- * \param isEnabled - Location to return the enabled status of the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeSetEnabled,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
-
-/**
- * \brief Uploads an executable graph in a stream
- *
- * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
- * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
- * previous work in \p hStream and any previous launches of \p hGraphExec.
- * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
- *
- * \param hGraphExec - Executable graph to upload
- * \param hStream - Stream in which to upload the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Launches an executable graph in a stream
- *
- * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
- * at a time. Each launch is ordered behind both any previous work in \p hStream
- * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
- * instantiated multiple times into multiple executable graphs.
- *
- * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
- * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
- * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
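- *
- * A minimal usage sketch (assuming \p hGraphExec was instantiated earlier and \p hStream
- * is an existing stream; ::cuStreamSynchronize is assumed available for the final wait):
- * \code
- *   cuGraphUpload(hGraphExec, hStream);   // optional: pay the upload cost up front
- *   cuGraphLaunch(hGraphExec, hStream);   // run the whole graph as a single unit of work
- *   cuStreamSynchronize(hStream);         // wait for this launch to complete
- * \endcode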
- *
- * \param hGraphExec - Executable graph to launch
- * \param hStream - Stream in which to launch the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Destroys an executable graph
- *
- * Destroys the executable graph specified by \p hGraphExec, as well
- * as all of its executable nodes. If the executable graph is
- * in-flight, it will not be terminated, but rather freed
- * asynchronously on completion.
- *
- * \param hGraphExec - Executable graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
-
-/**
- * \brief Destroys a graph
- *
- * Destroys the graph specified by \p hGraph, as well as all of its nodes.
- *
- * \param hGraph - Graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
-
-/**
- * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
- *
- * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
- * node parameters in a topologically identical graph specified by \p hGraph.
- *
- * Limitations:
- *
- * - Kernel nodes:
- *   - The owning context of the function cannot change.
- *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
- *     to a function which uses CDP.
- *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
- * - Memset and memcpy nodes:
- *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
- *   - The source/destination memory must be allocated from the same contexts as the original
- *     source/destination memory.
- *   - Only 1D memsets can be changed.
- * - Additional memcpy node restrictions:
- *   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
- *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
- * - External semaphore wait nodes and record nodes:
- *   - Changing the number of semaphores is not supported.
- *
- * Note: The API may add further restrictions in future releases. The return code should always be checked.
- *
- * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under
- * the following conditions:
- *
- * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
- *   the pairless node from \p hGraph.
- * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
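- *
- * A typical check-and-rebuild pattern is sketched below (the individual
- * \p updateResult_out codes are listed afterwards; error handling is elided and
- * \p hGraphExec is assumed to come from an earlier instantiation of \p hGraph):
- * \code
- *   CUgraphNode errorNode = NULL;
- *   CUgraphExecUpdateResult updateResult;
- *   CUresult res = cuGraphExecUpdate(hGraphExec, hGraph, &errorNode, &updateResult);
- *   if (res == CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE) {
- *       // The update was rejected; updateResult and errorNode describe why.
- *       // Destroy the stale executable graph and instantiate hGraph again.
- *       cuGraphExecDestroy(hGraphExec);
- *   }
- * \endcode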
- *
- * cuGraphExecUpdate sets \p updateResult_out to:
- * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
- *   \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
- *   way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
- * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
- *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
- *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
- *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
- *
- * If \p updateResult_out isn't set in one of the situations described above, the update check passes
- * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens
- * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
- * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
- *
- * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns
- * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included
- * changes which violated constraints specific to instantiated graph update.
- *
- * \param hGraphExec The instantiated graph to be updated
- * \param hGraph The graph containing the updated parameters
- * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
- * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- */
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
-
-/**
- * \brief Copies attributes from source node to destination node.
- *
- * Copies attributes from source node \p src to destination node \p dst.
- * Both nodes must have the same context.
- *
- * \param[out] dst Destination node
- * \param[in] src Source node
- * For a list of attributes see ::CUkernelNodeAttrID
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
-
-/**
- * \brief Queries node attribute.
- *
- * Queries attribute \p attr from node \p hNode and stores it in corresponding
- * member of \p value_out.
- *
- * \param[in] hNode
- * \param[in] attr
- * \param[out] value_out
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                                               CUkernelNodeAttrValue *value_out);
-
-/**
- * \brief Sets node attribute.
- * - * Sets attribute \p attr on node \p hNode from corresponding attribute of - * \p value. - * - * \param[out] hNode - * \param[in] attr - * \param[out] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, - const CUkernelNodeAttrValue *value); - -/** - * \brief Write a DOT file describing graph structure - * - * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. - * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. - * \p flags can be specified to write more detailed information about each node type such as - * parameter values, kernel attributes, node and function handles. - * - * \param hGraph - The graph to create a DOT file from - * \param path - The path to write the DOT file to - * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OPERATING_SYSTEM - */ -CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); - -/** - * \brief Create a user object - * - * Create a user object with the specified destructor callback and initial reference count. The - * initial references are owned by the caller. - * - * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they - * are executed by a shared internal thread. Another thread may be signaled to perform such - * actions, if it does not block forward progress of tasks scheduled through CUDA. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object_out - Location to return the user object handle - * \param ptr - The pointer to pass to the destroy function - * \param destroy - Callback to free the user object when it is no longer in use - * \param initialRefcount - The initial refcount to create the object with, typically 1. The - * initial references are owned by the calling thread. - * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, - * which is the only defined flag. This indicates that the destroy - * callback cannot be waited on by any CUDA API. Users requiring - * synchronization of the callback should signal its completion - * manually. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, - unsigned int initialRefcount, unsigned int flags); - -/** - * \brief Retain a reference to a user object - * - * Retains new references to a user object. The new references are owned by the caller. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to retain - * \param count - The number of references to retain, typically 1. Must be nonzero - * and not larger than INT_MAX. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); - -/** - * \brief Release a reference to a user object - * - * Releases user object references owned by the caller. The object's destructor is invoked if - * the reference count reaches zero. - * - * It is undefined behavior to release references not owned by the caller, or to use a user - * object handle after all references are released. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to release - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); - -/** - * \brief Retain a reference to a user object from a graph - * - * Creates or moves user object references that will be owned by a CUDA graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph to associate the reference with - * \param object - The user object to retain a reference for - * \param count - The number of references to add to the graph, typically 1. Must be - * nonzero and not larger than INT_MAX. - * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references - * from the calling thread, rather than create new references. Pass 0 - * to create new references. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); - -/** - * \brief Release a user object reference from a graph - * - * Releases user object references owned by a graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph that will release the reference - * \param object - The user object to release a reference for - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); - -/** @} */ /* END CUDA_GRAPH */ - -/** - * \defgroup CUDA_OCCUPANCY Occupancy - * - * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the occupancy calculation functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns occupancy of a function - * - * Returns in \p *numBlocks the number of the maximum active blocks per - * streaming multiprocessor. 
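- *
- * A minimal usage sketch (assuming \p func is a ::CUfunction handle obtained elsewhere,
- * e.g. from a loaded module):
- * \code
- *   int numBlocks = 0;
- *   int blockSize = 256;      // block size the kernel is intended to launch with
- *   size_t dynamicSMem = 0;   // no per-block dynamic shared memory in this example
- *   cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, func, blockSize, dynamicSMem);
- *   // numBlocks now holds the maximum resident blocks per SM for this configuration.
- * \endcode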
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
-
-/**
- * \brief Returns occupancy of a function
- *
- * Returns in \p *numBlocks the number of the maximum active blocks per
- * streaming multiprocessor.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platforms where global caching affects
- *   occupancy. On such platforms, if caching is enabled, but
- *   per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching
- *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
- *   the occupancy calculator return 0 in such cases. More information
- *   about this feature can be found in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param flags - Requested behavior for the occupancy calculator
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * Returns in \p *blockSize a reasonable block size that can achieve
- * the maximum occupancy (or, the maximum number of active warps with
- * the fewest blocks per multiprocessor), and in \p *minGridSize the
- * minimum grid size to achieve the maximum occupancy.
- *
- * If \p blockSizeLimit is 0, the configurator will use the maximum
- * block size permitted by the device / function instead.
- *
- * If per-block dynamic shared memory allocation is not needed, the
- * user should leave both \p blockSizeToDynamicSMemSize and \p
- * dynamicSMemSize as 0.
- *
- * If per-block dynamic shared memory allocation is needed, then if
- * the dynamic shared memory size is constant regardless of block
- * size, the size should be passed through \p dynamicSMemSize, and \p
- * blockSizeToDynamicSMemSize should be NULL.
- *
- * Otherwise, if the per-block dynamic shared memory size varies with
- * different block sizes, the user needs to provide a unary function
- * through \p blockSizeToDynamicSMemSize that computes the dynamic
- * shared memory needed by \p func for any given block size. \p
- * dynamicSMemSize is ignored. An example signature is:
- *
- * \code
- * // Take block size, returns dynamic shared memory needed
- * size_t blockToSmem(int blockSize);
- * \endcode
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
- * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
- * \param func - Kernel for which launch configuration is calculated
- * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
- * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit - The maximum block size \p func is designed to handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxPotentialBlockSize
- */
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
- * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
- * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
- * parameter.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxPotentialBlockSize;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platforms where global caching affects
- *   occupancy. On such platforms, the launch configuration that
- *   produces maximal occupancy might not support global
- *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
- *   guarantees that the produced launch configuration is compatible
- *   with global caching at a potential cost of occupancy. More information
- *   about this feature can be found in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
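- *
- * A minimal usage sketch (assuming \p func is a ::CUfunction handle and no per-block
- * dynamic shared memory is required):
- * \code
- *   int minGridSize = 0, blockSize = 0;
- *   cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, func,
- *                                             NULL,   // blockSizeToDynamicSMemSize
- *                                             0,      // dynamicSMemSize
- *                                             0,      // blockSizeLimit: no limit
- *                                             CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE);
- * \endcode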
- * - * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy - * \param blockSize - Returned maximum block size that can achieve the maximum occupancy - * \param func - Kernel for which launch configuration is calculated - * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size - * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes - * \param blockSizeLimit - The maximum block size \p func is designed to handle - * \param flags - Options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxPotentialBlockSizeWithFlags - */ -CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); - -/** - * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM - * - * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. - * - * \param dynamicSmemSize - Returned maximum dynamic shared memory - * \param func - Kernel function for which occupancy is calculated - * \param numBlocks - Number of blocks to fit on SM - * \param blockSize - Size of the blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - */ -CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); - -/** @} */ /* END CUDA_OCCUPANCY */ - -/** - * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated texture reference management functions of the - * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated texture reference management - * functions of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Binds an array as a texture reference - * - * \deprecated - * - * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to - * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is - * unbound. 
- * - * \param hTexRef - Texture reference to bind - * \param hArray - Array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Binds a mipmapped array to a texture reference - * - * \deprecated - * - * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. - * Any previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. - * Any CUDA array previously bound to \p hTexRef is unbound. - * - * \param hTexRef - Texture reference to bind - * \param hMipmappedArray - Mipmapped array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); - -/** - * \brief Binds an address as a texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Since the hardware enforces an alignment requirement on texture base - * addresses, ::cuTexRefSetAddress() passes back a byte offset in - * \p *ByteOffset that must be applied to texture fetches in order to read from - * the desired memory. This offset must be divided by the texel size and - * passed to kernels that read from the texture so they can be applied to the - * ::tex1Dfetch() function. - * - * If the device memory pointer was returned from ::cuMemAlloc(), the offset - * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. - * - * The total number of elements (or texels) in the linear address range - * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. - * The number of elements is computed as (\p bytes / bytesPerElement), - * where bytesPerElement is determined from the data format and number of - * components set using ::cuTexRefSetFormat(). 
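- *
- * A minimal usage sketch (assuming \p hTexRef was obtained elsewhere, \p dptr points to
- * linear device memory holding \p numTexels floats, and ::CU_AD_FORMAT_FLOAT is the
- * single-component float format):
- * \code
- *   size_t byteOffset = 0;
- *   cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 1);
- *   cuTexRefSetAddress(&byteOffset, hTexRef, dptr, numTexels * sizeof(float));
- *   // byteOffset, divided by the texel size, must be added to the coordinates
- *   // passed to tex1Dfetch() when reading from this texture.
- * \endcode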
- * - * \param ByteOffset - Returned byte offset - * \param hTexRef - Texture reference to bind - * \param dptr - Device pointer to bind - * \param bytes - Size of memory to bind in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); - -/** - * \brief Binds an address as a 2D texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Using a ::tex2D() function inside a kernel requires a call to either - * ::cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear - * memory. - * - * Function calls to ::cuTexRefSetFormat() cannot follow calls to - * ::cuTexRefSetAddress2D() for the same texture reference. - * - * It is required that \p dptr be aligned to the appropriate hardware-specific - * texture alignment. You can query this value using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. - * This value can be queried using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Width and Height, which are specified in elements (or texels), cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * \p Pitch, which is specified in bytes, cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * \param hTexRef - Texture reference to bind - * \param desc - Descriptor of CUDA array - * \param dptr - Device pointer to bind - * \param Pitch - Line pitch in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture2D - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - -/** - * \brief Sets the format for a texture reference - * - * \deprecated - * - * Specifies the format of the data to be read by the texture reference - * \p hTexRef. 
\p fmt and \p NumPackedComponents are exactly analogous to the - * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components per - * array element. - * - * \param hTexRef - Texture reference - * \param fmt - Format to set - * \param NumPackedComponents - Number of components per array element - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaCreateChannelDesc, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); - -/** - * \brief Sets the addressing mode for a texture reference - * - * \deprecated - * - * Specifies the addressing mode \p am for the given dimension \p dim of the - * texture reference \p hTexRef. If \p dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from the - * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined - * as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. - * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only - * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. - * - * \param hTexRef - Texture reference - * \param dim - Dimension - * \param am - Addressing mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); - -/** - * \brief Sets the filtering mode for a texture reference - * - * \deprecated - * - * Specifies the filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. 
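- *
- * A minimal configuration sketch (assuming \p hTexRef is bound to a CUDA array rather
- * than linear memory, since these calls have no effect for linear memory):
- * \code
- *   cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);   // first coordinate
- *   cuTexRefSetAddressMode(hTexRef, 1, CU_TR_ADDRESS_MODE_CLAMP);   // second coordinate
- *   cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
- * \endcode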
- * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Specifies the mipmap filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. - * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap level bias for a texture reference - * - * \deprecated - * - * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when - * reading memory through the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. - * - * \param hTexRef - Texture reference - * \param bias - Mipmap level bias - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); - -/** - * \brief Sets the mipmap min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp - * respectively, to be used when reading memory through the texture reference - * \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
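- *
- * A minimal configuration sketch for a texture reference bound to a mipmapped array
- * (assuming the array has \p numLevels levels; these calls have no effect otherwise):
- * \code
- *   cuTexRefSetMipmapFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
- *   cuTexRefSetMipmapLevelBias(hTexRef, 0.0f);
- *   cuTexRefSetMipmapLevelClamp(hTexRef, 0.0f, (float)(numLevels - 1));
- * \endcode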
- * - * \param hTexRef - Texture reference - * \param minMipmapLevelClamp - Mipmap min level clamp - * \param maxMipmapLevelClamp - Mipmap max level clamp - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); - -/** - * \brief Sets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through - * the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. - * - * \param hTexRef - Texture reference - * \param maxAniso - Maximum anisotropy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); - -/** - * \brief Sets the border color for a texture reference - * - * \deprecated - * - * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference - * \p hTexRef. The color value supports only float type and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * Note that the color values can be set only when the Address mode is set to - * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. - * Applications using integer border color values have to "reinterpret_cast" their values to float. - * - * \param hTexRef - Texture reference - * \param pBorderColor - RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); - -/** - * \brief Sets the flags for a texture reference - * - * \deprecated - * - * Specifies optional flags via \p Flags to specify the behavior of data - * returned through the texture reference \p hTexRef. The valid flags are: - * - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. 
Note that texture with 32-bit integer format - * would not be promoted, regardless of whether or not this - * flag is specified; - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the - * default behavior of having the texture coordinates range - * from [0, Dim) where Dim is the width or height of the CUDA - * array. Instead, the texture coordinates [0, 1.0) reference - * the entire breadth of the array dimension; - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. - * - * \param hTexRef - Texture reference - * \param Flags - Optional flags to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); - -/** - * \brief Gets the address associated with a texture reference - * - * \deprecated - * - * Returns in \p *pdptr the base address bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. - * - * \param pdptr - Returned device address - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); - -/** - * \brief Gets the array bound to a texture reference - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. 
- *
- * \param phArray - Returned array
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the mipmapped array bound to a texture reference
- *
- * \deprecated
- *
- * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
- * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- * is not bound to any CUDA mipmapped array.
- *
- * \param phMipmappedArray - Returned mipmapped array
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the addressing mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pam the addressing mode corresponding to the
- * dimension \p dim of the texture reference \p hTexRef. Currently, the only
- * valid values for \p dim are 0 and 1.
- *
- * \param pam - Returned addressing mode
- * \param hTexRef - Texture reference
- * \param dim - Dimension
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
-
-/**
- * \brief Gets the filter-mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pfm the filtering mode of the texture reference
- * \p hTexRef.
- * - * \param pfm - Returned filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the format used by a texture reference - * - * \deprecated - * - * Returns in \p *pFormat and \p *pNumChannels the format and number - * of components of the CUDA array bound to the texture reference \p hTexRef. - * If \p pFormat or \p pNumChannels is NULL, it will be ignored. - * - * \param pFormat - Returned format - * \param pNumChannels - Returned number of components - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); - -/** - * \brief Gets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Returns the mipmap filtering mode in \p pfm that's used when reading memory through - * the texture reference \p hTexRef. - * - * \param pfm - Returned mipmap filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the mipmap level bias for a texture reference - * - * \deprecated - * - * Returns the mipmap level bias in \p pBias that's added to the specified mipmap - * level when reading memory through the texture reference \p hTexRef. 
- * - * \param pbias - Returned mipmap level bias - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); - -/** - * \brief Gets the min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp - * that's used when reading memory through the texture reference \p hTexRef. - * - * \param pminMipmapLevelClamp - Returned mipmap min level clamp - * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); - -/** - * \brief Gets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through - * the texture reference \p hTexRef. - * - * \param pmaxAniso - Returned maximum anisotropy - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); - -/** - * \brief Gets the border color used by a texture reference - * - * \deprecated - * - * Returns in \p pBorderColor, values of the RGBA color used by - * the texture reference \p hTexRef. 
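/*
 * Editor-added sketch (not from the header): dumping the sampling state of a
 * deprecated texture reference with the getter functions documented above
 * (filter mode, format, mipmap filter/bias/clamps, maximum anisotropy).
 * Each getter simply mirrors the corresponding cuTexRefSet* call.
 */
#include <cuda.h>
#include <stdio.h>

static void dump_texref_sampling_state(CUtexref texRef)
{
    CUfilter_mode filter, mipFilter;
    CUarray_format format;
    int channels = 0, maxAniso = 0;
    float bias = 0.f, minClamp = 0.f, maxClamp = 0.f;

    if (cuTexRefGetFilterMode(&filter, texRef) == CUDA_SUCCESS)
        printf("filter mode:        %d\n", (int)filter);
    if (cuTexRefGetFormat(&format, &channels, texRef) == CUDA_SUCCESS)
        printf("format / channels:  %d / %d\n", (int)format, channels);
    if (cuTexRefGetMipmapFilterMode(&mipFilter, texRef) == CUDA_SUCCESS)
        printf("mipmap filter mode: %d\n", (int)mipFilter);
    if (cuTexRefGetMipmapLevelBias(&bias, texRef) == CUDA_SUCCESS)
        printf("mipmap level bias:  %f\n", bias);
    if (cuTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, texRef) == CUDA_SUCCESS)
        printf("mipmap clamps:      [%f, %f]\n", minClamp, maxClamp);
    if (cuTexRefGetMaxAnisotropy(&maxAniso, texRef) == CUDA_SUCCESS)
        printf("max anisotropy:     %d\n", maxAniso);
}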
- * The color value is of type float and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * \param hTexRef - Texture reference - * \param pBorderColor - Returned Type and Value of RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); - -/** - * \brief Gets the flags used by a texture reference - * - * \deprecated - * - * Returns in \p *pFlags the flags of the texture reference \p hTexRef. - * - * \param pFlags - Returned flags - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); - -/** - * \brief Creates a texture reference - * - * \deprecated - * - * Creates a texture reference and returns its handle in \p *pTexRef. Once - * created, the application must call ::cuTexRefSetArray() or - * ::cuTexRefSetAddress() to associate the reference with allocated memory. - * Other texture reference functions are used to specify the format and - * interpretation (addressing, filtering, etc.) to be used when the memory is - * read through this texture reference. - * - * \param pTexRef - Returned texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefDestroy - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); - -/** - * \brief Destroys a texture reference - * - * \deprecated - * - * Destroys the texture reference specified by \p hTexRef. - * - * \param hTexRef - Texture reference to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefCreate - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); - -/** @} */ /* END CUDA_TEXREF_DEPRECATED */ - - -/** - * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] - * - * ___MANBRIEF___ surface reference management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface reference management functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the CUDA array for a surface reference. - * - * \deprecated - * - * Sets the CUDA array \p hArray to be read and written by the surface reference - * \p hSurfRef. Any previous CUDA array state associated with the surface - * reference is superseded by this function. \p Flags must be set to 0. 
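/*
 * Editor-added sketch (not from the header): the create -> configure ->
 * destroy flow for a standalone texture reference, per the cuTexRefCreate /
 * cuTexRefDestroy documentation above. The CUDA array passed in is assumed
 * to exist already; cuTexRefSetArray, cuTexRefSetFilterMode and
 * cuTexRefSetFlags belong to the same deprecated family documented earlier
 * in this header.
 */
#include <cuda.h>

static CUresult sample_through_temp_texref(CUarray hArray)
{
    CUtexref texRef;
    CUresult err = cuTexRefCreate(&texRef);
    if (err != CUDA_SUCCESS) return err;

    err = cuTexRefSetArray(texRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
    if (err == CUDA_SUCCESS)
        err = cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
    if (err == CUDA_SUCCESS)
        err = cuTexRefSetFlags(texRef, CU_TRSF_NORMALIZED_COORDINATES);

    /* ... launch kernels that read through texRef here ... */

    cuTexRefDestroy(texRef);   /* always release the reference */
    return err;
}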
- * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. - * Any CUDA array previously bound to \p hSurfRef is unbound. - - * \param hSurfRef - Surface reference handle - * \param hArray - CUDA array handle - * \param Flags - set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuModuleGetSurfRef, - * ::cuSurfRefGetArray, - * ::cudaBindSurfaceToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Passes back the CUDA array bound to a surface reference. - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the surface reference - * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference - * is not bound to any CUDA array. - - * \param phArray - Surface reference handle - * \param hSurfRef - Surface reference handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); - -/** @} */ /* END CUDA_SURFREF_DEPRECATED */ - -/** - * \defgroup CUDA_TEXOBJECT Texture Object Management - * - * ___MANBRIEF___ texture object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the texture object management functions of the - * low-level CUDA driver application programming interface. The texture - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a texture object - * - * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes - * the data to texture from. \p pTexDesc describes how the data should be sampled. - * \p pResViewDesc is an optional argument that specifies an alternate format for - * the data described by \p pResDesc, and also describes the subresource region - * to restrict access to when texturing. \p pResViewDesc can only be specified if - * the type of resource is a CUDA array or a CUDA mipmapped array. - * - * Texture objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a texture object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * The ::CUDA_RESOURCE_DESC structure is defined as: - * \code - typedef struct CUDA_RESOURCE_DESC_st - { - CUresourcetype resType; - - union { - struct { - CUarray hArray; - } array; - struct { - CUmipmappedArray hMipmappedArray; - } mipmap; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t sizeInBytes; - } linear; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t width; - size_t height; - size_t pitchInBytes; - } pitch2D; - } res; - - unsigned int flags; - } CUDA_RESOURCE_DESC; - - * \endcode - * where: - * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. 
- * CUresourceType is defined as: - * \code - typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, - CU_RESOURCE_TYPE_LINEAR = 0x02, - CU_RESOURCE_TYPE_PITCH2D = 0x03 - } CUresourcetype; - * \endcode - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray - * must be set to a valid CUDA mipmapped array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes - * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width - * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * - ::flags must be set to zero. - * - * - * The ::CUDA_TEXTURE_DESC struct is defined as - * \code - typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; - CUfilter_mode filterMode; - unsigned int flags; - unsigned int maxAnisotropy; - CUfilter_mode mipmapFilterMode; - float mipmapLevelBias; - float minMipmapLevelClamp; - float maxMipmapLevelClamp; - } CUDA_TEXTURE_DESC; - * \endcode - * where - * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES - * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. - * - * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. 
CUfilter_mode is defined as: - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. - * - * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. Note that texture with 32-bit integer format would not be - * promoted, regardless of whether or not this flag is specified. - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior - * of having the texture coordinates range from [0, Dim) where Dim is the - * width or height of the CUDA array. Instead, the texture coordinates - * [0, 1.0) reference the entire breadth of the array dimension; Note that - * for CUDA mipmapped arrays, this flag has to be set. - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. - * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. - * This flag can only be specified if the underlying resource is a CUDA array - * or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP. - * When seamless cube map filtering is enabled, texture address modes specified - * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode - * is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP - * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is - * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed - * when sampling along the cube face borders. - * - * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be - * clamped to the range [1,16]. - * - * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. - * - * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. - * - * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. - * - * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. - * - * - * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as - * \code - typedef struct CUDA_RESOURCE_VIEW_DESC_st - { - CUresourceViewFormat format; - size_t width; - size_t height; - size_t depth; - unsigned int firstMipmapLevel; - unsigned int lastMipmapLevel; - unsigned int firstLayer; - unsigned int lastLayer; - } CUDA_RESOURCE_VIEW_DESC; - * \endcode - * where: - * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should - * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block - * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. - * with 2 or 4 channels, depending on the block compressed format. 
For ex., BC1 and BC4 require the underlying CUDA array to have - * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base - * format but with 4 channels. - * - * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the - * original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. - * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp - * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, - * then the actual minimum mipmap level clamp will be 3.2. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value - * has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. - * For non-layered resources, this value has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, - * this value has to be zero. - * - * - * \param pTexObject - Texture object to create - * \param pResDesc - Resource descriptor - * \param pTexDesc - Texture descriptor - * \param pResViewDesc - Resource view descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectDestroy, - * ::cudaCreateTextureObject - */ -CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); - -/** - * \brief Destroys a texture object - * - * Destroys the texture object specified by \p texObject. - * - * \param texObject - Texture object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaDestroyTextureObject - */ -CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); - -/** - * \brief Returns a texture object's resource descriptor - * - * Returns the resource descriptor for the texture object specified by \p texObject. 
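/*
 * Editor-added sketch (not from the header): creating a texture object over
 * linear device memory, following the CUDA_RESOURCE_DESC / CUDA_TEXTURE_DESC
 * layout spelled out above. The element format and count are arbitrary
 * illustration values; a resource view can only be specified for array
 * resources, so NULL is passed here.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_float_texobj(CUdeviceptr dptr, size_t numFloats,
                                  CUtexObject *texOut)
{
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr = dptr;                /* must meet texture alignment */
    resDesc.res.linear.format = CU_AD_FORMAT_FLOAT;
    resDesc.res.linear.numChannels = 1;
    resDesc.res.linear.sizeInBytes = numFloats * sizeof(float);

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.filterMode = CU_TR_FILTER_MODE_POINT;    /* ignored for LINEAR resources */
    texDesc.flags = 0;                               /* default: unnormalized coordinates */

    return cuTexObjectCreate(texOut, &resDesc, &texDesc, /*pResViewDesc=*/NULL);
}

/* Later, once no pending kernel uses it: cuTexObjectDestroy(tex); */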
- * - * \param pResDesc - Resource descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceDesc, - */ -CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's texture descriptor - * - * Returns the texture descriptor for the texture object specified by \p texObject. - * - * \param pTexDesc - Texture descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectTextureDesc - */ -CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's resource view descriptor - * - * Returns the resource view descriptor for the texture object specified by \p texObject. - * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param pResViewDesc - Resource view descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceViewDesc - */ -CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); - -/** @} */ /* END CUDA_TEXOBJECT */ - -/** - * \defgroup CUDA_SURFOBJECT Surface Object Management - * - * ___MANBRIEF___ surface object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface object management functions of the - * low-level CUDA driver application programming interface. The surface - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a surface object - * - * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes - * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be - * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. - * - * Surface objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a surface object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * \param pSurfObject - Surface object to create - * \param pResDesc - Resource descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectDestroy, - * ::cudaCreateSurfaceObject - */ -CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); - -/** - * \brief Destroys a surface object - * - * Destroys the surface object specified by \p surfObject. 
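/*
 * Editor-added sketch (not from the header): creating a surface object,
 * matching the cuSurfObjectCreate requirements above (array resource, array
 * created with CUDA_ARRAY3D_SURFACE_LDST, flags = 0). The 1024x1024
 * single-channel float format is an arbitrary illustration choice.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_surface(CUarray *arrOut, CUsurfObject *surfOut)
{
    CUDA_ARRAY3D_DESCRIPTOR ad;
    memset(&ad, 0, sizeof(ad));
    ad.Width = 1024;
    ad.Height = 1024;
    ad.Depth = 0;                           /* 2D array */
    ad.Format = CU_AD_FORMAT_FLOAT;
    ad.NumChannels = 1;
    ad.Flags = CUDA_ARRAY3D_SURFACE_LDST;   /* required for surface load/store */

    CUresult err = cuArray3DCreate(arrOut, &ad);
    if (err != CUDA_SUCCESS) return err;

    CUDA_RESOURCE_DESC rd;
    memset(&rd, 0, sizeof(rd));
    rd.resType = CU_RESOURCE_TYPE_ARRAY;
    rd.res.array.hArray = *arrOut;
    rd.flags = 0;                           /* must be zero */

    err = cuSurfObjectCreate(surfOut, &rd);
    if (err != CUDA_SUCCESS) cuArrayDestroy(*arrOut);
    return err;
}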
- * - * \param surfObject - Surface object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaDestroySurfaceObject - */ -CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); - -/** - * \brief Returns a surface object's resource descriptor - * - * Returns the resource descriptor for the surface object specified by \p surfObject. - * - * \param pResDesc - Resource descriptor - * \param surfObject - Surface object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaGetSurfaceObjectResourceDesc - */ -CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); - -/** @} */ /* END CUDA_SURFOBJECT */ - -/** - * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access - * - * ___MANBRIEF___ direct peer context memory access functions of the low-level - * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the direct peer context memory access functions - * of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Queries if a device may directly access a peer device's memory. - * - * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of - * directly accessing memory from contexts on \p peerDev and 0 otherwise. - * If direct access of \p peerDev from \p dev is possible, then access may be - * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). - * - * \param canAccessPeer - Returned access capability - * \param dev - Device from which allocations on \p peerDev are to - * be directly accessed. - * \param peerDev - Device on which the allocations to be directly accessed - * by \p dev reside. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceCanAccessPeer - */ -CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); - -/** - * \brief Enables direct access to memory allocations in a peer context. - * - * If both the current context and \p peerContext are on devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same - * major compute capability, then on success all allocations from \p peerContext will - * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional - * details. - * - * Note that access granted by this call is unidirectional and that in order to access - * memory from the current context in \p peerContext, a separate symmetric call - * to ::cuCtxEnablePeerAccess() is required. - * - * Note that there are both device-wide and system-wide limitations per system - * configuration, as noted in the CUDA Programming Guide under the section - * "Peer-to-Peer Memory Access". - * - * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates - * that the ::CUdevice of the current context cannot directly access memory - * from the ::CUdevice of \p peerContext. 
- * - * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of - * \p peerContext from the current context has already been enabled. - * - * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible - * because hardware resources required for peer access have been exhausted. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext - * is not a valid context, or if the current context is \p peerContext. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. - * - * \param peerContext - Peer context to enable direct access to from the current context - * \param Flags - Reserved for future use and must be set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceEnablePeerAccess - */ -CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); - -/** - * \brief Disables direct access to memory allocations in a peer context and - * unregisters any registered allocations. - * - Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has - * not yet been enabled from \p peerContext to the current context. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if - * \p peerContext is not a valid context. - * - * \param peerContext - Peer context to disable direct access to - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxEnablePeerAccess, - * ::cudaDeviceDisablePeerAccess - */ -CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); - -/** - * \brief Queries attributes of the link between two devices. - * - * Returns in \p *value the value of the requested attribute \p attrib of the - * link between \p srcDevice and \p dstDevice. The supported attributes are: - * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the - * performance of the link between two devices. - * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. - * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over - * the link are supported. - * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can - * be accessed over the link. - * - * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid - * or if they represent the same device. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is - * a null pointer. - * - * \param value - Returned value of the requested attribute - * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. - * \param srcDevice - The source device of the target link. - * \param dstDevice - The destination device of the target link. 
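/*
 * Editor-added sketch (not from the header): the usual peer-access dance
 * between two devices, combining cuDeviceCanAccessPeer, cuDeviceGetP2PAttribute
 * and cuCtxEnablePeerAccess as documented above. Access is unidirectional;
 * enabling it in the other direction needs a second symmetric call from the
 * other context. The two contexts are assumed to already exist for devA/devB.
 */
#include <cuda.h>
#include <stdio.h>

static CUresult enable_p2p(CUdevice devA, CUdevice devB,
                           CUcontext ctxA, CUcontext ctxB)
{
    int canAccess = 0;
    CUresult err = cuDeviceCanAccessPeer(&canAccess, devA, devB);
    if (err != CUDA_SUCCESS) return err;
    if (!canAccess) return CUDA_ERROR_PEER_ACCESS_UNSUPPORTED;

    int rank = 0;
    if (cuDeviceGetP2PAttribute(&rank, CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK,
                                devA, devB) == CUDA_SUCCESS)
        printf("link performance rank: %d\n", rank);

    /* From ctxA, grant access to ctxB's allocations (Flags must be 0). */
    err = cuCtxSetCurrent(ctxA);
    if (err == CUDA_SUCCESS) err = cuCtxEnablePeerAccess(ctxB, 0);
    if (err == CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) err = CUDA_SUCCESS;
    return err;
}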
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaDeviceGetP2PAttribute - */ -CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); - -/** @} */ /* END CUDA_PEER_ACCESS */ - -/** - * \defgroup CUDA_GRAPHICS Graphics Interoperability - * - * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graphics interoperability functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Unregisters a graphics resource for access by CUDA - * - * Unregisters the graphics resource \p resource so it is not accessible by - * CUDA unless registered again. - * - * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is - * returned. - * - * \param resource - Resource to unregister - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuGraphicsD3D9RegisterResource, - * ::cuGraphicsD3D10RegisterResource, - * ::cuGraphicsD3D11RegisterResource, - * ::cuGraphicsGLRegisterBuffer, - * ::cuGraphicsGLRegisterImage, - * ::cudaGraphicsUnregisterResource - */ -CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); - -/** - * \brief Get an array through which to access a subresource of a mapped graphics resource. - * - * Returns in \p *pArray an array through which the subresource of the mapped - * graphics resource \p resource which corresponds to array index \p arrayIndex - * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may - * change every time that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via an array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p arrayIndex is not a valid array index for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p mipLevel is not a valid mipmap level for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * - * \param pArray - Returned array through which a subresource of \p resource may be accessed - * \param resource - Mapped resource to access - * \param arrayIndex - Array index for array textures or cubemap face - * index as defined by ::CUarray_cubemap_face for - * cubemap textures for the subresource to access - * \param mipLevel - Mipmap level for the subresource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsSubResourceGetMappedArray - */ -CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); - -/** - * \brief Get a mipmapped array through which to access a mapped graphics resource. 
- * - * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics - * resource \p resource. The value set in \p *pMipmappedArray may change every time - * that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via a mipmapped array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * - * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsResourceGetMappedMipmappedArray - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); - -/** - * \brief Get a device pointer through which to access a mapped graphics resource. - * - * Returns in \p *pDevPtr a pointer through which the mapped graphics resource - * \p resource may be accessed. - * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. - * The value set in \p pPointer may change every time that \p resource is mapped. - * - * If \p resource is not a buffer then it cannot be accessed via a pointer and - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * * - * \param pDevPtr - Returned pointer through which \p resource may be accessed - * \param pSize - Returned size of the buffer accessible starting at \p *pPointer - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cudaGraphicsResourceGetMappedPointer - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); - -/** - * \brief Set usage flags for mapping a graphics resource - * - * Set \p flags for mapping the graphics resource \p resource. - * - * Changes to \p flags will take effect the next time \p resource is mapped. - * The \p flags argument may be any of the following: - - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this - * resource will be used. It is therefore assumed that this resource will be - * read from and written to by CUDA kernels. This is the default value. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which - * access this resource will not write to this resource. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels - * which access this resource will not read from this resource and will - * write over the entire contents of the resource, so none of the data - * previously stored in the resource will be preserved. - * - * If \p resource is presently mapped for access by CUDA then - * ::CUDA_ERROR_ALREADY_MAPPED is returned. 
- * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param resource - Registered resource to set flags for - * \param flags - Parameters for resource mapping - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsResourceSetMapFlags - */ -CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - -/** - * \brief Map graphics resources for access by CUDA - * - * Maps the \p count graphics resources in \p resources for access by CUDA. - * - * The resources in \p resources may be accessed by CUDA until they - * are unmapped. The graphics API from which \p resources were registered - * should not access any resources while they are mapped by CUDA. If an - * application does so, the results are undefined. - * - * This function provides the synchronization guarantee that any graphics calls - * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA - * work issued in \p stream begins. - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. - * - * \param count - Number of resources to map - * \param resources - Resources to map for CUDA usage - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cuGraphicsUnmapResources, - * ::cudaGraphicsMapResources - */ -CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** - * \brief Unmap graphics resources. - * - * Unmaps the \p count graphics resources in \p resources. - * - * Once unmapped, the resources in \p resources may not be accessed by CUDA - * until they are mapped again. - * - * This function provides the synchronization guarantee that any CUDA work issued - * in \p stream before ::cuGraphicsUnmapResources() will complete before any - * subsequently issued graphics work begins. - * - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. 
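/*
 * Editor-added sketch (not from the header): the map -> access -> unmap
 * pattern for a graphics resource, per cuGraphicsMapResources and friends
 * above. The resource is assumed to be a buffer already registered through
 * one of the API-specific cuGraphics*RegisterResource entry points (GL/D3D),
 * which are documented elsewhere.
 */
#include <cuda.h>
#include <stdio.h>

static CUresult read_interop_buffer(CUgraphicsResource res, CUstream stream)
{
    /* Hint that CUDA will only read from the resource (set before mapping). */
    CUresult err = cuGraphicsResourceSetMapFlags(res, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (err != CUDA_SUCCESS) return err;

    err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS) return err;

    CUdeviceptr devPtr;
    size_t size = 0;
    err = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
    if (err == CUDA_SUCCESS) {
        printf("mapped %zu bytes at 0x%llx\n", size, (unsigned long long)devPtr);
        /* ... launch kernels on `stream` that read devPtr here ... */
    }

    /* The pointer is invalid after this; the graphics API may use the buffer again. */
    cuGraphicsUnmapResources(1, &res, stream);
    return err;
}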
- * - * \param count - Number of resources to unmap - * \param resources - Resources to unmap - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsUnmapResources - */ -CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** @} */ /* END CUDA_GRAPHICS */ - -/** - * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access - * - * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the driver entry point access functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the requested driver API function pointer - * - * Returns in \p **pfn the address of the CUDA driver function for the requested - * CUDA version and flags. - * - * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 - * should be specified as 11020. For a requested driver symbol, if the specified - * CUDA version is greater than or equal to the CUDA version in which the driver symbol - * was introduced, this API will return the function pointer to the corresponding - * versioned function. - * - * The pointer returned by the API should be cast to a function pointer matching the - * requested driver function's definition in the API header file. The function pointer - * typedef can be picked up from the corresponding typedefs header file. For example, - * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. - * - * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not - * supported on the platform, no ABI compatible driver function exists for the specified - * \p cudaVersion or if the driver symbol is invalid. - * - * The requested flags can be: - * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to - * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with - * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM - * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. - * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols - * that match the requested driver symbol name except the corresponding per-thread versions. - * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all - * driver symbols that match the requested driver symbol name including the per-thread - * versions. If a per-thread version is not found, the API will return the legacy version - * of the driver function. - * - * \param symbol - The base name of the driver API function to look for. As an example, - * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and - * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. - * \param pfn - Location to return the function pointer to the requested driver function - * \param cudaVersion - The CUDA version to look for the requested driver symbol - * \param flags - Flags to specify search options. 
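/*
 * Editor-added sketch (not from the header): fetching a versioned driver
 * entry point with cuGetProcAddress as described above. The function-pointer
 * typedef below is hand-written for illustration; cudaTypedefs.h ships the
 * official typedefs. 11020 encodes CUDA 11.2 (1000 * major + 10 * minor).
 */
#include <cuda.h>
#include <stdio.h>

typedef CUresult (CUDAAPI *MemAllocFn)(CUdeviceptr *dptr, size_t bytesize);

static CUresult alloc_via_proc_address(CUdeviceptr *dptr, size_t bytes)
{
    void *fn = NULL;
    CUresult err = cuGetProcAddress("cuMemAlloc", &fn, 11020,
                                    CU_GET_PROC_ADDRESS_DEFAULT);
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemAlloc not found for CUDA 11.2 (%d)\n", (int)err);
        return err;   /* e.g. CUDA_ERROR_NOT_FOUND */
    }
    return ((MemAllocFn)fn)(dptr, bytes);   /* resolves to the _v2 variant */
}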
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_NOT_FOUND - * \note_version_mixing - * - * \sa - * ::cudaGetDriverEntryPoint - */ -CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); - -/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ - -CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); - -/** - * CUDA API versioning support - */ -#if defined(__CUDA_API_VERSION_INTERNAL) - #undef cuMemHostRegister - #undef cuGraphicsResourceSetMapFlags - #undef cuLinkCreate - #undef cuLinkAddData - #undef cuLinkAddFile - #undef cuDeviceTotalMem - #undef cuCtxCreate - #undef cuModuleGetGlobal - #undef cuMemGetInfo - #undef cuMemAlloc - #undef cuMemAllocPitch - #undef cuMemFree - #undef cuMemGetAddressRange - #undef cuMemAllocHost - #undef cuMemHostGetDevicePointer - #undef cuMemcpyHtoD - #undef cuMemcpyDtoH - #undef cuMemcpyDtoD - #undef cuMemcpyDtoA - #undef cuMemcpyAtoD - #undef cuMemcpyHtoA - #undef cuMemcpyAtoH - #undef cuMemcpyAtoA - #undef cuMemcpyHtoAAsync - #undef cuMemcpyAtoHAsync - #undef cuMemcpy2D - #undef cuMemcpy2DUnaligned - #undef cuMemcpy3D - #undef cuMemcpyHtoDAsync - #undef cuMemcpyDtoHAsync - #undef cuMemcpyDtoDAsync - #undef cuMemcpy2DAsync - #undef cuMemcpy3DAsync - #undef cuMemsetD8 - #undef cuMemsetD16 - #undef cuMemsetD32 - #undef cuMemsetD2D8 - #undef cuMemsetD2D16 - #undef cuMemsetD2D32 - #undef cuArrayCreate - #undef cuArrayGetDescriptor - #undef cuArray3DCreate - #undef cuArray3DGetDescriptor - #undef cuTexRefSetAddress - #undef cuTexRefSetAddress2D - #undef cuTexRefGetAddress - #undef cuGraphicsResourceGetMappedPointer - #undef cuCtxDestroy - #undef cuCtxPopCurrent - #undef cuCtxPushCurrent - #undef cuStreamDestroy - #undef cuEventDestroy - #undef cuMemcpy - #undef cuMemcpyAsync - #undef cuMemcpyPeer - #undef cuMemcpyPeerAsync - #undef cuMemcpy3DPeer - #undef cuMemcpy3DPeerAsync - #undef cuMemsetD8Async - #undef cuMemsetD16Async - #undef cuMemsetD32Async - #undef cuMemsetD2D8Async - #undef cuMemsetD2D16Async - #undef cuMemsetD2D32Async - #undef cuStreamGetPriority - #undef cuStreamGetFlags - #undef cuStreamGetCtx - #undef cuStreamWaitEvent - #undef cuStreamAddCallback - #undef cuStreamAttachMemAsync - #undef cuStreamQuery - #undef cuStreamSynchronize - #undef cuEventRecord - #undef cuEventRecordWithFlags - #undef cuLaunchKernel - - - - #undef cuLaunchHostFunc - #undef cuGraphicsMapResources - #undef cuGraphicsUnmapResources - #undef cuStreamWriteValue32 - #undef cuStreamWaitValue32 - #undef cuStreamWriteValue64 - #undef cuStreamWaitValue64 - #undef cuStreamBatchMemOp - #undef cuMemPrefetchAsync - #undef cuLaunchCooperativeKernel - #undef cuSignalExternalSemaphoresAsync - #undef cuWaitExternalSemaphoresAsync - #undef cuStreamBeginCapture - #undef cuStreamEndCapture - #undef cuStreamIsCapturing - #undef cuStreamGetCaptureInfo - #undef cuStreamGetCaptureInfo_v2 - #undef cuGraphUpload - #undef cuGraphLaunch - #undef cuDevicePrimaryCtxRelease - #undef cuDevicePrimaryCtxReset - #undef cuDevicePrimaryCtxSetFlags - #undef cuIpcOpenMemHandle - #undef cuStreamCopyAttributes - #undef cuStreamSetAttribute - #undef cuStreamGetAttribute - #undef cuGraphInstantiate - #undef cuMemMapArrayAsync - #undef cuMemFreeAsync - #undef cuMemAllocAsync - #undef cuMemAllocFromPoolAsync - #undef cuStreamUpdateCaptureDependencies - - CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); - CUresult CUDAAPI 
cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - - typedef unsigned int CUdeviceptr_v1; - - typedef struct CUDA_MEMCPY2D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - - unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ - unsigned int Height; /**< Height of 2D memory copy */ - } CUDA_MEMCPY2D_v1; - - typedef struct CUDA_MEMCPY3D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - unsigned int srcZ; /**< Source Z */ - unsigned int srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - unsigned int dstZ; /**< Destination Z */ - unsigned int dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ - unsigned int Height; /**< Height of 3D memory copy */ - unsigned int Depth; /**< Depth of 3D memory copy */ - } CUDA_MEMCPY3D_v1; - - typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of array */ - unsigned int Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - 
} CUDA_ARRAY_DESCRIPTOR_v1; - - typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of 3D array */ - unsigned int Height; /**< Height of 3D array */ - unsigned int Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ - } CUDA_ARRAY3D_DESCRIPTOR_v1; - - CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); - CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); - CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); - CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); - CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); - CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); - CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); - CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); - CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); - CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); - 
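/*
 * Editor-added sketch (not from the header): how a 2D copy descriptor is
 * filled in field by field. This uses the current CUDA_MEMCPY2D / cuMemcpy2D;
 * the CUDA_MEMCPY2D_v1 layout above is the same shape except that offsets,
 * pitches and sizes are 32-bit and the device pointer is CUdeviceptr_v1.
 */
#include <cuda.h>
#include <string.h>

static CUresult copy_rows_to_device(CUdeviceptr dst, size_t dstPitch,
                                    const float *src, size_t width, size_t height)
{
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));

    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = src;
    cpy.srcPitch      = width * sizeof(float);   /* tightly packed host rows */

    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = dst;
    cpy.dstPitch      = dstPitch;                /* e.g. from cuMemAllocPitch */

    cpy.WidthInBytes  = width * sizeof(float);
    cpy.Height        = height;

    return cuMemcpy2D(&cpy);
}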
CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); - CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); - CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); - CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); - - CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); - CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - - CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - CUresult CUDAAPI 
cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); - CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); - CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); - CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - - CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - - CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - CUresult CUDAAPI cuStreamQuery(CUstream hStream); - CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); - - - - CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuStreamWriteValue32(CUstream 
stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); - CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); - CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); - CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); - CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); - - CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - - CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - - CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); -#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) -static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { - const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); - if ((flags & 
procAddressMask) == 0) { - flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; - } - return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); -} -#define cuGetProcAddress cuGetProcAddress_ptsz -#endif - -#ifdef __cplusplus -} -#endif - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility pop - #endif -#endif - -#undef __CUDA_DEPRECATED - -#endif /* __cuda_cuda_h__ */ diff --git a/python/triton/third_party/cuda/lib/libdevice.10.bc b/python/triton/third_party/cuda/lib/libdevice.10.bc deleted file mode 100755 index b2c75a5026df..000000000000 Binary files a/python/triton/third_party/cuda/lib/libdevice.10.bc and /dev/null differ diff --git a/python/triton/tools/__init__.py b/python/triton/tools/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/triton/tools/aot.py b/python/triton/tools/aot.py deleted file mode 100644 index 4073423bce3f..000000000000 --- a/python/triton/tools/aot.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import sys - -import triton._C.libtriton.triton as libtriton -import triton.compiler.compiler as tc - -if __name__ == '__main__': - - # valid source and target formats - VALID_FORMATS = ['triton-ir', 'triton-gpu-ir', 'llvm-ir', 'ptx', 'amdgcn'] - - # set up the argument parser - # TODO: conditional requirements - parser = argparse.ArgumentParser() - parser.add_argument('src', help="Source file to compile") - parser.add_argument('--target', required=True, - help="Target format, one of: " + ', '.join(VALID_FORMATS)) - parser.add_argument('--sm', type=int, help="Compute capability to compile for") - parser.add_argument('--ptx-version', type=int, help="PTX version to compile for") - parser.add_argument('--gfx', type=str, help="AMDGPU target to compile for") - parser.add_argument('--triple', type=str, help="target triple, for example: amdgcn-amd-amdhsa") - parser.add_argument('--features', type=str, help="target features, for example: +sramecc,-xnack") - parser.add_argument('--num_warps', type=int, help="number of warps to compile ttgir for") - - # parse the args - args = parser.parse_args() - - # TODO: clean-up and re-use triton.compiler primitive functions - # check for validity of format arguments - if args.target not in VALID_FORMATS: - print("Invalid target format: " + args.target) - sys.exit(0) - - # parse source file to MLIR module - context = libtriton.ir.context() - module = libtriton.ir.parse_mlir_module(args.src, context) - module.context = context - - # optimizer triton-ir - module = tc.optimize_ttir(module, arch=args.sm) - if args.target == 'triton-ir': - print(module.str()) - sys.exit(0) - - if not args.num_warps: - args.num_warps = 4 - - # llvm-ir -> amdgcn - if args.target == 'amdgcn': - # auto detect available architecture and features - # if nothing detected, set with default values - arch_details = tc.get_amdgpu_arch_fulldetails() - if not arch_details: - arch_name = "" - arch_triple = "amdgcn-amd-amdhsa" - arch_features = "" - else: - arch_triple, arch_name, arch_features = arch_details - - # stop processing if architecture name is not automatically detected and is not set manually - if not args.gfx and not arch_name: - raise argparse.ArgumentError(None, "Must specify --gfx for AMDGCN compilation") - - # rewrite default and automatically detected values with manually provided data - if args.gfx: - arch_name = args.gfx - if args.triple: - arch_triple = args.triple - if args.features: - arch_features = args.features - - # triton-ir -> triton-gpu-ir - # use 
compute_capability == 80 - module = tc.ttir_to_ttgir(module, num_warps=args.num_warps) # num_stages=3, compute_capability=80) - module = tc.optimize_ttgir(module, num_stages=3, arch=80) - # triton-gpu-ir -> llvm-ir - # use compute_capability == 80 - module = tc.ttgir_to_llir(module, extern_libs=None, arch=80) - # llvm-ir -> amdgcn asm, hsaco binary - module, hsaco_path = tc.llir_to_amdgcn_and_hsaco(module, arch_name, arch_triple, arch_features) - - print(hsaco_path) - print(module) - sys.exit(0) - - if not args.sm: - raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation") - - # triton-ir -> triton-gpu-ir - module = tc.ttir_to_ttgir(module, num_warps=args.num_warps) - module = tc.optimize_ttgir(module, num_stages=3, arch=args.sm) - if args.target == 'triton-gpu-ir': - print(module.str()) - sys.exit(0) - - # triton-gpu-ir -> llvm-ir - module = tc.ttgir_to_llir(module, extern_libs=None, arch=args.sm) - if args.target == 'llvm-ir': - print(module) - sys.exit(0) - - # llvm-ir -> ptx - if args.target == 'ptx': - if not args.ptx_version: - raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation") - module = tc.llir_to_ptx(module, arch=args.sm, ptx_version=args.ptx_version) - - # llvm-ir -> amdgcn - if args.target == 'amdgcn': - if not args.gfx: - raise argparse.ArgumentError(None, "Must specify --gfx for AMDGCN compilation") - module, hsaco_path = tc.llir_to_amdgcn_and_hsaco(module, args.gfx) - - print(module) diff --git a/python/triton/tools/build_extern.py b/python/triton/tools/build_extern.py deleted file mode 100644 index f19fbd561c07..000000000000 --- a/python/triton/tools/build_extern.py +++ /dev/null @@ -1,398 +0,0 @@ -import argparse -import subprocess -from abc import ABC, abstractmethod -from typing import Dict, List, Optional - - -class Symbol: - _name: str - _op_name: str - _ret_type: str - _arg_names: List[str] - _arg_types: List[str] - - def __init__( - self, - name: str, - op_name: str, - ret_type: str, - arg_names: List[str], - arg_types: List[str], - ) -> None: - ''' - A symbol is a function declaration. 
- :param name: name of the symbol - :param op_name: name of the operation - :param ret_type: return type of the operation - :param arg_names: names of the arguments - :param arg_types: types of the arguments - ''' - self._name = name - self._op_name = op_name - self._ret_type = ret_type - self._arg_names = list(arg_names) - self._arg_types = list(arg_types) - - @property - def name(self) -> str: - return self._name - - @property - def op_name(self) -> str: - return self._op_name - - @property - def ret_type(self) -> str: - return self._ret_type - - @property - def arg_names(self) -> List[str]: - return self._arg_names - - @property - def arg_types(self) -> List[str]: - return self._arg_types - - -def convert_type(type_str) -> Optional[str]: - if type_str == "i32": - return "int32" - elif type_str == "u32": - return "uint32" - elif type_str == "i64": - return "int64" - elif type_str == "u64": - return "uint64" - elif type_str == "float": - return "fp32" - elif type_str == "double": - return "fp64" - else: - # ignore other types, such as pointer types - return None - - -def to_unsigned(type_str) -> str: - if type_str == "int32": - return "uint32" - elif type_str == "int64": - return "uint64" - else: - return type_str - - -class ExternLibrary(ABC): - _name: str - _path: str - _symbols: Dict[str, Symbol] - _format: bool - _grouping: bool - - def __init__( - self, - name: str, - path: str, - format: bool = True, - grouping: bool = True, - ) -> None: - ''' - Abstract class for extern library. - :param name: name of the library - :param path: path of the library - :param format: whether to format the generated stub file - ''' - self._name = name - self._path = path - self._symbols = {} - self._format = format - self._grouping = grouping - - @property - def name(self) -> str: - return self._name - - @property - def path(self) -> str: - return self._path - - @property - def symbols(self) -> Dict[str, Symbol]: - return self._symbols - - @property - def grouping(self) -> bool: - return self._grouping - - @abstractmethod - def parse_symbols(self, input_file) -> None: - pass - - @abstractmethod - def _output_stubs(self) -> str: - pass - - def generate_stub_file(self, output_dir) -> None: - file_str = self._output_stubs() - if file_str is None or len(file_str) == 0: - raise Exception("file_str is empty") - - output_file = f"{output_dir}/{self._name}.py" - with open(output_file, "w") as f: - f.write(file_str) - f.close() - if self._format: - subprocess.Popen(["autopep8", "-a", "-r", "-i", output_file], - stdout=subprocess.PIPE).communicate() - subprocess.Popen(["isort", output_file], stdout=subprocess.PIPE).communicate() - - -class Libdevice(ExternLibrary): - _symbol_groups: Dict[str, List[Symbol]] - - def __init__(self, path) -> None: - ''' - Constructor for Libdevice. 
- :param path: path of the libdevice library - ''' - super().__init__("libdevice", path) - self._symbol_groups = {} - self.is_pure = True - - @staticmethod - def _extract_symbol(line) -> Optional[Symbol]: - # Extract symbols from line in the following format: - # "define [internal] @(,)" - entries = line.split("@") - ret_str = entries[0] - func_str = entries[1] - # Get ret_type, skip internal symbols - ret_strs = ret_str.split() - if ret_strs[1] == "internal": - return None - ret_type = convert_type(ret_strs[1]) - if ret_type is None: - return None - # Get function name - func_strs = func_str.split("(") - func_name = func_strs[0].replace("@", "") - op_name = func_name.replace("__nv_", "") - if 'ieee' in op_name: - return None - # Get arg_types - arg_strs = func_strs[1].split(",") - arg_types = [] - arg_names = [] - for i, arg_str in enumerate(arg_strs): - arg_type = convert_type(arg_str.split()[0]) - if arg_type is None: - return None - arg_name = 'arg' + str(i) - arg_types.append(arg_type) - arg_names.append(arg_name) - if op_name == "sad": - # Special case for sad, where the last argument is an unsigned int - arg_types[-1] = to_unsigned(arg_types[-1]) - elif op_name.startswith("u"): - # LLVM does not differentiate between signed and unsigned integer type. - # We have to convert the types to unsigned - ret_type = to_unsigned(ret_type) - for i, arg_type in enumerate(arg_types): - arg_types[i] = to_unsigned(arg_type) - return Symbol(func_name, op_name, ret_type, arg_names, arg_types) - - def _group_symbols(self) -> None: - symbol_set = {} - for symbol in self._symbols.values(): - op_name = symbol.op_name - symbol_set[op_name] = symbol - - # Group functions together by renaming. - renaming = { - 'llabs': 'abs', 'acosf': 'acos', 'acoshf': 'acosh', - 'dadd_rd': 'add_rd', 'fadd_rd': 'add_rd', 'dadd_rn': 'add_rn', - 'fadd_rn': 'add_rn', 'dadd_ru': 'add_ru', 'fadd_ru': 'add_ru', - 'dadd_rz': 'add_rz', 'fadd_rz': 'add_rz', 'asinf': 'asin', - 'asinhf': 'asinh', 'atanf': 'atan', 'atan2f': 'atan2', - 'atanhf': 'atanh', 'brevll': 'brev', 'cbrtf': 'cbrt', - 'ceilf': 'ceil', 'clzll': 'clz', 'copysignf': 'copysign', - 'cosf': 'cos', 'coshf': 'cosh', 'cospif': 'cospi', - 'cyl_bessel_i0f': 'cyl_bessel_i0', 'cyl_bessel_i1f': 'cyl_bessel_i1', - 'fdiv_rd': 'div_rd', 'ddiv_rd': 'div_rd', 'fdiv_rn': 'div_rn', - 'ddiv_rn': 'div_rn', 'fdiv_ru': 'div_ru', 'ddiv_ru': 'div_ru', - 'fdiv_rz': 'div_rz', 'ddiv_rz': 'div_rz', 'erff': 'erf', - 'erfcf': 'erfc', 'erfcinvf': 'erfcinv', 'erfcxf': 'erfcx', - 'erfinvf': 'erfinv', 'expf': 'exp', 'exp10f': 'exp10', - 'exp2f': 'exp2', 'expm1f': 'expm1', 'fabsf': 'abs', - 'fabs': 'abs', 'fast_fdividef': 'fast_dividef', - 'fdimf': 'fdim', 'ffsll': 'ffs', 'floorf': 'floor', - 'fmaf': 'fma', 'fmaf_rd': 'fma_rd', 'fmaf_rn': 'fma_rn', - 'fmaf_ru': 'fma_ru', 'fmaf_rz': 'fma_rz', 'fmodf': 'fmod', - 'uhadd': 'hadd', 'hypotf': 'hypot', 'ilogbf': 'ilogb', - 'isinff': 'isinf', 'isinfd': 'isinf', 'isnanf': 'isnan', - 'isnand': 'isnan', 'j0f': 'j0', 'j1f': 'j1', 'jnf': 'jn', - 'ldexpf': 'ldexp', 'lgammaf': 'lgamma', 'llrintf': 'llrint', - 'llroundf': 'llround', 'logf': 'log', 'log10f': 'log10', - 'log1pf': 'log1p', 'log2f': 'log2', 'logbf': 'logb', - 'umax': 'max', 'llmax': 'max', 'ullmax': 'max', 'fmaxf': 'max', - 'fmax': 'max', 'umin': 'min', 'llmin': 'min', 'ullmin': 'min', - 'fminf': 'min', 'fmin': 'min', 'dmul_rd': 'mul_rd', 'fmul_rd': 'mul_rd', - 'dmul_rn': 'mul_rn', 'fmul_rn': 'mul_rn', 'dmul_ru': 'mul_ru', - 'fmul_ru': 'mul_ru', 'dmul_rz': 'mul_rz', 'fmul_rz': 'mul_rz', - 'umul24': 
'mul24', 'umulhi': 'mulhi', 'mul64hi': 'mulhi', - 'umul64hi': 'mulhi', 'nearbyintf': 'nearbyint', 'nextafterf': 'nextafter', - 'norm3df': 'norm3d', 'norm4df': 'norm4d', 'normcdff': 'normcdf', - 'normcdfinvf': 'normcdfinv', 'popcll': 'popc', 'powif': 'pow', 'powi': 'pow', - 'powf': 'pow', 'rcbrtf': 'rcbrt', 'frcp_rd': 'rcp_rd', 'drcp_rd': 'rcp_rd', - 'frcp_rn': 'rcp_rn', 'drcp_rn': 'rcp_rn', 'frcp_ru': 'rcp_ru', - 'drcp_ru': 'rcp_ru', 'frcp_rz': 'rcp_rz', 'drcp_rz': 'rcp_rz', - 'remainderf': 'remainder', 'urhadd': 'rhadd', 'rhypotf': 'rhypot', - 'rintf': 'rint', 'rnorm3df': 'rnorm3d', 'rnorm4df': 'rnorm4d', - 'roundf': 'round', 'rsqrtf': 'rsqrt', 'frsqrt_rn': 'rsqrt_rn', - 'usad': 'sad', 'scalbnf': 'scalbn', 'signbitf': 'signbit', - 'signbitd': 'signbit', 'sinf': 'sin', 'sinhf': 'sinh', - 'sinpif': 'sinpi', 'sqrtf': 'sqrt', 'fsqrt_rd': 'sqrt_rd', - 'dsqrt_rd': 'sqrt_rd', 'fsqrt_rn': 'sqrt_rn', 'dsqrt_rn': 'sqrt_rn', - 'fsqrt_ru': 'sqrt_ru', 'dsqrt_ru': 'sqrt_ru', 'fsqrt_rz': 'sqrt_rz', - 'dsqrt_rz': 'sqrt_rz', 'fsub_rd': 'sub_rd', 'dsub_rd': 'sub_rd', - 'fsub_rn': 'sub_rn', 'dsub_rn': 'sub_rn', 'fsub_ru': 'sub_ru', - 'dsub_ru': 'sub_ru', 'fsub_rz': 'sub_rz', 'dsub_rz': 'sub_rz', - 'tanf': 'tan', 'tanhf': 'tanh', 'tgammaf': 'tgamma', 'truncf': 'trunc', - 'y0f': 'y0', 'y1f': 'y1', 'ynf': 'yn' - } - - for symbol in self._symbols.values(): - op_name = symbol.op_name - if op_name in renaming: - op_name = renaming[op_name] - symbol._op_name = op_name - if op_name in self._symbol_groups: - self._symbol_groups[op_name].append(symbol) - else: - self._symbol_groups[op_name] = [symbol] - - def parse_symbols(self, input_file) -> None: - if len(self.symbols) > 0: - return - output = subprocess.check_output(["grep", "define", input_file]).decode().splitlines() - for line in output: - symbol = self._extract_symbol(line) - if symbol is None: - continue - self._symbols[symbol.name] = symbol - - self._group_symbols() - - def _output_stubs(self) -> str: - # Generate python functions in the following format: - # @extern.extern - # def (, _builder=None): - # arg_type_symbol_dict = {[arg_type]: {(symbol, ret_type)}} - # return core.extern_elementwise("libdevice", , , , _builder) - import_str = "from . 
import core\n" - import_str += "import os\n" - import_str += "import functools\n" - - header_str = "" - header_str += "@functools.lru_cache()\n" - header_str += "def libdevice_path():\n" - header_str += " import torch\n" - header_str += " third_party_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), \"..\", \"third_party\")\n" - header_str += " if torch.version.hip is None:\n" - header_str += " default = os.path.join(third_party_dir, \"cuda\", \"lib\", \"libdevice.10.bc\")\n" - header_str += " else:\n" - header_str += " default = ''\n" - header_str += " return os.getenv(\"TRITON_LIBDEVICE_PATH\", default)\n" - func_str = "" - for symbols in self._symbol_groups.values(): - func_str += "@core.extern\n" - func_name_str = f"def {symbols[0].op_name}(" - for arg_name in symbols[0].arg_names: - func_name_str += f"{arg_name}, " - func_name_str += "_builder=None):\n" - - return_str = f"\treturn core.extern_elementwise(\"{self._name}\", libdevice_path(), [" - for arg_name in symbols[0].arg_names: - return_str += f"{arg_name}, " - return_str += "], \n" - - arg_type_symbol_dict_str = "{" - for symbol in symbols: - arg_type_symbol_dict_str += "(" - for arg_type in symbol.arg_types: - arg_type_symbol_dict_str += f'core.dtype("{arg_type}"),' - ret_type = f'core.dtype("{symbol.ret_type}")' - arg_type_symbol_dict_str += "): (\"" + symbol.name + "\", " + ret_type + "),\n" - arg_type_symbol_dict_str += "}" - - return_str += arg_type_symbol_dict_str - return_str += f", is_pure={self.is_pure}" - return_str += ", _builder=_builder)\n" - - func_str += func_name_str + return_str + "\n" - file_str = import_str + header_str + func_str - - return file_str - - -class LLVMDisassembler: - _path: str - _ll_file: str - - def __init__(self, path) -> None: - ''' - Invoke llvm-dis to disassemble the given file. - :param path: path to llvm-dis - ''' - self._path = path - self._ll_file = "/tmp/extern_lib.ll" - - def disasm(self, lib_path: str) -> None: - subprocess.Popen([self._path, lib_path, "-o", self.ll_file], - stdout=subprocess.PIPE).communicate() - - @property - def ll_file(self) -> str: - return self._ll_file - - @property - def path(self) -> str: - return self._path - - -extern_libs = ["libdevice"] - - -def build( - llvm_dis_path: str, - lib_path: str, - lib_name: str, - output_dir: str, -) -> None: - ''' - Interface function to build the library file. 
- :param llvm_dis_path: path to the llvm-dis binary - :param lib_path: path to the external library file - :param lib_name: name of the library - :param output_dir: path to the output directory - ''' - if lib_name == "libdevice": - extern_lib = Libdevice(lib_path) - else: - raise Exception(f"Unknown extern library: {lib_name}") - - llvm_disassembler = LLVMDisassembler(llvm_dis_path) - llvm_disassembler.disasm(lib_path) - - extern_lib.parse_symbols(llvm_disassembler.ll_file) - extern_lib.generate_stub_file(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--llvm-dis", dest="llvm_dis_path", help="Path to llvm-dis", default="llvm-dis") - parser.add_argument("--lib-path", dest="lib_path", help="Path to the extern library") - parser.add_argument("--lib-name", dest="lib_name", help="Name of the extern library") - parser.add_argument("--output", dest="output_dir", help="Output file path", default="/tmp/") - args = parser.parse_args() - - build(args.llvm_dis_path, args.lib_path, args.lib_name, args.output_dir) diff --git a/python/triton/tools/disasm.py b/python/triton/tools/disasm.py deleted file mode 100644 index 24a0787c5c16..000000000000 --- a/python/triton/tools/disasm.py +++ /dev/null @@ -1,122 +0,0 @@ -# MIT License - -# Copyright (c) 2020 Da Yan @ HKUST - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import re -import subprocess - -FLINE_RE = re.compile(r'\s*/\*\w{4}\*/\s*([^;]*;)\s*/\* 0x(\w{16}) \*/\s*') -SLINE_RE = re.compile(r'\s*/\* 0x(\w{16}) \*/\s*') -FNAME_RE = re.compile(r'\s*Function : (\w+)\s*') -BRA_RE = re.compile(r'(.*BRA(?:\.U)? 
)(0x\w+);') - - -def parseCtrl(sline): - enc = int(SLINE_RE.match(sline).group(1), 16) - stall = (enc >> 41) & 0xf - yld = (enc >> 45) & 0x1 - wrtdb = (enc >> 46) & 0x7 - readb = (enc >> 49) & 0x7 - watdb = (enc >> 52) & 0x3f - - yld_str = 'Y' if yld == 0 else '-' - wrtdb_str = '-' if wrtdb == 7 else str(wrtdb) - readb_str = '-' if readb == 7 else str(readb) - watdb_str = '--' if watdb == 0 else f'{watdb:02d}' - return f'{watdb_str}:{readb_str}:{wrtdb_str}:{yld_str}:{stall:x}' - - -def processSassLines(fline, sline, labels): - asm = FLINE_RE.match(fline).group(1) - # Remove tailing space - if asm.endswith(" ;"): - asm = asm[:-2] + ";" - ctrl = parseCtrl(sline) - # BRA target address - if BRA_RE.match(asm) is not None: - target = int(BRA_RE.match(asm).group(2), 16) - if target in labels: - pass - else: - labels[target] = len(labels) - return (f'{ctrl}', f'{asm}') - - -def extract(file_path, fun): - if fun is None: - sass_str = subprocess.check_output(["cuobjdump", "-sass", file_path]) - else: - sass_str = subprocess.check_output(["cuobjdump", "-fun", fun, "-sass", file_path]) - sass_lines = sass_str.splitlines() - line_idx = 0 - while line_idx < len(sass_lines): - line = sass_lines[line_idx].decode() - # format: - # function : - # .headerflags: ... - # /*0000*/ asmstr /*0x...*/ - # /*0x...*/ - - # Looking for new function header (function: ) - while FNAME_RE.match(line) is None: - line_idx += 1 - if line_idx < len(sass_lines): - line = sass_lines[line_idx].decode() - else: - return - - fname = FNAME_RE.match(line).group(1) - ret = '' - ret += f'Function:{fname}\n' - line_idx += 2 # bypass .headerflags - line = sass_lines[line_idx].decode() - # Remapping address to label - labels = {} # address -> label_idx - # store sass asm in buffer and them print them (for labels) - # (ctrl, asm) - asm_buffer = [] - while FLINE_RE.match(line) is not None: - # First line (Offset ASM Encoding) - fline = sass_lines[line_idx].decode() - line_idx += 1 - # Second line (Encoding) - sline = sass_lines[line_idx].decode() - line_idx += 1 - asm_buffer.append(processSassLines(fline, sline, labels)) - # peek the next line - line = sass_lines[line_idx].decode() - # Print sass - # label naming convention: LBB#i - for idx, (ctrl, asm) in enumerate(asm_buffer): - # Print label if this is BRA target - offset = idx * 16 - if offset in labels: - label_name = f'LBB{labels[offset]}' - ret += f'{label_name}:\n' - ret += ctrl + '\t' - # if this is BRA, remap offset to label - if BRA_RE.match(asm): - target = int(BRA_RE.match(asm).group(2), 16) - target_name = f'LBB{labels[target]}' - asm = BRA_RE.sub(rf'\1{target_name};', asm) - ret += asm + '\n' - ret += '\n' - return ret diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py deleted file mode 100644 index 3463ddf1ced1..000000000000 --- a/python/tutorials/01-vector-add.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Vector Addition -=============== - -In this tutorial, you will write a simple vector addition using Triton. - -In doing so, you will learn about: - -* The basic programming model of Triton. - -* The `triton.jit` decorator, which is used to define Triton kernels. - -* The best practices for validating and benchmarking your custom ops against native reference implementations. - -""" - -# %% -# Compute Kernel -# -------------- - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, # *Pointer* to first input vector. - y_ptr, # *Pointer* to second input vector. 
- output_ptr, # *Pointer* to output vector. - n_elements, # Size of the vector. - BLOCK_SIZE: tl.constexpr, # Number of elements each program should process. - # NOTE: `constexpr` so it can be used as a shape value. -): - # There are multiple 'programs' processing different data. We identify which program - # we are here: - pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0. - # This program will process inputs that are offset from the initial data. - # For instance, if you had a vector of length 256 and block_size of 64, the programs - # would each access the elements [0:64, 64:128, 128:192, 192:256]. - # Note that offsets is a list of pointers: - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - # Create a mask to guard memory operations against out-of-bounds accesses. - mask = offsets < n_elements - # Load x and y from DRAM, masking out any extra elements in case the input is not a - # multiple of the block size. - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - # Write x + y back to DRAM. - tl.store(output_ptr + offsets, output, mask=mask) - - -# %% -# Let's also declare a helper function to (1) allocate the `z` tensor -# and (2) enqueue the above kernel with appropriate grid/block sizes: - - -def add(x: torch.Tensor, y: torch.Tensor): - # We need to preallocate the output. - output = torch.empty_like(x) - assert x.is_cuda and y.is_cuda and output.is_cuda - n_elements = output.numel() - # The SPMD launch grid denotes the number of kernel instances that run in parallel. - # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]. - # In this case, we use a 1D grid where the size is the number of blocks: - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - # NOTE: - # - Each torch.tensor object is implicitly converted into a pointer to its first element. - # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel. - # - Don't forget to pass meta-parameters as keywords arguments. - add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still - # running asynchronously at this point. - return output - - -# %% -# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness: - -torch.manual_seed(0) -size = 98432 -x = torch.rand(size, device='cuda') -y = torch.rand(size, device='cuda') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) - -# %% -# Seems like we're good to go! - -# %% -# Benchmark -# --------- -# -# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. -# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops. -# for different problem sizes. - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['size'], # Argument names to use as an x-axis for the plot. - x_vals=[ - 2 ** i for i in range(12, 28, 1) - ], # Different possible values for `x_name`. - x_log=True, # x axis is logarithmic. - line_arg='provider', # Argument name whose value corresponds to a different line in the plot. 
- line_vals=['triton', 'torch'], # Possible values for `line_arg`. - line_names=['Triton', 'Torch'], # Label name for the lines. - styles=[('blue', '-'), ('green', '-')], # Line styles. - ylabel='GB/s', # Label name for the y-axis. - plot_name='vector-add-performance', # Name for the plot. Used also as a file name for saving the plot. - args={}, # Values for function arguments not in `x_names` and `y_name`. - ) -) -def benchmark(size, provider): - x = torch.rand(size, device='cuda', dtype=torch.float32) - y = torch.rand(size, device='cuda', dtype=torch.float32) - quantiles = [0.5, 0.2, 0.8] - if provider == 'torch': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y, quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y), quantiles=quantiles) - gbps = lambda ms: 12 * size / ms * 1e-6 - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -# %% -# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or -# `save_path='/path/to/results/' to save them to disk along with raw CSV data: -benchmark.run(print_data=True, show_plots=True) diff --git a/python/tutorials/02-fused-softmax.py b/python/tutorials/02-fused-softmax.py deleted file mode 100644 index 13383cc1c783..000000000000 --- a/python/tutorials/02-fused-softmax.py +++ /dev/null @@ -1,200 +0,0 @@ -""" -Fused Softmax -============= - -In this tutorial, you will write a fused softmax operation that is significantly faster -than PyTorch's native op for a particular class of matrices: those whose rows can fit in -the GPU's SRAM. - -In doing so, you will learn about: - -* The benefits of kernel fusion for bandwidth-bound operations. - -* Reduction operators in Triton. - -""" - -# %% -# Motivations -# ----------- -# -# Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice. -# Let us consider instead the case of a simple (numerically stabilized) softmax operation: - -import torch - -import triton -import triton.language as tl - - -@torch.jit.script -def naive_softmax(x): - """Compute row-wise softmax of X using native pytorch - - We subtract the maximum element in order to avoid overflows. Softmax is invariant to - this shift. - """ - # read MN elements ; write M elements - x_max = x.max(dim=1)[0] - # read MN + M elements ; write MN elements - z = x - x_max[:, None] - # read MN elements ; write MN elements - numerator = torch.exp(z) - # read MN elements ; write M elements - denominator = numerator.sum(dim=1) - # read MN + M elements ; write MN elements - ret = numerator / denominator[:, None] - # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements - return ret - - -# %% -# When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}` -# requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements. -# This is obviously wasteful; we'd prefer to have a custom "fused" kernel that only reads -# X once and does all the necessary computations on-chip. -# Doing so would require reading and writing back only :math:`MN` bytes, so we could -# expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`). -# The `torch.jit.script` flags aims to perform this kind of "kernel fusion" automatically -# but, as we will see later, it is still far from ideal. 
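The arithmetic behind the ~4x estimate above can be sanity-checked with a short, self-contained sketch. It is not tied to any Triton API; it simply tallies bytes, assuming fp32 elements (4 bytes each) and an illustrative 4096 x 1024 matrix, and the helper name `softmax_dram_traffic` is introduced here purely for illustration:

def softmax_dram_traffic(M: int, N: int, bytes_per_elem: int = 4):
    # Naive softmax: reads 5*M*N + 2*M elements and writes 3*M*N + 2*M elements,
    # i.e. 8*M*N + 4*M elements of DRAM traffic in total.
    naive_bytes = (8 * M * N + 4 * M) * bytes_per_elem
    # Fused kernel: reads each input element once and writes each output element once.
    fused_bytes = 2 * M * N * bytes_per_elem
    return naive_bytes, fused_bytes, naive_bytes / fused_bytes

naive, fused, speedup = softmax_dram_traffic(4096, 1024)
# Prints roughly 4.00, i.e. the (8MN + 4M) / 2MN ratio quoted in the text above.
print(f"naive: {naive} B, fused: {fused} B, expected speed-up ~{speedup:.2f}x")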
- -# %% -# Compute Kernel -# -------------- -# -# Our softmax kernel works as follows: each program loads a row of the input matrix X, -# normalizes it and writes back the result to the output Y. -# -# Note that one important limitation of Triton is that each block must have a -# power-of-two number of elements, so we need to internally "pad" each row and guard the -# memory operations properly if we want to handle any possible input shapes: - - -@triton.jit -def softmax_kernel( - output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, - BLOCK_SIZE: tl.constexpr -): - # The rows of the softmax are independent, so we parallelize across those - row_idx = tl.program_id(0) - # The stride represents how much we need to increase the pointer to advance 1 row - row_start_ptr = input_ptr + row_idx * input_row_stride - # The block size is the next power of two greater than n_cols, so we can fit each - # row in a single block - col_offsets = tl.arange(0, BLOCK_SIZE) - input_ptrs = row_start_ptr + col_offsets - # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols - row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf')) - # Subtract maximum for numerical stability - row_minus_max = row - tl.max(row, axis=0) - # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) - numerator = tl.exp(row_minus_max) - denominator = tl.sum(numerator, axis=0) - softmax_output = numerator / denominator - # Write back output to DRAM - output_row_start_ptr = output_ptr + row_idx * output_row_stride - output_ptrs = output_row_start_ptr + col_offsets - tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols) - - -# %% -# We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor. - - -def softmax(x): - n_rows, n_cols = x.shape - # The block size is the smallest power of two greater than the number of columns in `x` - BLOCK_SIZE = triton.next_power_of_2(n_cols) - # Another trick we can use is to ask the compiler to use more threads per row by - # increasing the number of warps (`num_warps`) over which each row is distributed. - # You will see in the next tutorial how to auto-tune this value in a more natural - # way so you don't have to come up with manual heuristics yourself. - num_warps = 4 - if BLOCK_SIZE >= 2048: - num_warps = 8 - if BLOCK_SIZE >= 4096: - num_warps = 16 - # Allocate output - y = torch.empty_like(x) - # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row o - # f the input matrix - softmax_kernel[(n_rows,)]( - y, - x, - x.stride(0), - y.stride(0), - n_cols, - num_warps=num_warps, - BLOCK_SIZE=BLOCK_SIZE, - ) - return y - - -# %% -# Unit Test -# --------- - -# %% -# We make sure that we test our kernel on a matrix with an irregular number of rows and columns. -# This will allow us to verify that our padding mechanism works. - -torch.manual_seed(0) -x = torch.randn(1823, 781, device='cuda') -y_triton = softmax(x) -y_torch = torch.softmax(x, axis=1) -assert torch.allclose(y_triton, y_torch), (y_triton, y_torch) - -# %% -# As expected, the results are identical. - -# %% -# Benchmark -# --------- -# -# Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows. -# We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above. 
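Before the benchmark code directly below, it is worth seeing how a measured runtime translates into the GB/s figures that get plotted. A minimal sketch, assuming fp32 data and counting one read plus one write of the full matrix (the same accounting the benchmark below uses), might look as follows; the helper name is hypothetical:

def effective_bandwidth_gbps(M: int, N: int, ms: float, bytes_per_elem: int = 4) -> float:
    # Total DRAM traffic of the fused kernel: M*N elements read + M*N elements written.
    total_bytes = 2 * M * N * bytes_per_elem
    # Convert bytes per millisecond to GB/s: 1e-9 GB per byte, 1e-3 s per ms.
    return total_bytes * 1e-9 / (ms * 1e-3)

# Example: a 4096 x 1024 fp32 matrix processed in 0.05 ms corresponds to ~671 GB/s.
print(effective_bandwidth_gbps(4096, 1024, 0.05))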
- - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['N'], # argument names to use as an x-axis for the plot - x_vals=[ - 128 * i for i in range(2, 100) - ], # different possible values for `x_name` - line_arg='provider', # argument name whose value corresponds to a different line in the plot - line_vals=[ - 'triton', - 'torch-native', - 'torch-jit', - ], # possible values for `line_arg`` - line_names=[ - "Triton", - "Torch (native)", - "Torch (jit)", - ], # label name for the lines - styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles - ylabel="GB/s", # label name for the y-axis - plot_name="softmax-performance", # name for the plot. Used also as a file name for saving the plot. - args={'M': 4096}, # values for function arguments not in `x_names` and `y_name` - ) -) -def benchmark(M, N, provider): - x = torch.randn(M, N, device='cuda', dtype=torch.float32) - quantiles = [0.5, 0.2, 0.8] - if provider == 'torch-native': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1), quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x), quantiles=quantiles) - if provider == 'torch-jit': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x), quantiles=quantiles) - gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3) - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -benchmark.run(show_plots=True, print_data=True) - -# %% -# In the above plot, we can see that: -# - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here. -# - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. -# Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape. diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py deleted file mode 100644 index 8bcae2007abd..000000000000 --- a/python/tutorials/03-matrix-multiplication.py +++ /dev/null @@ -1,350 +0,0 @@ -""" -Matrix Multiplication -===================== -In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves -performance on parallel with cuBLAS. - -You will specifically learn about: - -* Block-level matrix multiplications. - -* Multi-dimensional pointer arithmetics. - -* Program re-ordering for improved L2 cache hit rate. - -* Automatic performance tuning. - -""" - -# %% -# Motivations -# ----------- -# -# Matrix multiplications are a key building block of most modern high-performance computing systems. -# They are notoriously hard to optimize, hence their implementation is generally done by -# hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS). -# Unfortunately, these libraries are often proprietary and cannot be easily customized -# to accommodate the needs of modern deep learning workloads (e.g., fused activation functions). -# In this tutorial, you will learn how to implement efficient matrix multiplications by -# yourself with Triton, in a way that is easy to customize and extend. -# -# Roughly speaking, the kernel that we will write will implement the following blocked -# algorithm to multiply a (M, K) by a (K, N) matrix: -# -# .. 
code-block:: python -# -# # Do in parallel -# for m in range(0, M, BLOCK_SIZE_M): -# # Do in parallel -# for n in range(0, N, BLOCK_SIZE_N): -# acc = zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=float32) -# for k in range(0, K, BLOCK_SIZE_K): -# a = A[m : m+BLOCK_SIZE_M, k : k+BLOCK_SIZE_K] -# b = B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N] -# acc += dot(a, b) -# C[m : m+BLOCK_SIZE_M, n : n+BLOCK_SIZE_N] = acc -# -# where each iteration of the doubly-nested for-loop is performed by a dedicated Triton program instance. - -# %% -# Compute Kernel -# -------------- -# -# The above algorithm is, actually, fairly straightforward to implement in Triton. -# The main difficulty comes from the computation of the memory locations at which blocks -# of :code:`A` and :code:`B` must be read in the inner loop. For that, we need -# multi-dimensional pointer arithmetics. -# -# Pointer Arithmetics -# ~~~~~~~~~~~~~~~~~~~ -# -# For a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given b -# y :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`. -# Therefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and -# :code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as: -# -# .. code-block:: python -# -# &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] = a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1); -# &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] = b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1); -# -# Which means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as the following -# code. Also note that we need an extra modulo to handle the case where :code:`M` is not a multiple of -# :code:`BLOCK_SIZE_M` or :code:`N` is not a multiple of :code:`BLOCK_SIZE_N`, in which case we can pad the data with -# some useless values, which will not contribute to the results. For the :code:`K` dimension, we will handle that later -# using masking load semantics. -# -# .. code-block:: python -# -# offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M -# offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N -# offs_k = tl.arange(0, BLOCK_SIZE_K) -# a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak) -# b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn) -# -# And then updated in the inner loop as follows: -# -# .. code-block:: python -# -# a_ptrs += BLOCK_SIZE_K * stride_ak; -# b_ptrs += BLOCK_SIZE_K * stride_bk; -# -# -# L2 Cache Optimizations -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]` -# block of :code:`C`. -# It is important to remember that the order in which these blocks are computed does -# matter, since it affects the L2 cache hit rate of our program. and unfortunately, a -# a simple row-major ordering -# -# .. code-block:: Python -# -# pid = triton.program_id(0); -# grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M; -# grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N; -# pid_m = pid / grid_n; -# pid_n = pid % grid_n; -# -# is just not going to cut it. -# -# One possible solution is to launch blocks in an order that promotes data reuse. -# This can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before -# switching to the next column: -# -# .. 
code-block:: python -# -# # Program ID -# pid = tl.program_id(axis=0) -# # Number of program ids along the M axis -# num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) -# # Number of programs ids along the N axis -# num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) -# # Number of programs in group -# num_pid_in_group = GROUP_SIZE_M * num_pid_n -# # Id of the group this program is in -# group_id = pid // num_pid_in_group -# # Row-id of the first program in the group -# first_pid_m = group_id * GROUP_SIZE_M -# # If `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller -# group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) -# # *Within groups*, programs are ordered in a column-major order -# # Row-id of the program in the *launch grid* -# pid_m = first_pid_m + (pid % group_size_m) -# # Col-id of the program in the *launch grid* -# pid_n = (pid % num_pid_in_group) // group_size_m -# -# For example, in the following matmul where each matrix is 9 blocks by 9 blocks, -# we can see that if we compute the output in row-major ordering, we need to load 90 -# blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped -# ordering, we only need to load 54 blocks. -# -# .. image:: grouped_vs_row_major_ordering.png -# -# In practice, this can improve the performance of our matrix multiplication kernel by -# more than 10\% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100). -# - -# %% -# Final Result -# ------------ - -import torch - -import triton -import triton.language as tl - - -# `triton.jit`'ed functions can be auto-tuned by using the `triton.autotune` decorator, which consumes: -# - A list of `triton.Config` objects that define different configurations of -# meta-parameters (e.g., `BLOCK_SIZE_M`) and compilation options (e.g., `num_warps`) to try -# - An auto-tuning *key* whose change in values will trigger evaluation of all the -# provided configs -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - ], - key=['M', 'N', 'K'], -) -@triton.jit -def matmul_kernel( - # Pointers to matrices - a_ptr, b_ptr, c_ptr, - # Matrix dimensions - M, N, K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). 
- stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - ACTIVATION: tl.constexpr, -): - """Kernel for computing the matmul C = A x B. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - # See above `L2 Cache Optimizations` section for details. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction - # and accumulate - # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers - # See above `Pointer Arithmetics` section for details - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop. - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. - # If it is out of bounds, set it to 0. - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the ptrs to the next K block. - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += BLOCK_SIZE_K * stride_bk - # You can fuse arbitrary activation functions here - # while the accumulator is still in FP32! - if ACTIVATION == "leaky_relu": - accumulator = leaky_relu(accumulator) - c = accumulator.to(tl.float16) - - # ----------------------------------------------------------- - # Write back the block of the output matrix C with masks. - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) - - -# We can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`. 
-@triton.jit -def leaky_relu(x): - x = x + 1 - return tl.where(x >= 0, x, 0.01 * x) - - -# %% -# We can now create a convenience wrapper function that only takes two input tensors, -# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel. - - -def matmul(a, b, activation=""): - # Check constraints. - assert a.shape[1] == b.shape[0], "Incompatible dimensions" - assert a.is_contiguous(), "Matrix A must be contiguous" - assert b.is_contiguous(), "Matrix B must be contiguous" - M, K = a.shape - K, N = b.shape - # Allocates output. - c = torch.empty((M, N), device=a.device, dtype=a.dtype) - # 1D launch kernel where each block gets its own program. - grid = lambda META: ( - triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), - ) - matmul_kernel[grid]( - a, b, c, - M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - ACTIVATION=activation - ) - return c - - -# %% -# Unit Test -# --------- -# -# We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS). - -torch.manual_seed(0) -a = torch.randn((512, 512), device='cuda', dtype=torch.float16) -b = torch.randn((512, 512), device='cuda', dtype=torch.float16) -triton_output = matmul(a, b) -torch_output = torch.matmul(a, b) -print(f"triton_output={triton_output}") -print(f"torch_output={torch_output}") -if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0): - print("✅ Triton and Torch match") -else: - print("❌ Triton and Torch differ") - -# %% -# Benchmark -# --------- -# -# Square Matrix Performance -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, -# but feel free to arrange this script as you wish to benchmark any other matrix shape. - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['M', 'N', 'K'], # Argument names to use as an x-axis for the plot - x_vals=[ - 128 * i for i in range(2, 33) - ], # Different possible values for `x_name` - line_arg='provider', # Argument name whose value corresponds to a different line in the plot - # Possible values for `line_arg` - line_vals=['cublas', 'triton'], - # Label name for the lines - line_names=["cuBLAS", "Triton"], - # Line styles - styles=[('green', '-'), ('blue', '-')], - ylabel="TFLOPS", # Label name for the y-axis - plot_name="matmul-performance", # Name for the plot, used also as a file name for saving the plot. - args={}, - ) -) -def benchmark(M, N, K, provider): - a = torch.randn((M, K), device='cuda', dtype=torch.float16) - b = torch.randn((K, N), device='cuda', dtype=torch.float16) - quantiles = [0.5, 0.2, 0.8] - if provider == 'cublas': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b), quantiles=quantiles) - perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3) - return perf(ms), perf(max_ms), perf(min_ms) - - -benchmark.run(show_plots=True, print_data=True) diff --git a/python/tutorials/04-low-memory-dropout.py b/python/tutorials/04-low-memory-dropout.py deleted file mode 100644 index 3c4d217e22b0..000000000000 --- a/python/tutorials/04-low-memory-dropout.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Low-Memory Dropout -================== - -In this tutorial, you will write a memory-efficient implementation of dropout whose state -will be composed of a single int32 seed. 
This differs from more traditional implementations of dropout, -whose state is generally composed of a bit mask tensor of the same shape as the input. - -In doing so, you will learn about: - -* The limitations of naive implementations of Dropout with PyTorch. - -* Parallel pseudo-random number generation in Triton. - -""" - -# %% -# Baseline -# -------- -# -# The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance -# of deep neural networks in low-data regime (i.e. regularization). -# -# It takes a vector as input and produces a vector of the same shape as output. Each scalar in the -# output has a probability :math:`p` of being changed to zero and otherwise it is copied from the input. -# This forces the network to perform well even when only :math:`1 - p` scalars from the input are available. -# -# At evaluation time we want to use the full power of the network so we set :math:`p=0`. Naively this would -# increase the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease -# in the output softmax temperature). To prevent this we multiply the output by :math:`\frac{1}{1 - p}`, which -# keeps the norm consistent regardless of the dropout probability. -# -# Let's first take a look at the baseline implementation. - - -import tabulate -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _dropout( - x_ptr, # pointer to the input - x_keep_ptr, # pointer to a mask of 0s and 1s - output_ptr, # pointer to the output - n_elements, # number of elements in the `x` tensor - p, # probability that an element of `x` is changed to zero - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - # Load data - x = tl.load(x_ptr + offsets, mask=mask) - x_keep = tl.load(x_keep_ptr + offsets, mask=mask) - # The line below is the crucial part, described in the paragraph above! - output = tl.where(x_keep, x / (1 - p), 0.0) - # Write-back output - tl.store(output_ptr + offsets, output, mask=mask) - - -def dropout(x, x_keep, p): - output = torch.empty_like(x) - assert x.is_contiguous() - n_elements = x.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024) - return output - - -# Input tensor -x = torch.randn(size=(10,)).cuda() -# Dropout mask -p = 0.5 -x_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda() -# -output = dropout(x, x_keep=x_keep, p=p) -print(tabulate.tabulate([ - ["input"] + x.tolist(), - ["keep mask"] + x_keep.tolist(), - ["output"] + output.tolist() -])) - -# %% -# Seeded dropout -# -------------- -# -# The above implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly -# we need to store the dropout mask for backpropagation. Secondly, dropout state management can get -# very tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in -# https://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation -# that (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management -# of persisting randomness across multiple invocations of the kernel. -# -# Pseudo-random number generation in Triton is simple! 
In this tutorial we will use the -# :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` -# values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides -# other :ref:`random number generation strategies `. -# -# .. note:: -# Triton's implementation of PRNG is based on the Philox algorithm (described in [SALMON2011]_). -# -# Let's put it all together. - - -@triton.jit -def _seeded_dropout( - x_ptr, - output_ptr, - n_elements, - p, - seed, - BLOCK_SIZE: tl.constexpr, -): - # compute memory offsets of elements handled by this instance - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - # load data from x - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - # randomly prune it - random = tl.rand(seed, offsets) - x_keep = random > p - # write-back - output = tl.where(x_keep, x / (1 - p), 0.0) - tl.store(output_ptr + offsets, output, mask=mask) - - -def seeded_dropout(x, p, seed): - output = torch.empty_like(x) - assert x.is_contiguous() - n_elements = x.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024) - return output - - -x = torch.randn(size=(10,)).cuda() -# Compare this to the baseline - dropout mask is never instantiated! -output = seeded_dropout(x, p=0.5, seed=123) -output2 = seeded_dropout(x, p=0.5, seed=123) -output3 = seeded_dropout(x, p=0.5, seed=512) - -print(tabulate.tabulate([ - ["input"] + x.tolist(), - ["output (seed = 123)"] + output.tolist(), - ["output (seed = 123)"] + output2.tolist(), - ["output (seed = 512)"] + output3.tolist() -])) - -# %% -# Et Voilà! We have a Triton kernel that applies the same dropout mask provided the seed is the same! -# If you'd like to explore further applications of pseudorandomness in GPU programming, we encourage you -# to explore the `triton/language/random` folder! - -# %% -# Exercises -# --------- -# -# 1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row. -# 2. Add support for striding. -# 3. (challenge) Implement a kernel for the sparse Johnson-Lindenstrauss transform which generates the projection matrix on the fly each time using a seed. - -# %% -# References -# ---------- -# -# .. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, "Parallel Random Numbers: As Easy as 1, 2, 3", 2011 -# .. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014 diff --git a/python/tutorials/05-layer-norm.py b/python/tutorials/05-layer-norm.py deleted file mode 100644 index 1737e7e36345..000000000000 --- a/python/tutorials/05-layer-norm.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -Layer Normalization -==================== -In this tutorial, you will write a high-performance layer normalization -kernel that runs faster than the PyTorch implementation. - -In doing so, you will learn about: - -* Implementing backward pass in Triton. - -* Implementing parallel reduction in Triton. - -""" - -# %% -# Motivations -# ----------- -# -# The *LayerNorm* operator was first introduced in [BA2016]_ as a way to improve the performance -# of sequential models (e.g., Transformers) or neural networks with small batch size.
-# It takes a vector :math:`x` as input and produces a vector :math:`y` of the same shape as output. -# The normalization is performed by subtracting the mean and dividing by the standard deviation of :math:`x`. -# After the normalization, a learnable linear transformation with weights :math:`w` and biases :math:`b` is applied. -# The forward pass can be expressed as follows: -# -# .. math:: -# y = \frac{ x - \text{E}[x] }{ \sqrt{\text{Var}(x) + \epsilon} } * w + b -# -# where :math:`\epsilon` is a small constant added to the denominator for numerical stability. -# Let’s first take a look at the forward pass implementation. - -import torch - -import triton -import triton.language as tl - -try: - # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it - # should not be added to extras_require in setup.py. - import apex - HAS_APEX = True -except ModuleNotFoundError: - HAS_APEX = False - - -@triton.jit -def _layer_norm_fwd_fused( - X, # pointer to the input - Y, # pointer to the output - W, # pointer to the weights - B, # pointer to the biases - Mean, # pointer to the mean - Rstd, # pointer to the 1/std - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X - eps, # epsilon to avoid division by zero - BLOCK_SIZE: tl.constexpr, -): - # Map the program id to the row of X and Y it should compute. - row = tl.program_id(0) - Y += row * stride - X += row * stride - # Compute mean - mean = 0 - _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - a = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) - _mean += a - mean = tl.sum(_mean, axis=0) / N - # Compute variance - _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) - x = tl.where(cols < N, x - mean, 0.) - _var += x * x - var = tl.sum(_var, axis=0) / N - rstd = 1 / tl.sqrt(var + eps) - # Write mean / rstd - tl.store(Mean + row, mean) - tl.store(Rstd + row, rstd) - # Normalize and apply linear transformation - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - mask = cols < N - w = tl.load(W + cols, mask=mask) - b = tl.load(B + cols, mask=mask) - x = tl.load(X + cols, mask=mask, other=0.).to(tl.float32) - x_hat = (x - mean) * rstd - y = x_hat * w + b - # Write output - tl.store(Y + cols, y, mask=mask) - - -# %% -# Backward pass -# ------------- -# -# The backward pass for the layer normalization operator is a bit more involved than the forward pass. -# Let :math:`\hat{x}` be the normalized inputs :math:`\frac{ x - \text{E}[x] }{ \sqrt{\text{Var}(x) + \epsilon} }` before the linear transformation, -# the Vector-Jacobian Products (VJP) :math:`\nabla_{x}` of :math:`x` are given by: -# -# .. math:: -# \nabla_{x} = \frac{1}{\sigma}\Big( \nabla_{y} \odot w - \underbrace{ \big( \frac{1}{N} \hat{x} \cdot (\nabla_{y} \odot w) \big) }_{c_1} \odot \hat{x} - \underbrace{ \frac{1}{N} \nabla_{y} \cdot w }_{c_2} \Big) -# -# where :math:`\odot` denotes the element-wise multiplication, :math:`\cdot` denotes the dot product, and :math:`\sigma` is the standard deviation. -# :math:`c_1` and :math:`c_2` are intermediate constants that improve the readability of the following implementation. -# -# For the weights :math:`w` and biases :math:`b`, the VJPs :math:`\nabla_{w}` and :math:`\nabla_{b}` are more straightforward: -# -# .. 
math:: -# \nabla_{w} = \nabla_{y} \odot \hat{x} \quad \text{and} \quad \nabla_{b} = \nabla_{y} -# -# Since the same weights :math:`w` and biases :math:`b` are used for all rows in the same batch, their gradients need to sum up. -# To perform this step efficiently, we use a parallel reduction strategy: each kernel instance accumulates -# partial :math:`\nabla_{w}` and :math:`\nabla_{b}` across certain rows into one of :math:`\text{GROUP_SIZE_M}` independent buffers. -# These buffers stay in the L2 cache and then are further reduced by another function to compute the actual :math:`\nabla_{w}` and :math:`\nabla_{b}`. -# -# Let the number of input rows :math:`M = 4` and :math:`\text{GROUP_SIZE_M} = 2`, -# here's a diagram of the parallel reduction strategy for :math:`\nabla_{w}` (:math:`\nabla_{b}` is omitted for brevity): -# -# .. image:: parallel_reduction.png -# -# In Stage 1, the rows of X that have the same color share the same buffer and thus a lock is used to ensure that only one kernel instance writes to the buffer at a time. -# In Stage 2, the buffers are further reduced to compute the final :math:`\nabla_{w}` and :math:`\nabla_{b}`. -# In the following implementation, Stage 1 is implemented by the function :code:`_layer_norm_bwd_dx_fused` and Stage 2 is implemented by the function :code:`_layer_norm_bwd_dwdb`. - -@triton.jit -def _layer_norm_bwd_dx_fused( - DX, # pointer to the input gradient - DY, # pointer to the output gradient - DW, # pointer to the partial sum of weights gradient - DB, # pointer to the partial sum of biases gradient - X, # pointer to the input - W, # pointer to the weights - B, # pointer to the biases - Mean, # pointer to the mean - Rstd, # pointer to the 1/std - Lock, # pointer to the lock - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X - eps, # epsilon to avoid division by zero - GROUP_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - # Map the program id to the elements of X, DX, and DY it should compute. - row = tl.program_id(0) - cols = tl.arange(0, BLOCK_SIZE_N) - mask = cols < N - X += row * stride - DY += row * stride - DX += row * stride - # Offset locks and weights/biases gradient pointer for parallel reduction - lock_id = row % GROUP_SIZE_M - Lock += lock_id - Count = Lock + GROUP_SIZE_M - DW = DW + lock_id * N + cols - DB = DB + lock_id * N + cols - # Load data to SRAM - x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) - dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) - w = tl.load(W + cols, mask=mask).to(tl.float32) - mean = tl.load(Mean + row) - rstd = tl.load(Rstd + row) - # Compute dx - xhat = (x - mean) * rstd - wdy = w * dy - xhat = tl.where(mask, xhat, 0.) - wdy = tl.where(mask, wdy, 0.) 
- c1 = tl.sum(xhat * wdy, axis=0) / N - c2 = tl.sum(wdy, axis=0) / N - dx = (wdy - (xhat * c1 + c2)) * rstd - # Write dx - tl.store(DX + cols, dx, mask=mask) - # Accumulate partial sums for dw/db - partial_dw = (dy * xhat).to(w.dtype) - partial_db = (dy).to(w.dtype) - while tl.atomic_cas(Lock, 0, 1) == 1: - pass - count = tl.load(Count) - # First store doesn't accumulate - if count == 0: - tl.atomic_xchg(Count, 1) - else: - partial_dw += tl.load(DW, mask=mask) - partial_db += tl.load(DB, mask=mask) - tl.store(DW, partial_dw, mask=mask) - tl.store(DB, partial_db, mask=mask) - # Release the lock - tl.atomic_xchg(Lock, 0) - - -@triton.jit -def _layer_norm_bwd_dwdb( - DW, # pointer to the partial sum of weights gradient - DB, # pointer to the partial sum of biases gradient - FINAL_DW, # pointer to the weights gradient - FINAL_DB, # pointer to the biases gradient - M, # GROUP_SIZE_M - N, # number of columns - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - # Map the program id to the elements of DW and DB it should compute. - pid = tl.program_id(0) - cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - # Iterate through the rows of DW and DB to sum the partial sums. - for i in range(0, M, BLOCK_SIZE_M): - rows = i + tl.arange(0, BLOCK_SIZE_M) - mask = (rows[:, None] < M) & (cols[None, :] < N) - offs = rows[:, None] * N + cols[None, :] - dw += tl.load(DW + offs, mask=mask, other=0.) - db += tl.load(DB + offs, mask=mask, other=0.) - # Write the final sum to the output. - sum_dw = tl.sum(dw, axis=0) - sum_db = tl.sum(db, axis=0) - tl.store(FINAL_DW + cols, sum_dw, mask=cols < N) - tl.store(FINAL_DB + cols, sum_db, mask=cols < N) - - -# %% -# Benchmark -# --------- -# -# We can now compare the performance of our kernel against that of PyTorch. -# Here we focus on inputs that have Less than 64KB per feature. -# Specifically, one can set :code:`'mode': 'backward'` to benchmark the backward pass. 
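Because :code:`mode` is just an entry in the benchmark's :code:`args` dictionary, timing the forward pass instead only requires swapping that entry. A sketch of such a variant of the configuration defined further below (the plot name here is illustrative):

.. code-block:: python

    import torch
    import triton

    # Hypothetical forward-pass variant of the layer-norm benchmark defined below;
    # only 'mode' in `args` (and the plot name) differ from the backward setup.
    forward_bench = triton.testing.Benchmark(
        x_names=['N'],
        x_vals=[512 * i for i in range(2, 32)],
        line_arg='provider',
        line_vals=['triton', 'torch'],
        line_names=['Triton', 'Torch'],
        styles=[('blue', '-'), ('green', '-')],
        ylabel='GB/s',
        plot_name='layer-norm-forward',
        args={'M': 4096, 'dtype': torch.float16, 'mode': 'forward'},
    )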
- - -class LayerNorm(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, normalized_shape, weight, bias, eps): - # allocate output - y = torch.empty_like(x) - # reshape input data into 2D tensor - x_arg = x.reshape(-1, x.shape[-1]) - M, N = x_arg.shape - mean = torch.empty((M, ), dtype=torch.float32, device='cuda') - rstd = torch.empty((M, ), dtype=torch.float32, device='cuda') - # Less than 64KB per feature: enqueue fused kernel - MAX_FUSED_SIZE = 65536 // x.element_size() - BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) - if N > BLOCK_SIZE: - raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") - # heuristics for number of warps - num_warps = min(max(BLOCK_SIZE // 256, 1), 8) - # enqueue kernel - _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd, - x_arg.stride(0), N, eps, - BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps) - ctx.save_for_backward(x, weight, bias, mean, rstd) - ctx.BLOCK_SIZE = BLOCK_SIZE - ctx.num_warps = num_warps - ctx.eps = eps - return y - - @staticmethod - def backward(ctx, dy): - x, w, b, m, v = ctx.saved_tensors - # heuristics for amount of parallel reduction stream for DW/DB - N = w.shape[0] - GROUP_SIZE_M = 64 - if N <= 8192: GROUP_SIZE_M = 96 - if N <= 4096: GROUP_SIZE_M = 128 - if N <= 1024: GROUP_SIZE_M = 256 - # allocate output - locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda') - _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) - _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) - dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) - db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) - dx = torch.empty_like(dy) - # enqueue kernel using forward pass heuristics - # also compute partial sums for DW and DB - x_arg = x.reshape(-1, x.shape[-1]) - M, N = x_arg.shape - _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks, - x_arg.stride(0), N, ctx.eps, - BLOCK_SIZE_N=ctx.BLOCK_SIZE, - GROUP_SIZE_M=GROUP_SIZE_M, - num_warps=ctx.num_warps) - grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])] - # accumulate partial sums in separate kernel - _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N, - BLOCK_SIZE_M=32, - BLOCK_SIZE_N=128) - return dx, None, dw, db, None - - -layer_norm = LayerNorm.apply - - -def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'): - # create data - x_shape = (M, N) - w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') - dy = .1 * torch.randn_like(x) - x.requires_grad_(True) - # forward pass - y_tri = layer_norm(x, w_shape, weight, bias, eps) - y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype) - # backward pass (triton) - y_tri.backward(dy, retain_graph=True) - dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]] - x.grad, weight.grad, bias.grad = None, None, None - # backward pass (torch) - y_ref.backward(dy, retain_graph=True) - dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]] - # compare - assert torch.allclose(y_tri, y_ref, atol=1e-2, rtol=0) - assert torch.allclose(dx_tri, dx_ref, atol=1e-2, rtol=0) - assert torch.allclose(db_tri, db_ref, atol=1e-2, rtol=0) - assert torch.allclose(dw_tri, dw_ref, atol=1e-2, rtol=0) - - -@triton.testing.perf_report( - triton.testing.Benchmark( - 
x_names=['N'], - x_vals=[512 * i for i in range(2, 32)], - line_arg='provider', - line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []), - line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []), - styles=[('blue', '-'), ('green', '-'), ('orange', '-')], - ylabel='GB/s', - plot_name='layer-norm-backward', - args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'} - ) -) -def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'): - # create data - x_shape = (M, N) - w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') - dy = .1 * torch.randn_like(x) - x.requires_grad_(True) - quantiles = [0.5, 0.2, 0.8] - # utility functions - if provider == 'triton': - y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps) - if provider == 'torch': - y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps) - if provider == 'apex': - apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype) - y_fwd = lambda: apex_layer_norm(x) - # forward pass - if mode == 'forward': - gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6 - ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, quantiles=quantiles, rep=500) - # backward pass - if mode == 'backward': - gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6 - y = y_fwd() - ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), - quantiles=quantiles, grad_to_none=[x], rep=500) - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -test_layer_norm(1151, 8192, torch.float16) -bench_layer_norm.run(save_path='.', print_data=True) - -# %% -# References -# ---------- -# -# .. [BA2016] Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E. 
Hinton, "Layer Normalization", Arxiv 2016 diff --git a/python/tutorials/06-fused-attention.py b/python/tutorials/06-fused-attention.py deleted file mode 100644 index c9875cf358a2..000000000000 --- a/python/tutorials/06-fused-attention.py +++ /dev/null @@ -1,361 +0,0 @@ -""" -Fused Attention -=============== - -This is a Triton implementation of the Flash Attention algorithm -(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) -""" - -import pytest -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _fwd_kernel( - Q, K, V, sm_scale, - L, M, - Out, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - stride_oz, stride_oh, stride_om, stride_on, - Z, H, N_CTX, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hz = tl.program_id(1) - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk - off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk - off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk - # Initialize pointers to Q, K, V - q_ptrs = Q + off_q - k_ptrs = K + off_k - v_ptrs = V + off_v - # initialize pointer to m and l - m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # load q: it will stay in SRAM throughout - q = tl.load(q_ptrs) - # loop over k, v and update accumulator - for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): - # -- compute qk ---- - k = tl.load(k_ptrs) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) - # compute new m - m_curr = tl.maximum(tl.max(qk, 1), m_prev) - # correct old l - l_prev *= tl.exp(m_prev - m_curr) - # attention weights - p = tl.exp(qk - m_curr[:, None]) - l_curr = tl.sum(p, 1) + l_prev - # rescale operands of matmuls - l_rcp = 1. 
/ l_curr - p *= l_rcp[:, None] - acc *= (l_prev * l_rcp)[:, None] - # update acc - p = p.to(Q.dtype.element_ty) - v = tl.load(v_ptrs) - acc += tl.dot(p, v) - # update m_i and l_i - l_prev = l_curr - m_prev = m_curr - # update pointers - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - # rematerialize offsets to save registers - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # write back l and m - l_ptrs = L + off_hz * N_CTX + offs_m - m_ptrs = M + off_hz * N_CTX + offs_m - tl.store(l_ptrs, l_prev) - tl.store(m_ptrs, m_prev) - # initialize pointers to output - offs_n = tl.arange(0, BLOCK_DMODEL) - off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on - out_ptrs = Out + off_o - tl.store(out_ptrs, acc) - - -@triton.jit -def _bwd_preprocess( - Out, DO, L, - NewDO, Delta, - BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, -): - off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) - off_n = tl.arange(0, D_HEAD) - # load - o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - denom = tl.load(L + off_m).to(tl.float32) - # compute - do = do / denom[:, None] - delta = tl.sum(o * do, axis=1) - # write-back - tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) - tl.store(Delta + off_m, delta) - - -@triton.jit -def _bwd_kernel( - Q, K, V, sm_scale, Out, DO, - DQ, DK, DV, - L, M, - D, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - Z, H, N_CTX, - num_block, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - off_hz = tl.program_id(0) - off_z = off_hz // H - off_h = off_hz % H - # offset pointers for batch/head - Q += off_z * stride_qz + off_h * stride_qh - K += off_z * stride_qz + off_h * stride_qh - V += off_z * stride_qz + off_h * stride_qh - DO += off_z * stride_qz + off_h * stride_qh - DQ += off_z * stride_qz + off_h * stride_qh - DK += off_z * stride_qz + off_h * stride_qh - DV += off_z * stride_qz + off_h * stride_qh - for start_n in range(0, num_block): - lo = start_n * BLOCK_M - # initialize row/col offsets - offs_qm = lo + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = tl.arange(0, BLOCK_N) - offs_k = tl.arange(0, BLOCK_DMODEL) - # initialize pointers to value-like data - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - # pointer to row-wise quantities in value-like data - D_ptrs = D + off_hz * N_CTX - m_ptrs = M + off_hz * N_CTX - # initialize dv amd dk - dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # k and v stay in SRAM throughout - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - # loop over rows - for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): - offs_m_curr = start_m + offs_m - # load q, k, v, do on-chip - q = tl.load(q_ptrs) - # recompute p = softmax(qk, dim=-1).T - # NOTE: `do` is pre-divided by `l`; no normalization here - qk = tl.dot(q, tl.trans(k)) - qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) - m = 
tl.load(m_ptrs + offs_m_curr) - p = tl.exp(qk * sm_scale - m[:, None]) - # compute dv - do = tl.load(do_ptrs) - dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do) - # compute dp = dot(v, do) - Di = tl.load(D_ptrs + offs_m_curr) - dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] - dp += tl.dot(do, tl.trans(v)) - # compute ds = p * (dp - delta[:, None]) - ds = p * dp * sm_scale - # compute dk = dot(ds.T, q) - dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q) - # compute dq - dq = tl.load(dq_ptrs) - dq += tl.dot(ds.to(Q.dtype.element_ty), k) - tl.store(dq_ptrs, dq) - # increment pointers - dq_ptrs += BLOCK_M * stride_qm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_qm - # write-back - dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - - -empty = torch.empty(128, device="cuda") - - -class _attention(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, k, v, sm_scale): - BLOCK = 128 - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} - o = torch.empty_like(q) - grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) - L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - num_warps = 4 if Lk <= 64 else 8 - - _fwd_kernel[grid]( - q, k, v, sm_scale, - L, m, - o, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - o.stride(0), o.stride(1), o.stride(2), o.stride(3), - q.shape[0], q.shape[1], q.shape[2], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=Lk, num_warps=num_warps, - num_stages=2, - ) - # print(h.asm["ttgir"]) - - ctx.save_for_backward(q, k, v, o, L, m) - ctx.grid = grid - ctx.sm_scale = sm_scale - ctx.BLOCK_DMODEL = Lk - return o - - @staticmethod - def backward(ctx, do): - BLOCK = 128 - q, k, v, o, l, m = ctx.saved_tensors - do = do.contiguous() - dq = torch.zeros_like(q, dtype=torch.float32) - dk = torch.empty_like(k) - dv = torch.empty_like(v) - do_scaled = torch.empty_like(do) - delta = torch.empty_like(l) - _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( - o, do, l, - do_scaled, delta, - BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, - ) - _bwd_kernel[(ctx.grid[1],)]( - q, k, v, ctx.sm_scale, - o, do_scaled, - dq, dk, dv, - l, m, - delta, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - q.shape[0], q.shape[1], q.shape[2], - ctx.grid[0], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, - num_stages=1, - ) - # print(h.asm["ttgir"]) - return dq, dk, dv, None - - -attention = _attention.apply - - -@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) -def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16): - torch.manual_seed(20) - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - dout = 
torch.randn_like(q) - # reference implementation - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) - p = torch.matmul(q, k.transpose(2, 3)) * sm_scale - for z in range(Z): - for h in range(H): - p[:, :, M == 0] = float("-inf") - p = torch.softmax(p.float(), dim=-1).half() - # p = torch.exp(p) - ref_out = torch.matmul(p, v) - ref_out.backward(dout) - ref_dv, v.grad = v.grad.clone(), None - ref_dk, k.grad = k.grad.clone(), None - ref_dq, q.grad = q.grad.clone(), None - # # triton implementation - tri_out = attention(q, k, v, sm_scale) - # print(ref_out) - # print(tri_out) - tri_out.backward(dout) - tri_dv, v.grad = v.grad.clone(), None - tri_dk, k.grad = k.grad.clone(), None - tri_dq, q.grad = q.grad.clone(), None - # compare - assert torch.allclose(ref_out, tri_out, atol=1e-2, rtol=0) - assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=0) - assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=0) - assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=0) - - -try: - from flash_attn.flash_attn_interface import flash_attn_func - HAS_FLASH = True -except BaseException: - HAS_FLASH = False - -BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64 -# vary seq length for fixed head and batch=4 -configs = [triton.testing.Benchmark( - x_names=['N_CTX'], - x_vals=[2**i for i in range(10, 14)], - line_arg='provider', - line_vals=['triton'] + (['flash'] if HAS_FLASH else []), - line_names=['Triton'] + (['Flash'] if HAS_FLASH else []), - styles=[('red', '-'), ('blue', '-')], - ylabel='ms', - plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-{mode}', - args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': torch.float16, 'mode': mode} -) for mode in ['fwd', 'bwd']] - - -@triton.testing.perf_report(configs) -def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, mode, provider, dtype=torch.float16, device="cuda"): - assert mode in ['fwd', 'bwd'] - warmup = 25 - rep = 100 - if provider == "triton": - q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - sm_scale = 1.3 - fn = lambda: attention(q, k, v, sm_scale) - if mode == 'bwd': - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep) - return ms - if provider == "flash": - lengths = torch.full((BATCH,), fill_value=N_CTX, device=device) - cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32) - cu_seqlens[1:] = lengths.cumsum(0) - qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True) - fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True) - if mode == 'bwd': - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep) - return ms - - -# only works on post-Ampere GPUs right now -bench_flash_attention.run(save_path='.', print_data=True) diff --git a/python/tutorials/07-math-functions.py b/python/tutorials/07-math-functions.py deleted file mode 100644 index 1ded3aa984d6..000000000000 --- a/python/tutorials/07-math-functions.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Libdevice (`tl.math`) function -============================== -Triton can invoke a custom function from an external library. 
-In this example, we will use the `libdevice` library (a.k.a. `math` in Triton) to apply `asin` to a tensor. -Please refer to https://docs.nvidia.com/cuda/libdevice-users-guide/index.html regarding the semantics of all available libdevice functions. -In `triton/language/math.py`, we try to aggregate functions with the same computation but different data types together. -For example, both `__nv_asin` and `__nv_asinf` calculate the principal value of the arc sine of the input, but `__nv_asin` operates on `double` and `__nv_asinf` operates on `float`. -Using Triton, you can simply call `tl.math.asin`. -Triton automatically selects the correct underlying device function to invoke based on input and output types. -""" - -# %% -# asin Kernel -# ------------ - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def asin_kernel( - x_ptr, - y_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - x = tl.math.asin(x) - tl.store(y_ptr + offsets, x, mask=mask) - -# %% -# Using the default libdevice library path -# ----------------------------------------- -# We can use the default libdevice library path encoded in `triton/language/math.py`. - - -torch.manual_seed(0) -size = 98432 -x = torch.rand(size, device='cuda') -output_triton = torch.zeros(size, device='cuda') -output_torch = torch.asin(x) -assert x.is_cuda and output_triton.is_cuda -n_elements = output_torch.numel() -grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) -asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) - -# %% -# Customize the libdevice library path -# ------------------------------------- -# We can also customize the libdevice library path by passing the path to the `libdevice` library to the `asin` kernel. - -output_triton = torch.empty_like(x) -asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024, - extern_libs={'libdevice': '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'}) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) diff --git a/python/tutorials/08-experimental-block-pointer.py b/python/tutorials/08-experimental-block-pointer.py deleted file mode 100644 index 7147b69de6cc..000000000000 --- a/python/tutorials/08-experimental-block-pointer.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Block Pointer (Experimental) -============================ -This tutorial will guide you through writing a matrix multiplication algorithm that utilizes block pointer semantics. -These semantics are more friendly for Triton to optimize and can result in better performance on specific hardware. -Note that this feature is still experimental and may change in the future. - -""" - -# %% -# Motivations -# ----------- -# In the previous matrix multiplication tutorial, we constructed blocks of values by de-referencing blocks of pointers, -# i.e., :code:`load(block<pointer_type<element_type>>) -> block<element_type>`, which involved loading blocks of -# elements from memory. This approach allowed for flexibility in using hardware-managed cache and implementing complex -# data structures, such as tensors of trees or unstructured look-up tables.
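To make the contrast concrete, here is a minimal, self-contained sketch of that legacy pointer-arithmetic style (the :code:`copy_tile_kernel` name and the shapes are illustrative only, not code from this change): a :code:`[BLOCK_M, BLOCK_N]` block of pointers is built by broadcasting offsets against strides and then de-referenced with :code:`tl.load`.

.. code-block:: python

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_tile_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
        offs_m = tl.arange(0, BLOCK_M)
        offs_n = tl.arange(0, BLOCK_N)
        # A [BLOCK_M, BLOCK_N] block of pointers into the source tensor.
        ptrs = src_ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
        mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
        # load(block of pointers) -> block of values, as described above.
        tile = tl.load(ptrs, mask=mask, other=0.0)
        tl.store(dst_ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n,
                 tile, mask=mask)

    src = torch.randn((16, 16), device='cuda')
    dst = torch.empty_like(src)
    copy_tile_kernel[(1,)](src, dst, 16, 16, src.stride(0), src.stride(1),
                           BLOCK_M=16, BLOCK_N=16)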
-# -# However, the drawback of this approach is that it relies heavily on complex optimization passes by the compiler to -# optimize memory access patterns. This can result in brittle code that may suffer from performance degradation when the -# optimizer fails to perform adequately. Additionally, as memory controllers specialize to accommodate dense spatial -# data structures commonly used in machine learning workloads, this problem is likely to worsen. -# -# To address this issue, we will use block pointers :code:`pointer_type<block<element_type>>` and load them into -# :code:`block<element_type>`, which makes it easier for the compiler to optimize memory access -# patterns. -# -# Let's start with the previous matrix multiplication example and demonstrate how to rewrite it to utilize block pointer -# semantics. - -# %% -# Make a Block Pointer -# -------------------- -# A block pointer points to a block in a parent tensor and is constructed by the :code:`make_block_ptr` function, -# which takes the following information as arguments: -# -# * :code:`base`: the base pointer to the parent tensor; -# -# * :code:`shape`: the shape of the parent tensor; -# -# * :code:`strides`: the strides of the parent tensor, which means how much to increase the pointer by when moving by 1 element in a specific axis; -# -# * :code:`offsets`: the offsets of the block; -# -# * :code:`block_shape`: the shape of the block; -# -# * :code:`order`: the order of the block, which means how the block is laid out in memory. -# -# For example, to create a block pointer to a :code:`BLOCK_SIZE_M * BLOCK_SIZE_K` block in a row-major 2D matrix A at -# offsets :code:`(pid_m * BLOCK_SIZE_M, 0)` with strides :code:`(stride_am, stride_ak)`, we can use the following code -# (exactly the same as the previous matrix multiplication tutorial): -# -# .. code-block:: python -# -# a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), -# offsets=(pid_m * BLOCK_SIZE_M, 0), block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K), -# order=(1, 0)) -# -# Note that the :code:`order` argument is set to :code:`(1, 0)`, which means the second axis is the inner dimension in -# terms of storage, and the first axis is the outer dimension. This information may sound redundant, but it is necessary -# for some hardware backends to optimize for better performance. - -# %% -# Load/Store a Block Pointer -# -------------------------- -# To load/store a block pointer, we can use the :code:`load/store` function, which takes a block pointer as an argument, -# de-references it, and loads/stores a block. You may mask some values in the block; here we have an extra argument -# :code:`boundary_check` to specify whether to check the boundary of each axis for the block pointer. With the check on, -# out-of-bound values will be masked according to the :code:`padding_option` argument (load only), which can be -# :code:`zero` or :code:`nan`. Temporarily, we do not support other values due to some hardware limitations. In this -# mode, block pointer load/store does not support the :code:`mask` or :code:`other` arguments used in the legacy mode. -# -# So to load the block pointer of A in the previous section, we can simply write -# :code:`a = tl.load(a_block_ptr, boundary_check=(0, 1))`. Boundary checking may cost extra performance, so if you can -# guarantee that the block pointer is always in-bound in some axis, you can turn off the check by not passing the index -# into the :code:`boundary_check` argument.
For example, if we know that :code:`M` is a multiple of -# :code:`BLOCK_SIZE_M`, we can replace with :code:`a = tl.load(a_block_ptr, boundary_check=(1, ))`, since axis 0 is -# always in bound. - -# %% -# Advance a Block Pointer -# ----------------------- -# To advance a block pointer, we can use :code:`advance` function, which takes a block pointer and the increment for -# each axis as arguments and returns a new block pointer with the same shape and strides as the original one, -# but with the offsets advanced by the specified amount. -# -# For example, to advance the block pointer by :code:`BLOCK_SIZE_K` in the second axis -# (no need to multiply with strides), we can write :code:`a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K))`. - -# %% -# Final Result -# ------------ - -import torch - -import triton -import triton.language as tl - - -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - ], - key=['M', 'N', 'K'], -) -@triton.jit -def matmul_kernel_with_block_pointers( - # Pointers to matrices - a_ptr, b_ptr, c_ptr, - # Matrix dimensions - M, N, K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr -): - """Kernel for computing the matmul C = A x B. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - # See the matrix multiplication tutorial for details. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create block pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction and accumulate. - # See above `Make a Block Pointer` section for details. 
- a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), - offsets=(pid_m * BLOCK_SIZE_M, 0), block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K), - order=(1, 0)) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(K, N), strides=(stride_bk, stride_bn), - offsets=(0, pid_n * BLOCK_SIZE_N), block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_N), - order=(1, 0)) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block. - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop. - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, K, BLOCK_SIZE_K): - # Load with boundary checks, no need to calculate the mask manually. - # For better performance, you may remove some axis from the boundary - # check, if you can guarantee that the access is always in-bound in - # that axis. - # See above `Load/Store a Block Pointer` section for details. - a = tl.load(a_block_ptr, boundary_check=(0, 1)) - b = tl.load(b_block_ptr, boundary_check=(0, 1)) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the block pointer to the next K block. - # See above `Advance a Block Pointer` section for details. - a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K)) - b_block_ptr = tl.advance(b_block_ptr, (BLOCK_SIZE_K, 0)) - c = accumulator.to(tl.float16) - - # ---------------------------------------------------------------- - # Write back the block of the output matrix C with boundary checks. - # See above `Load/Store a Block Pointer` section for details. - c_block_ptr = tl.make_block_ptr(base=c_ptr, shape=(M, N), strides=(stride_cm, stride_cn), - offsets=(pid_m * BLOCK_SIZE_M, pid_n * BLOCK_SIZE_N), - block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_N), order=(1, 0)) - tl.store(c_block_ptr, c, boundary_check=(0, 1)) - - -# We can now create a convenience wrapper function that only takes two input tensors, -# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel. -def matmul(a, b): - # Check constraints. - assert a.shape[1] == b.shape[0], "Incompatible dimensions" - assert a.is_contiguous(), "Matrix A must be contiguous" - assert b.is_contiguous(), "Matrix B must be contiguous" - M, K = a.shape - K, N = b.shape - # Allocates output. - c = torch.empty((M, N), device=a.device, dtype=a.dtype) - # 1D launch kernel where each block gets its own program. - grid = lambda META: ( - triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), - ) - matmul_kernel_with_block_pointers[grid]( - a, b, c, - M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - ) - return c - - -# %% -# Unit Test -# --------- -# -# Still we can test our matrix multiplication with block pointers against a native torch implementation (i.e., cuBLAS). 
- -torch.manual_seed(0) -a = torch.randn((512, 512), device='cuda', dtype=torch.float16) -b = torch.randn((512, 512), device='cuda', dtype=torch.float16) -triton_output = matmul(a, b) -torch_output = torch.matmul(a, b) -print(f"triton_output={triton_output}") -print(f"torch_output={torch_output}") -if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0): - print("✅ Triton and Torch match") -else: - print("❌ Triton and Torch differ") diff --git a/python/tutorials/README.rst b/python/tutorials/README.rst deleted file mode 100644 index 1dfa5f4dca91..000000000000 --- a/python/tutorials/README.rst +++ /dev/null @@ -1,11 +0,0 @@ -Tutorials -========= - -Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. - -To install the dependencies for the tutorials: - -.. code-block:: bash - - cd triton - pip install -e './python[tutorials]' diff --git a/test/BUILD b/test/BUILD new file mode 100644 index 000000000000..6cb5c9e81afe --- /dev/null +++ b/test/BUILD @@ -0,0 +1,55 @@ +load("//third_party/llvm/build_defs:lit.bzl", "glob_lit_tests") +load("//tools/build_defs/build_test:build_test.bzl", "build_test") + +package( + default_compatible_with = ["//buildenv/target:gce"], + default_visibility = ["//third_party/triton:__subpackages__"], +) + +glob_lit_tests( + data = [ + "@llvm-project//llvm:FileCheck", + "//third_party/triton:triton-opt", + ], + driver = "@llvm-project//mlir:run_lit.sh", + exclude = [ + # These require adjusted RUN commands for python in google3. + "Target/tritongpu_to_llvmir.mlir", + "Target/tritongpu_to_ptx.mlir", + ], + test_file_exts = ["mlir"], +) + +cc_library( + name = "TritonTestAnalysis", + srcs = glob(["lib/Analysis/*.cpp"]), + deps = [ + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFToControlFlow", + "@llvm-project//mlir:Transforms", + "//third_party/triton:TritonAnalysis", + "//third_party/triton:TritonGPUDialect", + ], +) + +build_test( + name = "build_test", + allow_empty_target = False, + targets = [ + "//third_party/triton:TritonAnalysis", + "//third_party/triton:TritonDialect", + "//third_party/triton:TritonGPUDialect", + "//third_party/triton:TritonGPUToLLVM", + "//third_party/triton:TritonGPUTransforms", + "//third_party/triton:TritonLLVMIR", + "//third_party/triton:TritonPTX", + "//third_party/triton:TritonToTritonGPU", + "//third_party/triton:TritonTools", + "//third_party/triton:TritonTransforms", + "//third_party/triton:triton-opt", + "//third_party/triton:triton-translate", + ], +) diff --git a/triton.bzl b/triton.bzl new file mode 100644 index 000000000000..25627d030bcd --- /dev/null +++ b/triton.bzl @@ -0,0 +1,10 @@ +"""Bazel macros used by the triton build.""" + +def if_msvc(if_true, if_false = []): + return select({ + ":compiler_is_msvc": if_true, + "//conditions:default": if_false, + }) + +def if_not_msvc(a): + return if_msvc([], a)