diff --git a/BUILD b/BUILD
new file mode 100644
index 000000000000..d4b988f04058
--- /dev/null
+++ b/BUILD
@@ -0,0 +1,628 @@
+# This package imports OpenAI's Triton (https://github.com/openai/triton).
+#
+# There are currently two versions of Triton in google3. The older version
+# lives at //third_party/py/triton; this package is the MLIR-based version,
+# which closely tracks upstream head. We expect to transition users to this
+# version in the coming weeks.
+#
+# There is no SLA associated with this package, and it may be broken by LLVM
+# imports at any time.
+
+load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+# copybara:uncomment load("//tools/build_defs/license:license.bzl", "license")
+load("//:triton.bzl", "if_not_msvc")
+
+package(
+    # copybara:uncomment_begin
+    # default_applicable_licenses = [":license"],
+    # default_compatible_with = ["//buildenv/target:gce"],
+    # default_visibility = [
+    #     "//third_party/tensorflow/compiler/xla:__subpackages__",
+    #     "//third_party/triton:__subpackages__",
+    # ],
+    # copybara:uncomment_end_and_comment_begin
+    default_visibility = ["//visibility:public"],
+    # copybara:comment_end
+    # TODO(csigg): fix and remove
+    features = [
+        "-parse_headers",
+        "-use_header_modules",
+    ],
+)
+
+# copybara:uncomment_begin
+# license(name = "license")
+#
+# licenses(["notice"])
+#
+# exports_files(["LICENSE"])
+# copybara:uncomment_end
+
+config_setting(
+    name = "compiler_is_msvc",
+    flag_values = {
+        "@bazel_tools//tools/cpp:compiler": "msvc-cl",
+    },
+)
+
+td_library(
+    name = "td_files",
+    srcs = glob(["include/triton/**/*.td"]),
+    includes = ["include"],
+    deps = [
+        "@llvm-project//mlir:ArithOpsTdFiles",
+        "@llvm-project//mlir:CastInterfacesTdFiles",
+        "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
+        "@llvm-project//mlir:DestinationStyleOpInterfaceTdFiles",
+        "@llvm-project//mlir:FunctionInterfacesTdFiles",
+        "@llvm-project//mlir:InferTypeOpInterfaceTdFiles",
+        "@llvm-project//mlir:OpBaseTdFiles",
+        "@llvm-project//mlir:PassBaseTdFiles",
+        "@llvm-project//mlir:SideEffectInterfacesTdFiles",
+        "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
+    ],
+)
+
+gentbl_cc_library(
+    name = "triton_dialect_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-dialect-decls"],
+            "include/triton/Dialect/Triton/IR/Dialect.h.inc",
+        ),
+        (
+            ["--gen-dialect-defs"],
+            "include/triton/Dialect/Triton/IR/Dialect.cpp.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "include/triton/Dialect/Triton/IR/TritonDialect.td",
+    deps = ["td_files"],
+)
+
+gentbl_cc_library(
+    name = "triton_ops_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-enum-decls"],
+            "include/triton/Dialect/Triton/IR/OpsEnums.h.inc",
+        ),
+        (
+            ["--gen-enum-defs"],
+            "include/triton/Dialect/Triton/IR/OpsEnums.cpp.inc",
+        ),
+        (
+            ["--gen-op-decls"],
+            "include/triton/Dialect/Triton/IR/Ops.h.inc",
+        ),
+        (
+            ["--gen-op-defs"],
+            "include/triton/Dialect/Triton/IR/Ops.cpp.inc",
+        ),
+        (
+            ["--gen-typedef-decls"],
+            "include/triton/Dialect/Triton/IR/Types.h.inc",
+        ),
+        (
+            ["--gen-typedef-defs"],
+            "include/triton/Dialect/Triton/IR/Types.cpp.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "include/triton/Dialect/Triton/IR/TritonOps.td",
+    deps = ["td_files"],
+)
+
+gentbl_cc_library(
+    name = "triton_interfaces_inc_gen",
+    tbl_outs = [
+        (
+            ["--gen-attr-interface-decls"],
+            "include/triton/Dialect/Triton/IR/AttrInterfaces.h.inc",
+        ),
+        (
+            ["--gen-attr-interface-defs"],
+            "include/triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc",
+        ),
+    ],
+    tblgen =
"@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/Triton/IR/TritonInterfaces.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_transforms_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=Triton", + ], + "include/triton/Dialect/Triton/Transforms/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/Triton/Transforms/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_combine_inc_gen", + # The generated file is #included without relative path. + strip_include_prefix = "lib/Dialect/Triton/Transforms", + tbl_outs = [ + ( + ["--gen-rewriters"], + "lib/Dialect/Triton/Transforms/TritonCombine.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "lib/Dialect/Triton/Transforms/Combine.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_dialect_inc_gen", + tbl_outs = [ + ( + ["--gen-dialect-decls"], + "include/triton/Dialect/TritonGPU/IR/Dialect.h.inc", + ), + ( + ["--gen-dialect-defs"], + "include/triton/Dialect/TritonGPU/IR/Dialect.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_ops_inc_gen", + tbl_outs = [ + ( + ["--gen-op-decls"], + "include/triton/Dialect/TritonGPU/IR/Ops.h.inc", + ), + ( + ["--gen-op-defs"], + "include/triton/Dialect/TritonGPU/IR/Ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_attr_inc_gen", + tbl_outs = [ + ( + ["--gen-attrdef-decls"], + "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc", + ), + ( + ["--gen-attrdef-defs"], + "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_gpu_transforms_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonGPU", + ], + "include/triton/Dialect/TritonGPU/Transforms/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Dialect/TritonGPU/Transforms/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonGPUToLLVM", + ], + "include/triton/Conversion/TritonGPUToLLVM/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Conversion/TritonGPUToLLVM/Passes.td", + deps = ["td_files"], +) + +gentbl_cc_library( + name = "triton_conversion_triton_to_triton_gpu_passes_inc_gen", + tbl_outs = [ + ( + [ + "--gen-pass-decls", + "--name=TritonToTritonGPU", + ], + "include/triton/Conversion/TritonToTritonGPU/Passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/triton/Conversion/TritonToTritonGPU/Passes.td", + deps = ["td_files"], +) + +cc_library( + name = "TritonAnalysis", + srcs = glob(["lib/Analysis/*.cpp"]), + hdrs = glob(["include/triton/Analysis/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":triton_gpu_attr_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:FuncDialect", + 
"@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + ], +) + +cc_library( + name = "TritonDialect", + srcs = glob(["lib/Dialect/Triton/IR/*.cpp"]), + hdrs = glob(["include/triton/Dialect/Triton/IR/*.h"]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":triton_dialect_inc_gen", + ":triton_interfaces_inc_gen", + ":triton_ops_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + ], +) + +cc_library( + name = "TritonTransforms", + srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]), + hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":triton_combine_inc_gen", + ":triton_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], + alwayslink = True, # TritonDialect uses getCanonicalizationPatterns(). +) + +cc_library( + name = "TritonGPUDialect", + srcs = glob(["lib/Dialect/TritonGPU/IR/*.cpp"]), + hdrs = glob([ + "include/triton/Analysis/*.h", + "include/triton/Dialect/TritonGPU/IR/*.h", + ]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":triton_gpu_attr_inc_gen", + ":triton_gpu_dialect_inc_gen", + ":triton_gpu_ops_inc_gen", + ":triton_gpu_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:DestinationStyleOpInterface", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonGPUTransforms", + srcs = glob([ + "lib/Dialect/TritonGPU/Transforms/*.cpp", + "lib/Dialect/TritonGPU/Transforms/*.h", + ]), + hdrs = glob(["include/triton/Dialect/TritonGPU/Transforms/*.h"]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":triton_gpu_transforms_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonGPUToLLVM", + srcs = glob([ + "lib/Conversion/TritonGPUToLLVM/*.h", + 
"lib/Conversion/TritonGPUToLLVM/**/*.cpp", + ]) + [ + "include/triton/Conversion/MLIRTypes.h", + ], + hdrs = glob([ + "include/triton/Tools/Sys/*.hpp", + "include/triton/Conversion/TritonGPUToLLVM/*.h", + ]), + copts = if_not_msvc(["-Wno-unused-variable"]), + includes = [ + "include", + "lib/Conversion/TritonGPUToLLVM", + ], + deps = [ + ":TritonAnalysis", + ":TritonDialect", + ":TritonGPUDialect", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ArithToLLVM", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowToLLVM", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUToROCDLTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:IndexDialect", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:MathToLLVM", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLDialect", + "@llvm-project//mlir:SCFToControlFlow", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonToTritonGPU", + srcs = glob([ + "lib/Conversion/TritonToTritonGPU/*.h", + "lib/Conversion/TritonToTritonGPU/*.cpp", + ]), + hdrs = glob(["include/triton/Conversion/TritonToTritonGPU/*.h"]), + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:IndexDialect", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "TritonLLVMIR", + srcs = glob([ + "lib/Target/LLVMIR/*.cpp", + ]) + [ + "include/triton/Tools/Sys/GetEnv.hpp", + ], + hdrs = glob(["include/triton/Target/LLVMIR/*.h"]), + includes = ["include"], + deps = [ + ":TritonGPUToLLVM", + ":TritonTransforms", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:IRReader", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:BuiltinToLLVMIRTranslation", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:NVVMToLLVMIRTranslation", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLToLLVMIRTranslation", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:Transforms", + # copybara:uncomment_begin + # "//third_party/py/triton/google:find_cuda", + # copybara:uncomment_end + ], +) + +cc_library( + name = "TritonPTX", + srcs = glob([ + "lib/Target/PTX/*.cpp", + ]), + hdrs = glob(["include/triton/Target/PTX/*.h"]), + includes = ["include"], + deps = [ + ":TritonLLVMIR", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + ], +) + 
+cc_library( + name = "TritonHSACO", + srcs = glob([ + "lib/Target/HSACO/*.cpp", + ]), + hdrs = glob(["include/triton/Target/HSACO/*.h"]), + includes = ["include"], + deps = [ + ":TritonLLVMIR", + ":TritonTools", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:ExecutionEngine", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Scalar", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + ], +) + +cc_library( + name = "TritonTools", + hdrs = ["include/triton/Tools/Sys/GetEnv.hpp"], + includes = ["include"], +) + +cc_binary( + name = "triton-opt", + srcs = [ + "bin/RegisterTritonDialects.h", + "bin/triton-opt.cpp", + "include/triton/Conversion/TritonGPUToLLVM/Passes.h", + "include/triton/Conversion/TritonToTritonGPU/Passes.h", + ], + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUToLLVM", + ":TritonGPUTransforms", + ":TritonToTritonGPU", + ":TritonTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:ir_headers", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + # copybara:uncomment "//third_party/triton/test:TritonTestAnalysis", + ], +) + +cc_binary( + name = "triton-translate", + srcs = [ + "bin/triton-translate.cpp", + "include/triton/Conversion/TritonGPUToLLVM/Passes.h", + "include/triton/Conversion/TritonToTritonGPU/Passes.h", + ], + includes = ["include"], + deps = [ + ":TritonDialect", + ":TritonGPUDialect", + ":TritonGPUToLLVM", + ":TritonGPUTransforms", + ":TritonHSACO", + ":TritonLLVMIR", + ":TritonPTX", + ":TritonToTritonGPU", + ":TritonTransforms", + ":triton_conversion_triton_gpu_to_llvm_passes_inc_gen", + ":triton_conversion_triton_to_triton_gpu_passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:ir_headers", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ConversionPasses", + "@llvm-project//mlir:ExecutionEngine", + "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMCommonConversion", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + ], +) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index 7c52e99b8aaf..11598fcd44b2 100644 --- 
a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -93,6 +93,11 @@ bool expensiveLoadOrStore(Operation *op, Attribute &targetEncoding) {
   // same
   if (isSingleValue(op->getOperand(0)))
     return false;
+  // TODO(manany): Investigate with OpenAI why the change in
+  // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a
+  // (which introduces Case 2 below) breaks
+  // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd
+  return true;
   // Case 2: Tensor of pointers has more threads than elements
   // we can presume a high hit-rate that makes it cheap to load
   auto ptrType = op->getOperand(0).getType().cast<RankedTensorType>();
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index 6431b6ae8d89..bc29f18ec773 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
   auto *shortPtr =
       static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(shortPtr);
-  shortPtr->setValue(true);
+  shortPtr->setValue(false);
   std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc);
   // max PTX version
   int ptxMajor = maxPTX / 10;
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
deleted file mode 100644
index 04da9b52953e..000000000000
--- a/python/MANIFEST.in
+++ /dev/null
@@ -1,4 +0,0 @@
-graft src
-graft triton/third_party
-graft triton/runtime/backends/
-graft triton/language/extra
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/python/examples/copy_strided.py b/python/examples/copy_strided.py
deleted file mode 100644
index 34cf12630205..000000000000
--- a/python/examples/copy_strided.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import triton
-import triton.language as tl
-
-
-# triton kernel
-@triton.jit
-def kernel(X, stride_xm,
-           Z, stride_zn,
-           BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
-    off_m = tl.arange(0, BLOCK_M)
-    off_n = tl.arange(0, BLOCK_N)
-    Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * 1
-    Zs = Z + off_m[:, None] * 1 + off_n[None, :] * stride_zn
-    tl.store(Zs, tl.load(Xs))
-
-
-ret = triton.compile(kernel, signature="*fp32,i32,*fp32,i32", constants={"BLOCK_M": 64, "BLOCK_N": 64})
-print(ret.asm["ttgir"])
diff --git a/python/examples/empty.py b/python/examples/empty.py
deleted file mode 100644
index df313fb85869..000000000000
--- a/python/examples/empty.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.jit
-def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr):
-    pass
-
-
-X = torch.randn(1, device="cuda")
-pgm = kernel[(1,)](X, 1, 1, BLOCK=1024)
diff --git a/python/pyproject.toml b/python/pyproject.toml
deleted file mode 100644
index 6430c0c154dc..000000000000
--- a/python/pyproject.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-[build-system]
-requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18"]
-
-[tool.autopep8]
-aggressive = 1
-ignore = "E501,E701,E731,W690"
-max_line_length = 88
diff --git a/python/setup.py b/python/setup.py
deleted file mode 100644
index 68c6cfee9c5c..000000000000
--- a/python/setup.py
+++ /dev/null
@@ -1,305 +0,0 @@
-import os
-import platform
-import re
-import shutil
-import subprocess
-import sys
-import sysconfig
-import tarfile
-import tempfile
-import urllib.request
-from pathlib import Path
-from typing import NamedTuple
-
-from setuptools import Extension, setup
-from
setuptools.command.build_ext import build_ext - - -# Taken from https://github.com/pytorch/pytorch/blob/master/tools/setup_helpers/env.py -def check_env_flag(name: str, default: str = "") -> bool: - return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] - - -def get_build_type(): - if check_env_flag("DEBUG"): - return "Debug" - elif check_env_flag("REL_WITH_DEB_INFO"): - return "RelWithDebInfo" - elif check_env_flag("TRITON_REL_BUILD_WITH_ASSERTS"): - return "TritonRelBuildWithAsserts" - else: - # TODO: change to release when stable enough - return "TritonRelBuildWithAsserts" - -# --- third party packages ----- - - -class Package(NamedTuple): - package: str - name: str - url: str - include_flag: str - lib_flag: str - syspath_var_name: str - -# pybind11 - - -def get_pybind11_package_info(): - name = "pybind11-2.10.0" - url = "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" - return Package("pybind11", name, url, "PYBIND11_INCLUDE_DIR", "", "PYBIND11_SYSPATH") - -# llvm - - -def get_llvm_package_info(): - # download if nothing is installed - system = platform.system() - if system == "Darwin": - system_suffix = "apple-darwin" - elif system == "Linux": - vglibc = tuple(map(int, platform.libc_ver()[1].split('.'))) - vglibc = vglibc[0] * 100 + vglibc[1] - linux_suffix = 'ubuntu-18.04' if vglibc > 217 else 'centos-7' - system_suffix = f"linux-gnu-{linux_suffix}" - else: - return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") - use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False") - release_suffix = "assert" if use_assert_enabled_llvm else "release" - name = f'llvm+mlir-17.0.0-x86_64-{system_suffix}-{release_suffix}' - version = "llvm-17.0.0-c5dede880d17" - url = f"https://github.com/ptillet/triton-llvm-releases/releases/download/{version}/{name}.tar.xz" - return Package("llvm", name, url, "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") - - -def get_thirdparty_packages(triton_cache_path): - packages = [get_pybind11_package_info(), get_llvm_package_info()] - thirdparty_cmake_args = [] - for p in packages: - package_root_dir = os.path.join(triton_cache_path, p.package) - package_dir = os.path.join(package_root_dir, p.name) - if p.syspath_var_name in os.environ: - package_dir = os.environ[p.syspath_var_name] - version_file_path = os.path.join(package_dir, "version.txt") - if p.syspath_var_name not in os.environ and\ - (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url): - try: - shutil.rmtree(package_root_dir) - except Exception: - pass - os.makedirs(package_root_dir, exist_ok=True) - print(f'downloading and extracting {p.url} ...') - ftpstream = urllib.request.urlopen(p.url) - file = tarfile.open(fileobj=ftpstream, mode="r|*") - file.extractall(path=package_root_dir) - # write version url to package_dir - with open(os.path.join(package_dir, "version.txt"), "w") as f: - f.write(p.url) - if p.include_flag: - thirdparty_cmake_args.append(f"-D{p.include_flag}={package_dir}/include") - if p.lib_flag: - thirdparty_cmake_args.append(f"-D{p.lib_flag}={package_dir}/lib") - return thirdparty_cmake_args - -# ---- package data --- - - -def download_and_copy_ptxas(): - base_dir = os.path.dirname(__file__) - src_path = "bin/ptxas" - version = "12.1.105" - url = f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-64/cuda-nvcc-{version}-0.tar.bz2" - dst_prefix = os.path.join(base_dir, "triton") - dst_suffix = os.path.join("third_party", "cuda", 
src_path) - dst_path = os.path.join(dst_prefix, dst_suffix) - is_linux = platform.system() == "Linux" - download = False - if is_linux: - download = True - if os.path.exists(dst_path): - curr_version = subprocess.check_output([dst_path, "--version"]).decode("utf-8").strip() - curr_version = re.search(r"V([.|\d]+)", curr_version).group(1) - download = curr_version != version - if download: - print(f'downloading and extracting {url} ...') - ftpstream = urllib.request.urlopen(url) - file = tarfile.open(fileobj=ftpstream, mode="r|*") - with tempfile.TemporaryDirectory() as temp_dir: - file.extractall(path=temp_dir) - src_path = os.path.join(temp_dir, src_path) - os.makedirs(os.path.split(dst_path)[0], exist_ok=True) - shutil.copy(src_path, dst_path) - return dst_suffix - - -# ---- cmake extension ---- - - -class CMakeExtension(Extension): - def __init__(self, name, path, sourcedir=""): - Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) - self.path = path - - -class CMakeBuild(build_ext): - - user_options = build_ext.user_options + [('base-dir=', None, 'base directory of Triton')] - - def initialize_options(self): - build_ext.initialize_options(self) - self.base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) - - def finalize_options(self): - build_ext.finalize_options(self) - - def run(self): - try: - out = subprocess.check_output(["cmake", "--version"]) - except OSError: - raise RuntimeError( - "CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions) - ) - - match = re.search(r"version\s*(?P\d+)\.(?P\d+)([\d.]+)?", out.decode()) - cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor")) - if (cmake_major, cmake_minor) < (3, 18): - raise RuntimeError("CMake >= 3.18.0 is required") - - for ext in self.extensions: - self.build_extension(ext) - - def get_cmake_dir(self): - plat_name = sysconfig.get_platform() - python_version = sysconfig.get_python_version() - dir_name = f"cmake.{plat_name}-{sys.implementation.name}-{python_version}" - cmake_dir = Path(self.base_dir) / "python" / "build" / dir_name - cmake_dir.mkdir(parents=True, exist_ok=True) - return cmake_dir - - def build_extension(self, ext): - lit_dir = shutil.which('lit') - user_home = os.getenv("HOME") or os.getenv("USERPROFILE") or \ - os.getenv("HOMEPATH") or None - if not user_home: - raise RuntimeError("Could not find user home directory") - triton_cache_path = os.path.join(user_home, ".triton") - # lit is used by the test suite - thirdparty_cmake_args = get_thirdparty_packages(triton_cache_path) - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) - # create build directories - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - # python directories - python_include_dir = sysconfig.get_path("platinclude") - cmake_args = [ - "-DLLVM_ENABLE_WERROR=ON", - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, - "-DTRITON_BUILD_TUTORIALS=OFF", - "-DTRITON_BUILD_PYTHON_MODULE=ON", - "-DPython3_EXECUTABLE:FILEPATH=" + sys.executable, - "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", - "-DPYTHON_INCLUDE_DIRS=" + python_include_dir, - ] - if lit_dir is not None: - cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir) - cmake_args.extend(thirdparty_cmake_args) - - # configuration - cfg = get_build_type() - build_args = ["--config", cfg] - - if platform.system() == "Windows": - cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] - if sys.maxsize > 2**32: - 
cmake_args += ["-A", "x64"] - build_args += ["--", "/m"] - else: - cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] - max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count())) - build_args += ['-j' + max_jobs] - - if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"): - cmake_args += ["-DCMAKE_C_COMPILER=clang", - "-DCMAKE_CXX_COMPILER=clang++", - "-DCMAKE_LINKER=lld", - "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"] - - env = os.environ.copy() - cmake_dir = self.get_cmake_dir() - subprocess.check_call(["cmake", self.base_dir] + cmake_args, cwd=cmake_dir, env=env) - subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir) - - -download_and_copy_ptxas() - - -setup( - name="triton", - version="2.1.0", - author="Philippe Tillet", - author_email="phil@openai.com", - description="A language and compiler for custom Deep Learning operations", - long_description="", - packages=[ - "triton", - "triton/_C", - "triton/common", - "triton/compiler", - "triton/debugger", - "triton/language", - "triton/language/extra", - "triton/ops", - "triton/ops/blocksparse", - "triton/runtime", - "triton/runtime/backends", - "triton/third_party/cuda/bin", - "triton/third_party/cuda/include", - "triton/third_party/cuda/lib", - "triton/tools", - ], - install_requires=[ - "filelock", - ], - include_package_data=True, - ext_modules=[CMakeExtension("triton", "triton/_C/")], - cmdclass={"build_ext": CMakeBuild}, - zip_safe=False, - # for PyPI - keywords=["Compiler", "Deep Learning"], - url="https://github.com/openai/triton/", - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Topic :: Software Development :: Build Tools", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - ], - test_suite="tests", - extras_require={ - "build": [ - "cmake>=3.18", - "lit", - ], - "tests": [ - "autopep8", - "flake8", - "isort", - "numpy", - "pytest", - "scipy>=1.7.1", - ], - "tutorials": [ - "matplotlib", - "pandas", - "tabulate", - ], - }, -) diff --git a/python/src/extra/cuda.ll b/python/src/extra/cuda.ll deleted file mode 100644 index 0ab2f6896bdd..000000000000 --- a/python/src/extra/cuda.ll +++ /dev/null @@ -1,17 +0,0 @@ -; ~/.triton/llvm/llvm+mlir-17.0.0-x86_64-linux-gnu-ubuntu-18.04-release/bin/llvm-as ./src/extra/cuda.ll -o ./triton/language/extra/cuda.bc - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "nvptx64-nvidia-cuda" - - -define i64 @globaltimer() #0 { - %1 = call i64 asm sideeffect "mov.u64 $0, %globaltimer;", "=l"() nounwind - ret i64 %1 -} - -define i32 @smid() #0 { - %1 = call i32 asm "mov.u32 $0, %smid;", "=r"() nounwind - ret i32 %1 -} - -attributes #0 = { alwaysinline nounwind } diff --git a/python/src/main.cc b/python/src/main.cc deleted file mode 100644 index 801a83a4b19f..000000000000 --- a/python/src/main.cc +++ /dev/null @@ -1,11 +0,0 @@ -#include - -void init_superblocking(pybind11::module &m); -void init_torch_utils(pybind11::module &m); -void init_triton(pybind11::module &m); -void init_cutlass(pybind11::module &m); - -PYBIND11_MODULE(libtriton, m) { - m.doc() = "Python bindings to the C++ Triton API"; - init_triton(m); -} diff --git a/python/src/triton.cc b/python/src/triton.cc deleted file mode 100644 index 
66c112d09bc0..000000000000 --- a/python/src/triton.cc +++ /dev/null @@ -1,1703 +0,0 @@ -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Verifier.h" - -#include "mlir/Conversion/Passes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/Passes.h" - -#include "mlir/Parser/Parser.h" -#include "mlir/Support/FileUtilities.h" - -#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" -#include "mlir/Dialect/Index/IR/IndexDialect.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "triton/Analysis/Allocation.h" -#include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" -#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" -#include "triton/Dialect/Triton/IR/Dialect.h" -#include "triton/Dialect/Triton/IR/Types.h" -#include "triton/Dialect/Triton/Transforms/Passes.h" -#include "triton/Dialect/TritonGPU/Transforms/Passes.h" -#include "triton/Target/HSACO/HSACOTranslation.h" -#include "triton/Target/LLVMIR/LLVMIRTranslation.h" -#include "triton/Target/PTX/PTXTranslation.h" -#include "triton/Tools/Sys/GetEnv.hpp" -#include "triton/Tools/Sys/GetPlatform.hpp" - -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/FileUtilities.h" -#include "llvm/Support/raw_ostream.h" - -#include "llvm/Support/SourceMgr.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -enum backend_t { - HOST, - CUDA, - ROCM, -}; - -void init_triton_runtime(py::module &&m) { - // wrap backend_t - py::enum_(m, "backend") - .value("HOST", HOST) - .value("CUDA", CUDA) - .value("ROCM", ROCM) - .export_values(); -} - -/*****************************************************************************/ -/* Python bindings for triton::ir */ -/*****************************************************************************/ - -void init_triton_ir(py::module &&m) { - using ret = py::return_value_policy; - using namespace pybind11::literals; - - py::enum_(m, "PADDING_OPTION") - .value("PAD_ZERO", mlir::triton::PaddingOption::PAD_ZERO) - .value("PAD_NAN", mlir::triton::PaddingOption::PAD_NAN) - .export_values(); - - py::enum_(m, "CACHE_MODIFIER") - .value("NONE", mlir::triton::CacheModifier::NONE) - .value("CA", mlir::triton::CacheModifier::CA) - .value("CG", mlir::triton::CacheModifier::CG) - .export_values(); - - py::enum_(m, "EVICTION_POLICY") - .value("NORMAL", mlir::triton::EvictionPolicy::NORMAL) - .value("EVICT_FIRST", mlir::triton::EvictionPolicy::EVICT_FIRST) - .value("EVICT_LAST", mlir::triton::EvictionPolicy::EVICT_LAST) - .export_values(); - - py::enum_(m, "ATOMIC_OP") - .value("ADD", mlir::triton::RMWOp::ADD) - .value("FADD", mlir::triton::RMWOp::FADD) - .value("AND", mlir::triton::RMWOp::AND) - .value("OR", mlir::triton::RMWOp::OR) - .value("XOR", mlir::triton::RMWOp::XOR) - .value("XCHG", mlir::triton::RMWOp::XCHG) - .value("MAX", mlir::triton::RMWOp::MAX) - .value("MIN", mlir::triton::RMWOp::MIN) - .value("UMIN", mlir::triton::RMWOp::UMIN) - .value("UMAX", mlir::triton::RMWOp::UMAX); - - py::class_(m, "context") - .def(py::init<>()) - .def("load_triton", [](mlir::MLIRContext &self) { - self.getOrLoadDialect(); - self.getOrLoadDialect(); - self.getOrLoadDialect(); - 
self.getOrLoadDialect(); - // we load LLVM because the frontend uses LLVM.undef for - // some placeholders - self.getOrLoadDialect(); - }); - // .def(py::init([](){ - // mlir::MLIRContext context; - // context.getOrLoadDialect(); - // // TODO: should we return a (raw/unique) pointer here? - // return context; - // })); - - // py::class_(m, "value") - // .def("multiple_of", [](ir::value *self, int val) { - // if (auto *instr = dynamic_cast(self)) { - // instr->set_metadata(ir::metadata::multiple_of, val); - // } else - // throw std::runtime_error("multiple_of"); - // }) - // .def("max_contiguous", [](ir::value *self, int val) { - // if (auto *instr = dynamic_cast(self)) { - // instr->set_metadata(ir::metadata::max_contiguous, val); - // } else - // throw std::runtime_error("max_contiguous"); - // }) - // .def("set_fdiv_ieee_rounding", [](ir::value *self, bool val) { - // if (auto *instr = dynamic_cast(self)) - // instr->set_fdiv_ieee_rounding(val); - // else - // throw std::runtime_error("set_fdiv_ieee_rounding"); - // }) - // .def("ops", [](ir::value *self) { - // if (auto *instr = dynamic_cast(self)) { - // return instr->ops(); - // } - // throw std::runtime_error("cannot use ops()"); - // }) - // .def("replace_all_uses_with", &ir::value::replace_all_uses_with) - // .def("erase_from_parent", [](ir::value *self) { - // if (auto *instr = dynamic_cast(self)) - // return instr->erase_from_parent(); - // throw std::runtime_error("cannot use erase_from_parent"); - // }) - // .def_property("name", &ir::value::get_name, &ir::value::set_name) - // .def_property_readonly("type", &ir::value::get_type); - - // // // Do we need under in TritonIR ? - // // py::class_(m, "undef") - // // .def("get", &ir::undef_value::get, ret::reference); - - py::class_(m, "type") - .def("is_integer", &mlir::Type::isInteger) - .def("is_fp16", &mlir::Type::isF16) - .def("__str__", [](mlir::Type &self) { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return os.str(); - }); - - py::class_(m, "function_type") - .def("param_types", [](mlir::FunctionType &self) { - return std::vector(self.getInputs().begin(), - self.getInputs().end()); - }); - - py::class_(m, "value") - .def("set_attr", - [](mlir::Value &self, std::string &name, - mlir::Attribute &attr) -> void { - if (mlir::Operation *definingOp = self.getDefiningOp()) - definingOp->setAttr(name, attr); - else { - auto arg = self.cast(); - int id = arg.getArgNumber(); - std::string attrName = name + "_arg" + std::to_string(id); - mlir::Block *owner = arg.getOwner(); - if (owner->isEntryBlock() && - !mlir::isa(owner->getParentOp())) { - owner->getParentOp()->setAttr(attrName, attr); - } - } - }) - .def("get_context", &mlir::Value::getContext) - .def("replace_all_uses_with", - [](mlir::Value &self, mlir::Value &newValue) { - self.replaceAllUsesWith(newValue); - }) - .def("get_type", &mlir::Value::getType); - - py::class_(m, "block_argument"); - - py::class_(m, "region") - .def("get_parent_region", &mlir::Region::getParentRegion, ret::reference) - .def("size", [](mlir::Region &self) { return self.getBlocks().size(); }) - .def("empty", &mlir::Region::empty); - - py::class_(m, "block") - .def("arg", - [](mlir::Block &self, int index) -> mlir::BlockArgument { - return self.getArgument(index); - }) - .def("add_argument", - [](mlir::Block &self, mlir::Type ty) { - auto loc = mlir::UnknownLoc::get(ty.getContext()); - self.addArgument(ty, loc); - }) - .def("get_num_arguments", &mlir::Block::getNumArguments) - .def("dump", &mlir::Block::dump) - 
.def("move_before", &mlir::Block::moveBefore) - .def("insert_before", &mlir::Block::insertBefore) - .def("get_parent", &mlir::Block::getParent, ret::reference) - .def("merge_block_before", - [](mlir::Block &self, mlir::Block &dst) { - // ref: RewriterBase::mergeBlocks() - if (self.getNumArguments() != 0) - throw std::runtime_error( - "This block has arguments, don't merge"); - dst.getOperations().splice(dst.begin(), self.getOperations()); - self.dropAllUses(); - self.erase(); - }) - .def("replace_use_in_block_with", - [](mlir::Block &self, mlir::Value &v, mlir::Value &newVal) { - v.replaceUsesWithIf(newVal, [&](mlir::OpOperand &operand) { - mlir::Operation *user = operand.getOwner(); - mlir::Block *currentBlock = user->getBlock(); - while (currentBlock) { - if (currentBlock == &self) - return true; - // Move up one level - currentBlock = - currentBlock->getParent()->getParentOp()->getBlock(); - } - return false; - }); - }) - .def("__str__", - [](mlir::Block &self) { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return str; - }) - .def("has_terminator", - [](mlir::Block &self) { - return !self.empty() && - self.back().hasTrait(); - }) - .def("has_return", - [](mlir::Block &self) { - return !self.empty() && - self.back().hasTrait(); - }) - .def("erase", [](mlir::Block &self) { self.erase(); }); - - // using eattr = ir::attribute_kind_t; - // py::enum_(m, "attribute_kind") - // .value("readonly", eattr::readonly) - // .value("writeonly", eattr::writeonly) - // .value("noalias", eattr::noalias) - // .value("aligned", eattr::aligned) - // .value("multiple_of", eattr::multiple_of) - // .value("retune", eattr::retune) - // .value("not_implemented", eattr::not_implemented); - - py::class_(m, "attribute"); - py::class_(m, "integer_attr"); - py::class_(m, "bool_attr"); - - // Ops - py::class_(m, "OpState") - .def("set_attr", - [](mlir::OpState &self, std::string &name, - mlir::Attribute &attr) -> void { self->setAttr(name, attr); }) - .def( - "get_num_results", - [](mlir::OpState &self) -> unsigned { return self->getNumResults(); }) - .def("get_result", - [](mlir::OpState &self, unsigned idx) -> mlir::Value { - return self->getResult(idx); - }) - .def( - "get_region", - [](mlir::OpState &self, unsigned idx) -> mlir::Region & { - return self->getRegion(idx); - }, - ret::reference) - .def( - "get_body", - [](mlir::scf::ForOp &self, unsigned idx) -> mlir::Block * { - return self.getBody(idx); - }, - ret::reference) - .def("dump", [](mlir::OpState &self) { self->dump(); }) - .def("__str__", - [](mlir::OpState &self) -> std::string { - std::string str; - llvm::raw_string_ostream os(str); - self->print(os); - return str; - }) - .def("append_operand", - [](mlir::OpState &self, mlir::Value &val) { - self->insertOperands(self->getNumOperands(), val); - }) - .def("verify", [](mlir::OpState &self) -> bool { - return mlir::succeeded(mlir::verify(self.getOperation())); - }); - // scf Ops - py::class_(m, "ForOp") - .def("get_induction_var", &mlir::scf::ForOp::getInductionVar); - - py::class_(m, "IfOp") - .def("get_then_block", &mlir::scf::IfOp::thenBlock, ret::reference) - .def("get_else_block", &mlir::scf::IfOp::elseBlock, ret::reference) - .def("get_then_yield", &mlir::scf::IfOp::thenYield) - .def("get_else_yield", &mlir::scf::IfOp::elseYield); - py::class_(m, "YieldOp"); - py::class_(m, "WhileOp") - .def("get_before", &mlir::scf::WhileOp::getBefore, ret::reference) - .def("get_after", &mlir::scf::WhileOp::getAfter, ret::reference); - py::class_(m, "ConditionOp"); - - // 
dynamic_attr is used to transfer ownership of the MLIR context to the - // module - py::class_(m, "module", py::dynamic_attr()) - .def("dump", &mlir::ModuleOp::dump) - .def("str", - [](mlir::ModuleOp &self) -> std::string { - std::string str; - llvm::raw_string_ostream os(str); - self.print(os); - return str; - }) - .def("push_back", - [](mlir::ModuleOp &self, mlir::triton::FuncOp &funcOp) -> void { - self.push_back(funcOp); - }) - .def("has_function", - [](mlir::ModuleOp &self, std::string &funcName) -> bool { - if (self.lookupSymbol(funcName)) - return true; - return false; - }) - .def("get_function", - [](mlir::ModuleOp &self, - std::string &funcName) -> mlir::triton::FuncOp { - return self.lookupSymbol(funcName); - }) - .def("get_single_function", - [](mlir::ModuleOp &self) -> mlir::triton::FuncOp { - llvm::SmallVector funcs; - self.walk( - [&](mlir::triton::FuncOp func) { funcs.push_back(func); }); - if (funcs.size() != 1) - throw std::runtime_error("Expected a single function"); - return funcs[0]; - }); - - m.def("make_attr", - [](const std::vector &values, mlir::MLIRContext &context) { - return mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get( - {static_cast(values.size())}, - mlir::IntegerType::get(&context, 32)), - values) - .cast(); - }); - - m.def( - "parse_mlir_module", - [](const std::string &inputFilename, mlir::MLIRContext &context) { - // initialize registry - // note: we initialize llvm for undef - mlir::DialectRegistry registry; - registry.insert< - mlir::triton::TritonDialect, mlir::triton::gpu::TritonGPUDialect, - mlir::math::MathDialect, mlir::arith::ArithDialect, - mlir::index::IndexDialect, mlir::scf::SCFDialect, - mlir::cf::ControlFlowDialect, mlir::LLVM::LLVMDialect>(); - context.appendDialectRegistry(registry); - context.loadAllAvailableDialects(); - - // parse module - mlir::OwningOpRef module = - mlir::parseSourceFile(inputFilename, &context); - if (!module) - throw std::runtime_error("Parse MLIR file failed."); - // locations are incompatible with ptx < 7.5 ! - module->walk([](mlir::Operation *op) { - op->setLoc(mlir::UnknownLoc::get(op->getContext())); - }); - - return module->clone(); - }, - ret::take_ownership); - - py::class_(m, "function") - // .def_property_readonly("attrs", &ir::function::attrs) - // .def("add_attr", &ir::function::add_attr); - .def("args", - [](mlir::triton::FuncOp &self, unsigned idx) -> mlir::BlockArgument { - return self.getArgument(idx); - }) - .def( - "add_entry_block", - [](mlir::triton::FuncOp &self) -> mlir::Block * { - return self.addEntryBlock(); - }, - ret::reference) - .def( - "set_arg_attr", - [](mlir::triton::FuncOp &self, int arg_no, const std::string &name, - int val) { - // set arg attributes "name" to value "val" - auto attrTy = mlir::IntegerType::get(self.getContext(), 32); - self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val)); - }, - ret::reference) - .def("finalize", - [](mlir::triton::FuncOp &self) -> void { - // Remove dead code - // 1. 
Unreachable code after return - self.walk([&](mlir::Block *block) { - mlir::Operation *retOp = nullptr; - // It's better to not use walk here because we only want to - // check operations in the current block - for (auto &op : block->getOperations()) { - if (mlir::isa(op)) - if (retOp == nullptr) { - retOp = &op; - break; - } - } - if (retOp && retOp != &block->back()) { - auto pos = retOp->getIterator(); - pos++; - auto *newBlock = block->splitBlock(pos); - newBlock->erase(); - } - }); - }) - .def_property_readonly("type", &mlir::triton::FuncOp::getFunctionType) - .def("reset_type", &mlir::triton::FuncOp::setType); - - py::class_(m, "InsertPoint"); - - py::class_(m, "builder", py::dynamic_attr()) - .def(py::init()) - // // getters - .def_property_readonly("context", &mlir::OpBuilder::getContext, - ret::reference) - .def("create_module", - [](mlir::OpBuilder &self) -> mlir::ModuleOp { - auto loc = self.getUnknownLoc(); - return self.create(loc); - }) - .def("ret", - [](mlir::OpBuilder &self, std::vector &vals) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, vals); - }) - .def("call", - [](mlir::OpBuilder &self, mlir::triton::FuncOp &func, - std::vector &args) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - auto callOp = self.create(loc, func, args); - return callOp; - }) - // insertion block/point - .def("set_insertion_point_to_start", - [](mlir::OpBuilder &self, mlir::Block &block) -> void { - self.setInsertionPointToStart(&block); - }) - .def("set_insertion_point_to_end", - [](mlir::OpBuilder &self, mlir::Block &block) { - self.setInsertionPointToEnd(&block); - }) - .def("set_insertion_point_after", - [](mlir::OpBuilder &self, mlir::Operation &op) { - self.setInsertionPointAfter(&op); - }) - .def( - "get_insertion_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - return self.getInsertionBlock(); - }, - ret::reference) - .def("get_insertion_point", &mlir::OpBuilder::saveInsertionPoint) - .def("restore_insertion_point", &mlir::OpBuilder::restoreInsertionPoint) - // .def("set_insert_point", [](ir::builder *self, - // std::pair pt) { - // ir::basic_block *bb = pt.first; - // ir::instruction *instr = pt.second; - // if (instr) { - // if (bb != instr->get_parent()) - // throw std::runtime_error("invalid insertion point, instr not in - // bb"); - // self->set_insert_point(instr); - // } else { - // assert(bb); - // self->set_insert_point(bb); - // } - // }) - // Attr - .def("get_bool_attr", &mlir::OpBuilder::getBoolAttr) - .def("get_int32_attr", &mlir::OpBuilder::getI32IntegerAttr) - // Use arith.ConstantOp to create constants - // Constants - .def("get_int1", - [](mlir::OpBuilder &self, bool v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI1Type())); - }) - .def("get_int8", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI8Type())); - }) - .def("get_int16", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI16Type())); - }) - .def("get_int32", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI32Type())); - }) - .def("get_int64", - [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value(self.create( - loc, v, self.getI64Type())); - }) - .def("get_bf16", - [](mlir::OpBuilder 
&self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto type = self.getBF16Type(); - return self.create( - loc, - mlir::APFloat(type.getFloatSemantics(), std::to_string(v)), - type); - }) - .def("get_fp16", - [](mlir::OpBuilder &self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF16FloatAttr(v)); - }) - .def("get_fp32", - [](mlir::OpBuilder &self, float v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF32FloatAttr(v)); - }) - .def("get_fp64", - [](mlir::OpBuilder &self, double v) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getF64FloatAttr(v)); - }) - .def("get_null_value", - [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (auto floatTy = type.dyn_cast()) - return self.create( - loc, mlir::APFloat(floatTy.getFloatSemantics(), 0), floatTy); - else if (auto intTy = type.dyn_cast()) - return self.create(loc, 0, intTy); - else - throw std::runtime_error("Not implemented"); - }) - .def("get_all_ones_value", - [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - uint64_t val = 0xFFFFFFFFFFFFFFFF; - if (auto intTy = type.dyn_cast()) - return self.create(loc, val, intTy); - else - throw std::runtime_error("Not implemented"); - }) - - // Types - .def("get_void_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getNoneType(); - }) - .def("get_int1_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getI1Type(); - }) // or ret::copy? - .def("get_int8_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI8Type(); }) - .def("get_int16_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(16); - }) - .def( - "get_int32_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI32Type(); }) - .def( - "get_int64_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getI64Type(); }) - .def("get_fp8e4_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(); - }) - .def("get_fp8e5_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getType(); - }) - .def( - "get_half_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF16Type(); }) - .def("get_bf16_ty", - [](mlir::OpBuilder &self) -> mlir::Type { - return self.getBF16Type(); - }) - .def( - "get_float_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF32Type(); }) - .def( - "get_double_ty", - [](mlir::OpBuilder &self) -> mlir::Type { return self.getF64Type(); }) - .def("get_ptr_ty", - [](mlir::OpBuilder &self, mlir::Type &type, - int addrSpace) -> mlir::Type { - return mlir::triton::PointerType::get(type, addrSpace); - }) - .def("get_block_ty", - [](mlir::OpBuilder &self, mlir::Type &elementType, - std::vector &shape) -> mlir::Type { - return mlir::RankedTensorType::get(shape, elementType); - }) - .def("get_function_ty", - [](mlir::OpBuilder &self, std::vector inTypes, - std::vector outTypes) -> mlir::Type { - return self.getFunctionType(inTypes, outTypes); - }) - - // Ops - .def("get_or_insert_function", - [](mlir::OpBuilder &self, mlir::ModuleOp &module, - std::string &funcName, mlir::Type &funcType, - std::string &visibility, bool noinline) -> mlir::triton::FuncOp { - if (mlir::Operation *funcOperation = module.lookupSymbol(funcName)) - return llvm::dyn_cast(funcOperation); - auto loc = self.getUnknownLoc(); - if (auto funcTy = funcType.dyn_cast()) { - llvm::SmallVector attrs = { - 
mlir::NamedAttribute(self.getStringAttr("sym_visibility"), - self.getStringAttr(visibility)), - mlir::NamedAttribute(self.getStringAttr("noinline"), - self.getBoolAttr(noinline))}; - return self.create(loc, funcName, funcTy, - attrs); - } - throw std::runtime_error("invalid function type"); - }) - .def( - "create_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - mlir::Region *parent = self.getBlock()->getParent(); - return self.createBlock(parent); - }, - ret::reference) - .def( - "create_block_with_parent", - [](mlir::OpBuilder &self, mlir::Region &parent, - std::vector &argTypes) -> mlir::Block * { - auto argLoc = self.getUnknownLoc(); - llvm::SmallVector argLocs(argTypes.size(), - argLoc); - return self.createBlock(&parent, {}, argTypes, argLocs); - }, - ret::reference) - .def( - "new_block", - [](mlir::OpBuilder &self) -> mlir::Block * { - return new mlir::Block(); - }, - ret::reference) - // Unstructured control flow - .def("create_cond_branch", - [](mlir::OpBuilder &self, mlir::Value condition, - mlir::Block *trueDest, mlir::Block *falseDest) { - auto loc = self.getUnknownLoc(); - self.create(loc, condition, trueDest, - falseDest); - return; - }) - .def("create_branch", - [](mlir::OpBuilder &self, mlir::Block *dest, - std::vector &args) { - auto loc = self.getUnknownLoc(); - self.create(loc, dest, args); - return; - }) - // Structured control flow - .def("create_for_op", - [](mlir::OpBuilder &self, mlir::Value &lb, mlir::Value &ub, - mlir::Value &step, - std::vector &initArgs) -> mlir::scf::ForOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, lb, ub, step, initArgs); - }) - .def("create_if_op", - [](mlir::OpBuilder &self, std::vector &retTypes, - mlir::Value &condition, bool withElse) -> mlir::scf::IfOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, retTypes, condition, - withElse); - }) - .def("create_yield_op", - [](mlir::OpBuilder &self, - std::vector &yields) -> mlir::scf::YieldOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, yields); - }) - .def("create_while_op", - [](mlir::OpBuilder &self, std::vector &retTypes, - std::vector &initArgs) -> mlir::scf::WhileOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, retTypes, initArgs); - }) - .def("create_condition_op", - [](mlir::OpBuilder &self, mlir::Value &cond, - std::vector &args) -> mlir::scf::ConditionOp { - auto loc = self.getUnknownLoc(); - return self.create(loc, cond, args); - }) - - // miscellaneous - .def("create_make_range", - [](mlir::OpBuilder &self, int start, int end) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto retType = - mlir::RankedTensorType::get({end - start}, self.getI32Type()); - return self.create(loc, retType, start, - end); - }) - - // Cast instructions - // Conversions for custom FP types (FP8) - .def("create_fp_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - // Conversions for standard LLVM builtin types - .def("create_bitcast", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_si_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_ui_to_fp", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - 
auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_to_si", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_to_ui", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_ext", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_fp_trunc", - [](mlir::OpBuilder &self, mlir::Value &src, - mlir::Type &dstType) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, dstType, src); - }) - .def("create_int_cast", - [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType, - bool isSigned) -> mlir::Value { - auto loc = self.getUnknownLoc(); - // get element type if necessary - mlir::Type srcType = src.getType(); - auto srcTensorType = srcType.dyn_cast(); - auto dstTensorType = dstType.dyn_cast(); - mlir::Type srcEltType = srcType; - mlir::Type dstEltType = dstType; - if (dstTensorType && srcTensorType) { - dstEltType = dstTensorType.getElementType(); - srcEltType = srcTensorType.getElementType(); - } - unsigned srcWidth = srcEltType.getIntOrFloatBitWidth(); - unsigned dstWidth = dstEltType.getIntOrFloatBitWidth(); - if (srcWidth == dstWidth) - return self.create(loc, dstType, src); - else if (srcWidth > dstWidth) - return self.create(loc, dstType, src); - else if (isSigned) - return self.create(loc, dstType, src); - else - return self.create(loc, dstType, src); - }) - .def("create_to_index", - [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getIndexType(), input); - }) - .def("create_index_to_si", - [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI64Type(), input); - }) - .def("create_fmul", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fdiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_frem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fadd", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_fsub", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_mul", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_sdiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_udiv", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return 
self.create(loc, lhs, rhs); - }) - .def("create_srem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_urem", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_add", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_sub", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_shl", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_lshr", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - .def("create_ashr", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return mlir::Value( - self.create(loc, lhs, rhs)); - }) - // AddPtr (similar to GEP) - .def("create_addptr", - [](mlir::OpBuilder &self, mlir::Value &ptr, - mlir::Value &offset) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, ptr.getType(), ptr, - offset); - }) - // Comparison (int) - .def("create_icmpSLE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sle, lhs, rhs); - }) - .def("create_icmpSLT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::slt, lhs, rhs); - }) - .def("create_icmpSGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sge, lhs, rhs); - }) - .def("create_icmpSGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::sgt, lhs, rhs); - }) - .def("create_icmpULE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ule, lhs, rhs); - }) - .def("create_icmpULT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ult, lhs, rhs); - }) - .def("create_icmpUGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::uge, lhs, rhs); - }) - .def("create_icmpUGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ugt, lhs, rhs); - }) - .def("create_icmpEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return 
self.create( - loc, mlir::arith::CmpIPredicate::eq, lhs, rhs); - }) - .def("create_icmpNE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpIPredicate::ne, lhs, rhs); - }) - // Comparison (float) - .def("create_fcmpOLT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OLT, lhs, rhs); - }) - .def("create_fcmpOGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OGT, lhs, rhs); - }) - .def("create_fcmpOLE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OLE, lhs, rhs); - }) - .def("create_fcmpOGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OGE, lhs, rhs); - }) - .def("create_fcmpOEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::OEQ, lhs, rhs); - }) - .def("create_fcmpONE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ONE, lhs, rhs); - }) - .def("create_fcmpULT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ULT, lhs, rhs); - }) - .def("create_fcmpUGT", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UGT, lhs, rhs); - }) - .def("create_fcmpULE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::ULE, lhs, rhs); - }) - .def("create_fcmpUGE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UGE, lhs, rhs); - }) - .def("create_fcmpUEQ", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UEQ, lhs, rhs); - }) - .def("create_fcmpUNE", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, mlir::arith::CmpFPredicate::UNE, lhs, rhs); - }) - // // Logical - .def("create_and", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_xor", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - .def("create_or", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, lhs, rhs); - }) - // Input/Output - 
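// The bindings below wrap Triton's memory ops: create_load/create_store take a
// pointer tensor plus cache-modifier and eviction-policy hints; the
// create_tensor_pointer_* variants operate on block pointers (built by
// create_make_block_ptr further down) and accept a boundary-check list; the
// create_masked_* variants additionally thread a mask through, with
// create_masked_load falling back to a default-constructed mlir::Value when no
// `other` operand is supplied. Like the rest of this builder, every op is
// created at getUnknownLoc() rather than a real source location.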
.def("create_load", - [](mlir::OpBuilder &self, mlir::Value &ptrs, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptrs, cacheModifier, evictionPolicy, isVolatile); - }) - .def("create_store", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &value, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptrs, value, cacheModifier, - evictionPolicy); - }) - .def("create_tensor_pointer_load", - [](mlir::OpBuilder &self, mlir::Value &ptr, - std::vector &boundaryCheck, - std::optional paddingOption, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptr, boundaryCheck, paddingOption, cacheModifier, - evictionPolicy, isVolatile); - }) - .def("create_tensor_pointer_store", - [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &val, - std::vector &boundaryCheck, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptr, val, boundaryCheck, - cacheModifier, evictionPolicy); - }) - .def("create_masked_load", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &mask, - std::optional &other, - mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy, - bool isVolatile) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, ptrs, mask, other.value_or(mlir::Value()), cacheModifier, - evictionPolicy, isVolatile); - }) - .def("create_masked_store", - [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &val, - mlir::Value &mask, mlir::triton::CacheModifier cacheModifier, - mlir::triton::EvictionPolicy evictionPolicy) -> void { - auto loc = self.getUnknownLoc(); - self.create(loc, ptrs, val, mask, - cacheModifier, evictionPolicy); - }) - .def("create_view", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType() - .dyn_cast() - .getElementType(); - return self.create( - loc, mlir::RankedTensorType::get(shape, argType), arg); - }) - .def( - "create_expand_dims", - [](mlir::OpBuilder &self, mlir::Value &arg, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType().dyn_cast(); - auto argEltType = argType.getElementType(); - std::vector retShape = argType.getShape(); - retShape.insert(retShape.begin() + axis, 1); - return self.create( - loc, mlir::RankedTensorType::get(retShape, argEltType), arg, - axis); - }) - .def("create_cat", - [](mlir::OpBuilder &self, mlir::Value &lhs, - mlir::Value &rhs) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto lhsType = lhs.getType().dyn_cast(); - auto rhsType = rhs.getType().dyn_cast(); - if (!(lhsType.getShape().size() == 1 && - rhsType.getShape().size() == 1)) - throw std::runtime_error( - "shape not supported by cat. 
Expecting rank-1 inputs"); - std::vector shape{lhsType.getShape()[0] + - rhsType.getShape()[0]}; - return self.create( - loc, - mlir::RankedTensorType::get(shape, lhsType.getElementType()), - lhs, rhs); - }) - .def("create_trans", - [](mlir::OpBuilder &self, mlir::Value &arg) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType().dyn_cast(); - auto argEltType = argType.getElementType(); - std::vector retShape = argType.getShape(); - std::reverse(retShape.begin(), retShape.end()); - return self.create( - loc, mlir::RankedTensorType::get(retShape, argEltType), arg); - }) - .def("create_broadcast", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (auto argType = - arg.getType().dyn_cast()) - return self.createOrFold( - loc, - mlir::RankedTensorType::get(shape, argType.getElementType()), - arg); - throw std::runtime_error( - "arg is not of RankedTensorType, use create_splat"); - }) - .def("create_splat", - [](mlir::OpBuilder &self, mlir::Value &arg, - std::vector &shape) -> mlir::Value { - auto loc = self.getUnknownLoc(); - auto argType = arg.getType(); - auto ret = self.createOrFold( - loc, mlir::RankedTensorType::get(shape, argType), arg); - return ret; - }) - // // atomic - .def("create_atomic_cas", - [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &cmp, - mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - mlir::Type dstType; - if (auto srcTensorType = - ptr.getType().dyn_cast()) { - mlir::Type dstElemType = srcTensorType.getElementType() - .cast() - .getPointeeType(); - dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), - dstElemType); - } else { - auto ptrType = mlir::getElementTypeOrSelf(ptr) - .cast(); - dstType = ptrType.getPointeeType(); - } - return self.create(loc, dstType, ptr, - cmp, val); - }) - .def("create_atomic_rmw", - [](mlir::OpBuilder &self, mlir::triton::RMWOp rmwOp, - mlir::Value &ptr, mlir::Value &val, - mlir::Value &mask) -> mlir::Value { - auto loc = self.getUnknownLoc(); - mlir::Type dstType; - if (auto srcTensorType = - ptr.getType().dyn_cast()) { - mlir::Type dstElemType = srcTensorType.getElementType() - .cast() - .getPointeeType(); - dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), - dstElemType); - } else { - auto ptrType = mlir::getElementTypeOrSelf(ptr) - .cast(); - dstType = ptrType.getPointeeType(); - } - return self.create(loc, dstType, rmwOp, - ptr, val, mask); - }) - // External - .def("create_extern_elementwise", - [](mlir::OpBuilder &self, const std::string &libName, - const std::string &libPath, const std::string &symbol, - std::vector &argList, mlir::Type retType, - bool isPure) -> mlir::Value { - auto loc = self.getUnknownLoc(); - if (isPure) - return self.create( - loc, retType, argList, libName, libPath, symbol); - else - return self.create( - loc, retType, argList, libName, libPath, symbol); - }) - // Built-in instruction - .def("create_get_program_id", - [](mlir::OpBuilder &self, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI32Type(), self.getI32IntegerAttr(axis)); - }) - .def("create_get_num_programs", - [](mlir::OpBuilder &self, int axis) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, self.getI32Type(), self.getI32IntegerAttr(axis)); - }) - .def("create_dot", - [](mlir::OpBuilder &self, mlir::Value &a, mlir::Value &b, - mlir::Value &c, bool allowTF32) -> mlir::Value { - auto loc = 
self.getUnknownLoc(); - return self.create(loc, c.getType(), a, b, c, - allowTF32); - }) - .def("create_exp", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_cos", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_sin", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_log", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_sqrt", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_fabs", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_iabs", - [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, val); - }) - .def("create_reduce", - [](mlir::OpBuilder &self, std::vector operands, - int axis) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - return self.create(loc, operands, axis); - }) - .def("create_reduce_ret", - [](mlir::OpBuilder &self, py::args args) -> mlir::OpState { - auto loc = self.getUnknownLoc(); - llvm::SmallVector return_values; - for (const auto &arg : args) { - return_values.push_back(py::cast(arg)); - } - return self.create(loc, - return_values); - }) - .def("create_ptr_to_int", - [](mlir::OpBuilder &self, mlir::Value &val, - mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, type, val); - }) - .def("create_int_to_ptr", - [](mlir::OpBuilder &self, mlir::Value &val, - mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, type, val); - }) - .def("create_select", - [](mlir::OpBuilder &self, mlir::Value &condition, - mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, condition, - trueValue, falseValue); - }) - .def("create_print", - [](mlir::OpBuilder &self, const std::string &prefix, - const std::vector &values) -> void { - auto loc = self.getUnknownLoc(); - self.create( - loc, - mlir::StringAttr::get(self.getContext(), - llvm::StringRef(prefix)), - values); - }) - .def("create_assert", - [](mlir::OpBuilder &self, mlir::Value &condition, - const std::string &message, const std::string &fileName, - const std::string &funcName, unsigned lineNo) -> void { - auto loc = self.getUnknownLoc(); - auto messageAttr = mlir::StringAttr::get(self.getContext(), - llvm::StringRef(message)); - auto fileNameAttr = mlir::StringAttr::get( - self.getContext(), llvm::StringRef(fileName)); - auto funcNameAttr = mlir::StringAttr::get( - self.getContext(), llvm::StringRef(funcName)); - auto lineNoAttr = self.getI32IntegerAttr(lineNo); - self.create(loc, condition, messageAttr, - fileNameAttr, funcNameAttr, - lineNoAttr); - }) - // Undef - .def("create_undef", - [](mlir::OpBuilder &self, mlir::Type &type) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create<::mlir::LLVM::UndefOp>(loc, type); - }) - // Force GPU barrier - .def("create_barrier", - [](mlir::OpBuilder &self) { - auto loc = self.getUnknownLoc(); - self.create(loc); - }) - // Make a block pointer 
(tensor pointer in Triton IR) - .def("create_make_block_ptr", - [](mlir::OpBuilder &self, mlir::Value &base, - std::vector &shape, - std::vector &strides, - std::vector &offsets, - std::vector &tensorShape, - std::vector &order) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create( - loc, base, shape, strides, offsets, tensorShape, order); - }) - // Advance a block pointer - .def("create_advance", - [](mlir::OpBuilder &self, mlir::Value &ptr, - std::vector &offsets) -> mlir::Value { - auto loc = self.getUnknownLoc(); - return self.create(loc, ptr.getType(), - ptr, offsets); - }); - - py::class_(m, "pass_manager") - .def(py::init()) - .def("enable_debug", - [](mlir::PassManager &self) { - auto printingFlags = mlir::OpPrintingFlags(); - printingFlags.elideLargeElementsAttrs(16); - self.enableIRPrinting( - /*shouldPrintBeforePass=*/nullptr, - /*shouldPrintAfterPass=*/ - [](mlir::Pass *pass, mlir::Operation *) { - return ::triton::tools::getBoolEnv("MLIR_ENABLE_DUMP"); - }, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/true, - /*printAfterOnlyOnFailure*/ false, llvm::dbgs(), - printingFlags); - }) - .def("run", - [](mlir::PassManager &self, mlir::ModuleOp &mod) { - // TODO: maybe dump module to file and print error for better - // diagnostics - if (mlir::failed(self.run(mod.getOperation()))) - throw std::runtime_error("PassManager::run failed"); - }) - .def( - "add_sccp_pass", - [](mlir::PassManager &self) { self.addPass(mlir::createSCCPPass()); }) - .def("add_tritongpu_coalesce_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUCoalescePass()); - }) - .def("add_symbol_dce_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createSymbolDCEPass()); - }) - .def("add_inliner_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createInlinerPass()); - }) - .def("add_canonicalizer_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createCanonicalizerPass()); - }) - .def("add_cse_pass", - [](mlir::PassManager &self) { self.addPass(mlir::createCSEPass()); }) - .def("add_licm_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createLoopInvariantCodeMotionPass()); - }) - .def("add_triton_combine_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::triton::createCombineOpsPass()); - }) - .def("add_rewrite_tensor_pointer_pass", - [](mlir::PassManager &self, int computeCapability) { - self.addPass(mlir::triton::createRewriteTensorPointerPass( - computeCapability)); - }) - .def("add_convert_triton_to_tritongpu_pass", - [](mlir::PassManager &self, int numWarps) { - self.addPass( - mlir::triton::createConvertTritonToTritonGPUPass(numWarps)); - }) - .def("add_tritongpu_pipeline_pass", - [](mlir::PassManager &self, int numStages) { - self.addPass(mlir::createTritonGPUPipelinePass(numStages)); - }) - .def("add_tritongpu_prefetch_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUPrefetchPass()); - }) - .def("add_tritongpu_accelerate_matmul_pass", - [](mlir::PassManager &self, int computeCapability) { - self.addPass( - mlir::createTritonGPUAccelerateMatmulPass(computeCapability)); - }) - .def("add_tritongpu_optimize_dot_operands_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUOptimizeDotOperandsPass()); - }) - .def("add_tritongpu_remove_layout_conversions_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPURemoveLayoutConversionsPass()); - }) - .def("add_tritongpu_reorder_instructions_pass", - [](mlir::PassManager &self) { - 
self.addPass(mlir::createTritonGPUReorderInstructionsPass()); - }) - .def("add_tritongpu_decompose_conversions_pass", - [](mlir::PassManager &self) { - self.addPass(mlir::createTritonGPUDecomposeConversionsPass()); - }) - .def("add_triton_gpu_to_llvm", - [](mlir::PassManager &self) { - self.addPass(mlir::triton::createConvertTritonGPUToLLVMPass()); - }) - .def("add_scf_to_cfg", [](mlir::PassManager &self) { - self.addPass(mlir::createConvertSCFToCFPass()); - }); -} - -void init_triton_translation(py::module &m) { - using ret = py::return_value_policy; - - m.def("get_shared_memory_size", [](mlir::ModuleOp mod) { - auto shared = mod->getAttrOfType("triton_gpu.shared"); - return shared.getInt(); - }); - - m.def( - "translate_triton_gpu_to_llvmir", - [](mlir::ModuleOp op, int computeCapability, bool isROCM) { - py::gil_scoped_release allow_threads; - llvm::LLVMContext llvmContext; - auto llvmModule = ::mlir::triton::translateTritonGPUToLLVMIR( - &llvmContext, op, computeCapability, isROCM); - if (!llvmModule) - llvm::report_fatal_error("Failed to translate TritonGPU to LLVM IR."); - - std::string str; - llvm::raw_string_ostream os(str); - llvmModule->print(os, nullptr); - os.flush(); - return str; - }, - ret::take_ownership); - - m.def( - "translate_llvmir_to_ptx", - [](const std::string llvmIR, int capability, int version) -> std::string { - py::gil_scoped_release allow_threads; - // create LLVM module from C++ - llvm::LLVMContext context; - std::unique_ptr buffer = - llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); - llvm::SMDiagnostic error; - std::unique_ptr module = - llvm::parseIR(buffer->getMemBufferRef(), error, context); - if (!module) { - llvm::report_fatal_error( - "failed to parse IR: " + error.getMessage() + - "lineno: " + std::to_string(error.getLineNo())); - } - - // translate module to PTX - auto ptxCode = - triton::translateLLVMIRToPTX(*module, capability, version); - return ptxCode; - }, - ret::take_ownership); - - m.def("compile_ptx_to_cubin", - [](const std::string &ptxCode, const std::string &ptxasPath, - int capability) -> py::object { - std::string cubin; - { - py::gil_scoped_release allow_threads; - - // compile ptx with ptxas - llvm::SmallString<64> fsrc; - llvm::SmallString<64> flog; - llvm::sys::fs::createTemporaryFile("compile-ptx-src", "", fsrc); - llvm::sys::fs::createTemporaryFile("compile-ptx-log", "", flog); - std::string fbin = std::string(fsrc) + ".o"; - llvm::FileRemover logRemover(flog); - llvm::FileRemover binRemover(fbin); - const char *_fsrc = fsrc.c_str(); - const char *_flog = flog.c_str(); - const char *_fbin = fbin.c_str(); - std::ofstream ofs(_fsrc); - ofs << ptxCode << std::endl; - ofs.close(); - std::string cmd; - int err; - cmd = ptxasPath + " -v --gpu-name=sm_" + - std::to_string(capability) + (capability == 90 ? 
"a " : " ") + - _fsrc + " -o " + _fsrc + ".o 2> " + _flog; - - err = system(cmd.c_str()); - if (err != 0) { - err >>= 8; - std::ifstream _log(_flog); - std::string log(std::istreambuf_iterator(_log), {}); - if (err == 255) { - throw std::runtime_error( - "Internal Triton PTX codegen error: \n" + log); - } else if (err == 128 + SIGSEGV) { - throw std::runtime_error("Please run `ptxas " + - fsrc.str().str() + - "` to confirm that this is a " - "bug in `ptxas`\n" + - log); - } else { - throw std::runtime_error("`ptxas` failed with error code " + - std::to_string(err) + ": \n" + log); - } - return {}; - } else { - llvm::FileRemover srcRemover(fsrc); - std::ifstream _cubin(_fbin, std::ios::binary); - cubin = std::string(std::istreambuf_iterator(_cubin), {}); - _cubin.close(); - // Do not return here, exit the gil scope and return below - } - } - py::bytes bytes(cubin); - return std::move(bytes); - }); - - m.def("add_external_libs", - [](mlir::ModuleOp &op, const std::vector &names, - const std::vector &paths) { - ::mlir::triton::addExternalLibs(op, names, paths); - }); - - m.def( - "translate_llvmir_to_hsaco", - [](const std::string llvmIR, std::string gfx_arch, std::string gfx_triple, - std::string gfx_features) -> std::tuple { - // create LLVM module from C++ - llvm::LLVMContext context; - std::unique_ptr buffer = - llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); - llvm::SMDiagnostic error; - std::unique_ptr module = - llvm::parseIR(buffer->getMemBufferRef(), error, context); - // translate module to HSACO - auto hsacoCode = triton::translateLLVMIRToHSACO( - *module, gfx_arch, gfx_triple, gfx_features); - return hsacoCode; - }, - ret::take_ownership); -} - -void init_triton(py::module &m) { - py::module subm = m.def_submodule("triton"); - // init_triton_codegen(subm.def_submodule("code_gen")); - init_triton_runtime(subm.def_submodule("runtime")); - init_triton_ir(subm.def_submodule("ir")); - init_triton_translation(subm); -} diff --git a/python/test/regression/test_functional_regressions.py b/python/test/regression/test_functional_regressions.py deleted file mode 100644 index 02e9d2323f18..000000000000 --- a/python/test/regression/test_functional_regressions.py +++ /dev/null @@ -1,136 +0,0 @@ -import numpy as np -import torch -from numpy.random import RandomState - -import triton -import triton.language as tl - - -def test_chained_matmul(): - # Regression test for issue #1601 - def chained_matmul_reference(a, b, c): - intermediate = torch.einsum('MK,NK->MN', a, b) - return torch.einsum('MN,NK->MK', intermediate, c) - - @triton.jit - def chained_matmul_kernel( - A, # shape: (m, k) - B, # shape: (n, k) - C, # shape: (n, k) - out, # shape: (m, k) - m, n, k: tl.constexpr, - block_m: tl.constexpr, - block_n: tl.constexpr, - block_k: tl.constexpr): - - tl.static_assert(block_k == k, - f"expected block_k == k but got {block_k} != {k}") - - block_ix = tl.program_id(0) - a_tile = (block_ix * block_m + tl.arange(0, block_m))[:, None] * block_k \ - + tl.arange(0, block_k)[None, :] - - a = tl.load(A + a_tile, mask=a_tile < m * k, other=0.0) - - acc = tl.zeros([block_m, block_k], dtype=tl.float32) - - for loop_block_start in range(0, n, block_n): - bc_tile = (loop_block_start + tl.arange(0, block_n))[:, None] * block_k \ - + tl.arange(0, block_k)[None, :] - b = tl.load(B + bc_tile, mask=bc_tile < n * k, other=0.0) - - intermediate = tl.dot(a, tl.trans(b)) - intermediate_mask = ((loop_block_start + tl.arange(0, block_n)) < n)[None, :] \ - * (tl.arange(0, block_m) < m)[:, None] - - intermediate = 
tl.where(intermediate_mask, intermediate, 0.0) - - c = tl.load(C + bc_tile, mask=bc_tile < n * k) - - acc += tl.dot(intermediate.to(A.dtype.element_ty), c) - - tl.store(out + a_tile, acc.to(A.dtype.element_ty), mask=a_tile < m * k) - - m, n, k = 32, 64, 128 - block_m, block_n, block_k = 16, 32, k - - grid = (triton.cdiv(m, block_m),) - a = torch.randint(low=0, high=2, size=(m, k), dtype=torch.float16, - device='cuda') - b = torch.randint(low=0, high=2, size=(n, k), dtype=torch.float16, - device='cuda') - c = torch.randint_like(b, low=0, high=2) - triton_result = torch.zeros_like(a) - - torch_result = chained_matmul_reference(a, b, c) - chained_matmul_kernel[grid](a, b, c, triton_result, m, n, k, - block_m=block_m, block_n=block_n, - block_k=block_k) - - assert (torch_result == triton_result).all() - - -def test_vecmat(): - @triton.jit - def batched_vecmat( - # inputs - A, # shape: [dim_m, dim_k] - B, # shape: [dim_m, dim_n, dim_k] - # dimensions - dim_m, dim_n, dim_k, - # outputs - output, - # block information - block_m: tl.constexpr, block_n: tl.constexpr, block_k: tl.constexpr - ): - m_index = tl.program_id(0) - n_index = tl.program_id(1) - # Output tile - output_tile = (m_index * block_m + tl.arange(0, block_m))[:, None] * dim_n \ - + (n_index * block_n + tl.arange(0, block_n))[None, :] - - vecmat = tl.zeros([block_m, block_n], dtype=A.dtype.element_ty) - k_blocks = dim_k // block_k - for k_index in range(k_blocks): - # Load A tile - a_tile = (m_index * block_m + tl.arange(0, block_m))[:, None] * dim_k \ - + (k_index * block_k + tl.arange(0, block_k))[None, :] - a = tl.load(A + a_tile) - - # Load B tile, transposed to [n, m, k] in order to broadcast A on a - # leading dimension. - b_tile = (m_index * block_m + tl.arange(0, block_m))[None, :, None] * dim_n * dim_k \ - + (n_index * block_n + tl.arange(0, block_n))[:, None, None] * dim_k \ - + (k_index * block_k + tl.arange(0, block_k))[None, None, :] - b = tl.load(B + b_tile) - - expanded_a, _ = tl.broadcast(a, b) - vecmat += tl.trans(tl.sum(expanded_a * b, axis=2)) - - tl.store(output + output_tile, vecmat) - - M, N, K = 128, 128, 128 - block_m, block_n, block_k = 16, 32, 64 - - rs = RandomState(17) - A_vec = rs.randint(0, 4, (M, K)).astype('float32') - B_vec = rs.randint(0, 4, (M, N, K)).astype('float32') - A = A_vec - B = B_vec - - A_tri = torch.tensor(A, device='cuda') - B_tri = torch.tensor(B, device='cuda') - C_tri = torch.zeros((M, N), dtype=torch.float32, device='cuda') - - grid = (M // block_m, N // block_n) - - batched_vecmat[grid](A_tri, B_tri, M, N, K, C_tri, - block_m=block_m, block_n=block_n, block_k=block_k, - num_warps=4, num_stages=1) - - A_expanded = A[:, np.newaxis, :] - A_broadcasted = np.broadcast_to(A_expanded, (M, N, K)) - AB = A_broadcasted * B - C_ref = np.sum(AB, axis=2) - - np.testing.assert_allclose(C_ref, C_tri.cpu().numpy(), rtol=0.01, atol=1e-3) diff --git a/python/test/regression/test_performance.py b/python/test/regression/test_performance.py deleted file mode 100644 index 341248fd06c4..000000000000 --- a/python/test/regression/test_performance.py +++ /dev/null @@ -1,212 +0,0 @@ -import subprocess -import sys - -import pytest -import torch - -import triton -import triton.language as tl -import triton.ops -from triton.testing import get_dram_gbps, get_max_tensorcore_tflops - -DEVICE_NAME = {7: 'v100', 8: 'a100'}[torch.cuda.get_device_capability()[0]] - -####################### -# Utilities -####################### - - -def print_perf(cur_ms, cur_util, ref_util): - # print on the same line cur_ms, cur_util 
and ref_util with 3 decimal places - print(f'{cur_ms:.3f} ms \t cur: {cur_util:.3f} \t ref: {ref_util:.3f} \t dif={cur_util - ref_util:.3f}', end='\t') - - -def nvsmi(attrs): - attrs = ','.join(attrs) - cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(',') - ret = [int(x) for x in ret] - return ret - - -####################### -# Matrix Multiplication -####################### - -sm_clocks = {'v100': 1350, 'a100': 1350} -mem_clocks = {'v100': 877, 'a100': 1215} - -matmul_data = { - 'v100': { - # square - (512, 512, 512): {'float16': 0.158}, - (1024, 1024, 1024): {'float16': 0.466}, - (2048, 2048, 2048): {'float16': 0.695}, - (4096, 4096, 4096): {'float16': 0.831}, - (8192, 8192, 8192): {'float16': 0.849}, - # tall-skinny - (16, 1024, 1024): {'float16': 0.0128}, - (16, 4096, 4096): {'float16': 0.0883}, - (16, 8192, 8192): {'float16': 0.101}, - (64, 1024, 1024): {'float16': 0.073}, - (64, 4096, 4096): {'float16': 0.270}, - (64, 8192, 8192): {'float16': 0.459}, - (1024, 64, 1024): {'float16': 0.0692}, - (4096, 64, 4096): {'float16': 0.264}, - (8192, 64, 8192): {'float16': 0.452}, - }, - # NOTE: - # A100 in the CI server is slow-ish for some reason. - # On some other servers, we are getting about 90% peak for 8kx8x8k float16 - 'a100': { - (512, 512, 512): {'float16': 0.084, 'float32': 0.13, 'int8': 0.05}, - (1024, 1024, 1024): {'float16': 0.332, 'float32': 0.35, 'int8': 0.169}, - (2048, 2048, 2048): {'float16': 0.641, 'float32': 0.57, 'int8': 0.34}, - (4096, 4096, 4096): {'float16': 0.785, 'float32': 0.75, 'int8': 0.46}, - (8192, 8192, 8192): {'float16': 0.805, 'float32': 0.85, 'int8': 0.51}, - # tall-skinny - (16, 1024, 1024): {'float16': 0.0077, 'float32': 0.0127, 'int8': 0.005}, - (16, 4096, 4096): {'float16': 0.044, 'float32': 0.0457, 'int8': 0.0259}, - (16, 8192, 8192): {'float16': 0.07, 'float32': 0.0648, 'int8': 0.0431}, - (64, 1024, 1024): {'float16': 0.030, 'float32': 0.0509, 'int8': 0.0169}, - (64, 4096, 4096): {'float16': 0.163, 'float32': 0.162, 'int8': 0.097}, - (64, 8192, 8192): {'float16': 0.285, 'float32': 0.257, 'int8': 0.174}, - (1024, 64, 1024): {'float16': 0.033, 'float32': 0.0458, 'int8': 0.017}, - (4096, 64, 4096): {'float16': 0.16, 'float32': 0.177, 'int8': 0.102}, - (8192, 64, 8192): {'float16': 0.254, 'float32': 0.230, 'int8': 0.177}, - } -} - - -@pytest.mark.parametrize('M, N, K, dtype_str', - [(M, N, K, dtype_str) - for M, N, K in matmul_data[DEVICE_NAME].keys() - for dtype_str in ['float16']]) -def test_matmul(M, N, K, dtype_str): - if dtype_str in ['float32', 'int8'] and DEVICE_NAME != 'a100': - pytest.skip('Only test float32 & int8 on a100') - dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str] - torch.manual_seed(0) - ref_gpu_util = matmul_data[DEVICE_NAME][(M, N, K)][dtype_str] - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] - max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3) - if dtype == torch.int8: - a = torch.randint(-128, 127, (M, K), dtype=dtype, device='cuda') - b = torch.randint(-128, 127, (N, K), dtype=dtype, device='cuda') - b = b.t() # only test row-col layout - else: - a = torch.randn((M, K), dtype=dtype, device='cuda') - b = torch.randn((K, N), dtype=dtype, device='cuda') - fn = lambda: triton.ops.matmul(a, b) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=300) - cur_gpu_perf = 2. 
* M * N * K / ms * 1e-9 - cur_gpu_util = cur_gpu_perf / max_gpu_perf - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) - - -####################### -# Element-Wise -####################### - - -@triton.jit -def _add(x_ptr, y_ptr, output_ptr, n_elements, - BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - - -elementwise_data = { - 'v100': { - 1024 * 16: 0.0219, - 1024 * 64: 0.0791, - 1024 * 256: 0.243, - 1024 * 1024: 0.530, - 1024 * 4096: 0.796, - 1024 * 16384: 0.905, - 1024 * 65536: 0.939, - }, - 'a100': { - 1024 * 16: 0.010, - 1024 * 64: 0.040, - 1024 * 256: 0.132, - 1024 * 1024: 0.353, - 1024 * 4096: 0.605, - 1024 * 16384: 0.758, - 1024 * 65536: 0.850, - } -} - - -@pytest.mark.parametrize('N', elementwise_data[DEVICE_NAME].keys()) -def test_elementwise(N): - torch.manual_seed(0) - ref_gpu_util = elementwise_data[DEVICE_NAME][N] - max_gpu_perf = get_dram_gbps() - z = torch.empty((N, ), dtype=torch.float16, device='cuda') - x = torch.randn_like(z) - y = torch.randn_like(z) - grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), ) - fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500) - cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6 - cur_gpu_util = cur_gpu_perf / max_gpu_perf - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) - -####################### -# Flash-Attention -####################### - - -flash_attention_data = { - "a100": { - (4, 48, 4096, 64, 'forward', 'float16'): 0.37, - (4, 48, 4096, 64, 'backward', 'float16'): 0.25, - } -} - - -@pytest.mark.parametrize("Z, H, N_CTX, D_HEAD", [[4, 48, 4096, 64]]) -@pytest.mark.parametrize("mode", ['forward', 'backward']) -@pytest.mark.parametrize("dtype_str", ['float16']) -def test_flash_attention(Z, H, N_CTX, D_HEAD, mode, dtype_str): - is_backward = mode == 'backward' - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - pytest.skip("Flash attention only supported for compute capability < 80") - torch.manual_seed(20) - dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str] - # init data - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - # benchmark - fn = lambda: triton.ops.attention(q, k, v, sm_scale) - if is_backward: - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, return_mode="min", warmup=100, rep=500) - # compute flops - flops_per_matmul = 2. 
* Z * H * N_CTX * N_CTX * D_HEAD * 0.5 - total_flops = 2 * flops_per_matmul - if is_backward: - total_flops *= 2.5 # 2.0(bwd) + 0.5(recompute) - cur_gpu_perf = total_flops / ms * 1e-9 - # maximum flops - cur_sm_clock = nvsmi(['clocks.current.sm'])[0] - max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3) - cur_gpu_util = cur_gpu_perf / max_gpu_perf - ref_gpu_util = flash_attention_data[DEVICE_NAME][(Z, H, N_CTX, D_HEAD, mode, dtype_str)] - print_perf(ms, cur_gpu_util, ref_gpu_util) - triton.testing.assert_close(cur_gpu_util, ref_gpu_util, atol=0.01, rtol=0.05) diff --git a/python/test/unit/debugger/test_debugger.py b/python/test/unit/debugger/test_debugger.py deleted file mode 100644 index 741fcab3becd..000000000000 --- a/python/test/unit/debugger/test_debugger.py +++ /dev/null @@ -1,69 +0,0 @@ -import random - -import torch - -import triton -import triton.language as tl -from triton.debugger.debugger import program_ids_from_grid - - -def test_addition(): - - @triton.jit(interpret=True) - def add_kernel( - x_ptr, - y_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - - a = torch.rand((128,), device="cuda") - b = torch.rand((128,), device="cuda") - expected = a + b - output = torch.empty((128,), device="cuda") - - def grid(meta): - return (triton.cdiv(128, meta["BLOCK_SIZE"]),) - - add_kernel[grid](a, b, output, 128, BLOCK_SIZE=32) - - assert torch.allclose(expected, output, atol=1e-2, rtol=0) - - -def test_program_ids_from_grid(): - random.seed(123) - grid = (3, 4) - expected_combinations = 3 * 4 - unique_combinations = set(program_ids_from_grid(grid)) - assert len(unique_combinations) == expected_combinations - - first_run = list(program_ids_from_grid(grid)) - second_run = list(program_ids_from_grid(grid)) - assert first_run != second_run - - -def test_atomic(): - @triton.jit(interpret=True) - def atomic( - x_ptr, - ): - pid = tl.program_id(axis=0) - tl.atomic_add(x_ptr + pid, 1) - t = tl.atomic_xchg(x_ptr + pid, 3) - t += 1 # 2 - tl.atomic_cas(x_ptr + pid, 3, t) # match - tl.atomic_cas(x_ptr + pid, 40, 9) # no match - nb_dim = 16 - a = torch.zeros((nb_dim, ), dtype=torch.int32, device="cuda") - - atomic[(nb_dim, )](a) - assert torch.allclose(a, torch.full_like(a, 2)) diff --git a/python/test/unit/language/assert_helper.py b/python/test/unit/language/assert_helper.py deleted file mode 100644 index 8419d5a34218..000000000000 --- a/python/test/unit/language/assert_helper.py +++ /dev/null @@ -1,131 +0,0 @@ -import sys - -import torch -from torch.testing import assert_close - -import triton -import triton.language as tl - - -@triton.jit -def kernel_device_assert(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_assert(x == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_device_assert_scalar(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - # Trivial assert - tl.device_assert(0 == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=False) -def kernel_device_assert_no_debug(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_assert(x == 0, "x != 0") - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_assert(X, Y, 
BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - assert x == 0, "x != 0" - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_static_assert(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.static_assert(BLOCK == 128, "BLOCK != 128") - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_assert(func: str): - shape = (128, ) - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda') - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if func == "device_assert": - kernel_device_assert[(1,)](x, y, BLOCK=shape[0]) - kernel_device_assert_scalar[(1,)](x, y, BLOCK=shape[0]) - elif func == "no_debug": - # TRITON_DEBUG=True can override the debug flag - kernel_device_assert_no_debug[(1,)](x, y, BLOCK=shape[0]) - elif func == "assert": - kernel_assert[(1,)](x, y, BLOCK=shape[0]) - elif func == "static_assert": - kernel_static_assert[(1,)](x, y, BLOCK=shape[0]) - assert_close(y, x) - - -@triton.jit -def jit_device_assert_none(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit(debug=True) -def jit_device_assert_true(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit(debug=False) -def jit_device_assert_false(x): - tl.device_assert(x == 0, "x != 0") - - -@triton.jit -def kernel_device_assert_nested(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=True) -def kernel_device_assert_nested_true(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit(debug=False) -def kernel_device_assert_nested_false(X, Y, BLOCK: tl.constexpr, jit_debug: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - if jit_debug == "true": - jit_device_assert_true(x) - elif jit_debug == "false": - jit_device_assert_false(x) - else: - jit_device_assert_none(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_assert_nested(caller: str, callee: str): - shape = (128, ) - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda') - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if caller == "none": - kernel_device_assert_nested[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - elif caller == "true": - kernel_device_assert_nested_true[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - elif caller == "false": - kernel_device_assert_nested_false[(1,)](x, y, BLOCK=shape[0], jit_debug=callee) - assert_close(y, x) - - -if __name__ == "__main__": - if len(sys.argv) == 3: - test_assert_nested(sys.argv[1], sys.argv[2]) - else: - test_assert(sys.argv[1]) diff --git a/python/test/unit/language/print_helper.py b/python/test/unit/language/print_helper.py deleted file mode 100644 index afdd12960737..000000000000 --- a/python/test/unit/language/print_helper.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys - -import torch -from torch.testing import assert_close - -import triton -import triton.language as tl - - -@triton.jit -def kernel_device_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.device_print("", x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - 
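    # `print` inside a @triton.jit kernel is intercepted by the Triton frontend
    # rather than calling Python's builtin; it is expected to lower to the same
    # device-side print as tl.device_print in the kernel above, so this kernel
    # only checks that the plain spelling round-trips as well.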
print("", x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -@triton.jit -def kernel_static_print(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.static_print(x) - tl.store(Y + tl.arange(0, BLOCK), x) - - -def test_print(func: str, data_type: str): - shape = (128, ) - # limit the range of integers so that the sum does not overflow - x = torch.arange(0, shape[0], dtype=torch.int32, device='cuda').to(getattr(torch, data_type)) - y = torch.zeros(shape, dtype=x.dtype, device="cuda") - if func == "device_print": - kernel_device_print[(1,)](x, y, BLOCK=shape[0]) - elif func == "print": - kernel_print[(1,)](x, y, BLOCK=shape[0]) - elif func == "static_print": - kernel_static_print[(1,)](x, y, BLOCK=shape[0]) - assert_close(y, x) - - -if __name__ == "__main__": - test_print(sys.argv[1], sys.argv[2]) diff --git a/python/test/unit/language/test_annotations.py b/python/test/unit/language/test_annotations.py deleted file mode 100644 index 88df39fac52d..000000000000 --- a/python/test/unit/language/test_annotations.py +++ /dev/null @@ -1,21 +0,0 @@ - -from __future__ import annotations - -import torch - -import triton -import triton.language as tl - - -def test_annotations(): - - @triton.jit - def _kernel(X: torch.Tensor, N: int, BLOCK_SIZE: tl.constexpr): - pass - - x = torch.empty(1, device='cuda') - _kernel[(1,)](x, x.shape[0], 32) - try: - _kernel[(1,)](x.shape[0], x.shape[0], 32) - except AttributeError: - pass diff --git a/python/test/unit/language/test_block_pointer.py b/python/test/unit/language/test_block_pointer.py deleted file mode 100644 index 147249076181..000000000000 --- a/python/test/unit/language/test_block_pointer.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import torch - -import triton -import triton.language as tl - - -@triton.jit -def block_copy_kernel(a_ptr, b_ptr, N, BLOCK_SIZE: tl.constexpr, padding_option: tl.constexpr): - pid = tl.program_id(0) - # We only copy half of the data to see if the padding works - a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(N // 2, ), strides=(1, ), offsets=(pid * BLOCK_SIZE, ), - block_shape=(BLOCK_SIZE, ), order=(0, )) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(N, ), strides=(1, ), offsets=(pid * BLOCK_SIZE, ), - block_shape=(BLOCK_SIZE, ), order=(0, )) - a = tl.load(a_block_ptr, boundary_check=(0, ), padding_option=padding_option) - tl.store(b_block_ptr, a, boundary_check=(0, )) - - -@pytest.mark.parametrize("dtype_str, n, padding_option", - [(dtype_str, n, padding) for dtype_str in ("bool", "int16", "float16") - for n in (64, 128, 256, 512, 1024) - for padding in ("zero", "nan")]) -def test_block_copy(dtype_str, n, padding_option): - capability = torch.cuda.get_device_capability() - if capability[0] >= 9: - pytest.skip("Hopper support is working in progress") - - dtype = getattr(torch, dtype_str) - if dtype_str in ("bool", "int16"): - if padding_option == "nan": - pytest.skip("Padding with NaN is not supported for integer types") - a = torch.randint(0, 2, (n, ), device="cuda", dtype=dtype) - else: - a = torch.randn((n, ), device="cuda", dtype=dtype) - b = torch.zeros((n, ), device="cuda", dtype=dtype) - - grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),) - block_copy_kernel[grid](a_ptr=a, b_ptr=b, N=n, BLOCK_SIZE=64, padding_option=padding_option) - - assert torch.all(a[0: n // 2] == b[0: n // 2]) - if padding_option == "zero": - assert torch.all(b[n // 2: n] == 0) - else: - assert torch.all(torch.isnan(b[n // 2: n])) - - -@triton.jit -def matmul_no_scf_with_advance_kernel( - a_ptr, b_ptr, c_ptr, 
- M, N, K, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr -): - offs_m = tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), - offsets=(0, 0), block_shape=(BLOCK_M, BLOCK_K), order=(1, 0)) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(K, N), strides=(stride_bk, stride_bn), - offsets=(0, 0), block_shape=(BLOCK_K, BLOCK_N), order=(1, 0)) - # Below two lines are just for testing negative offsets for the `advance` API, which could be removed - a_block_ptr = tl.advance(a_block_ptr, (BLOCK_M, -BLOCK_K)) - a_block_ptr = tl.advance(a_block_ptr, (-BLOCK_M, BLOCK_K)) - a = tl.load(a_block_ptr, boundary_check=(1, ), padding_option="zero") - b = tl.load(b_block_ptr, boundary_check=(0, ), padding_option="zero") - - c = tl.dot(a, b) - c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn - tl.store(c_ptrs, c) - - -@pytest.mark.parametrize("shape, num_warps", [ - (shape, num_warps) - for shape in [ - [64, 64, 16], - [64, 64, 32], - [64, 64, 64], - ] - for num_warps in [4, 8] -]) -def test_block_ptr_matmul_no_scf(shape, num_warps): - capability = torch.cuda.get_device_capability() - if capability[0] >= 9: - pytest.skip("Hopper support is working in progress") - - m, n, k = shape - a = torch.randn((m, k), device="cuda", dtype=torch.float16) - b = torch.randn((k, n), device="cuda", dtype=torch.float16) - c = torch.empty((m, n), device="cuda", dtype=torch.float32) - - grid = lambda META: (1, ) - matmul_no_scf_with_advance_kernel[grid](a_ptr=a, b_ptr=b, c_ptr=c, - M=m, N=n, K=k, - stride_am=a.stride(0), stride_ak=a.stride(1), - stride_bk=b.stride(0), stride_bn=b.stride(1), - stride_cm=c.stride(0), stride_cn=c.stride(1), - BLOCK_M=m, BLOCK_N=n, BLOCK_K=k, - num_warps=num_warps) - golden = torch.matmul(a, b) - torch.testing.assert_allclose(c, golden) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py deleted file mode 100644 index b2fdc02daa79..000000000000 --- a/python/test/unit/language/test_core.py +++ /dev/null @@ -1,2992 +0,0 @@ -# flake8: noqa: F821,F841 -import itertools -import os -import re -from typing import Optional, Union - -import numpy as np -import pytest -import torch -from numpy.random import RandomState - -import triton -import triton._C.libtriton.triton as _triton -import triton.language as tl -from triton.runtime.jit import JITFunction, TensorWrapper, reinterpret - -int_dtypes = ['int8', 'int16', 'int32', 'int64'] -uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64'] -float_dtypes = ['float16', 'float32', 'float64'] -dtypes = int_dtypes + uint_dtypes + float_dtypes -dtypes_with_bfloat16 = dtypes + ['bfloat16'] -torch_dtypes = ['bool'] + int_dtypes + ['uint8'] + float_dtypes + ['bfloat16'] - - -def _bitwidth(dtype: str) -> int: - # ex.: "int64" -> 64 - return int(re.search(r'(\d+)$', dtype).group(1)) - - -def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, high=None): - """ - Override `rs` if you're calling this function twice and don't want the same - result for both calls. 
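    Integer dtypes are drawn with rs.randint over the dtype's full range, with
    any zeros bumped to one so that division tests don't blow up; float dtypes
    come from rs.normal(0, 1); 'bfloat16' is emulated by keeping only the high
    16 bits of each float32 bit pattern; bool-like dtypes are thresholded
    normal samples.

    Example (sketch): numpy_random((8, 8), 'uint16', rs=RandomState(3)) returns
    an (8, 8) uint16 array that contains no zeros.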
- """ - if isinstance(shape, int): - shape = (shape, ) - if rs is None: - rs = RandomState(seed=17) - if dtype_str in int_dtypes + uint_dtypes: - iinfo = np.iinfo(getattr(np, dtype_str)) - low = iinfo.min if low is None else max(low, iinfo.min) - high = iinfo.max if high is None else min(high, iinfo.max) - dtype = getattr(np, dtype_str) - x = rs.randint(low, high, shape, dtype=dtype) - x[x == 0] = 1 # Hack. Never return zero so tests of division don't error out. - return x - elif dtype_str in float_dtypes: - return rs.normal(0, 1, shape).astype(dtype_str) - elif dtype_str == 'bfloat16': - return (rs.normal(0, 1, shape).astype('float32').view('uint32') - & np.uint32(0xffff0000)).view('float32') - elif dtype_str in ['bool', 'int1', 'bool_']: - return rs.normal(0, 1, shape) > 0.0 - else: - raise RuntimeError(f'Unknown dtype {dtype_str}') - - -def to_triton(x: np.ndarray, device='cuda', dst_type=None) -> Union[TensorWrapper, torch.Tensor]: - ''' - Note: We need dst_type because the type of x can be different from dst_type. - For example: x is of type `float32`, dst_type is `bfloat16`. - If dst_type is None, we infer dst_type from x. - ''' - t = x.dtype.name - if t in uint_dtypes: - signed_type_name = t.lstrip('u') # e.g. "uint16" -> "int16" - x_signed = x.astype(getattr(np, signed_type_name)) - return reinterpret(torch.tensor(x_signed, device=device), getattr(tl, t)) - else: - if t == 'float32' and dst_type == 'bfloat16': - return torch.tensor(x, device=device).bfloat16() - return torch.tensor(x, device=device) - - -def torch_dtype_name(dtype) -> str: - if isinstance(dtype, triton.language.dtype): - return dtype.name - elif isinstance(dtype, torch.dtype): - # 'torch.int64' -> 'int64' - m = re.match(r'^torch\.(\w+)$', str(dtype)) - return m.group(1) - else: - raise TypeError(f'not a triton or torch dtype: {type(dtype)}') - - -def to_numpy(x): - if isinstance(x, TensorWrapper): - return x.base.cpu().numpy().astype(getattr(np, torch_dtype_name(x.dtype))) - elif isinstance(x, torch.Tensor): - if x.dtype is torch.bfloat16: - return x.cpu().float().numpy() - return x.cpu().numpy() - else: - raise ValueError(f"Not a triton-compatible tensor: {x}") - - -def patch_kernel(template, to_replace): - kernel = triton.JITFunction(template.fn) - for key, value in to_replace.items(): - kernel.src = kernel.src.replace(key, value) - return kernel - - -def check_type_supported(dtype): - ''' - skip test if dtype is not supported on the current device - ''' - cc = torch.cuda.get_device_capability() - if cc[0] < 8 and (dtype is tl.bfloat16 or dtype == "bfloat16" or dtype is torch.bfloat16): - pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80") - - -class MmaLayout: - def __init__(self, version, warps_per_cta): - self.version = version - self.warps_per_cta = str(warps_per_cta) - - def __str__(self): - return f"#triton_gpu.mma<{{versionMajor={self.version[0]}, versionMinor={self.version[1]}, warpsPerCTA={self.warps_per_cta}}}>" - - -class BlockedLayout: - def __init__(self, size_per_thread, threads_per_warp, warps_per_cta, order): - self.sz_per_thread = str(size_per_thread) - self.threads_per_warp = str(threads_per_warp) - self.warps_per_cta = str(warps_per_cta) - self.order = str(order) - - def __str__(self): - return f"#triton_gpu.blocked<{{sizePerThread={self.sz_per_thread}, threadsPerWarp={self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, order={self.order}}}>" - - -class SharedLayout: - def __init__(self, vec, per_phase, max_phase, order): - self.vec = str(vec) - self.per_phase = 
str(per_phase) - self.max_phase = str(max_phase) - self.order = str(order) - - def __str__(self): - return f"#triton_gpu.shared<{{vec={self.vec}, perPhase={self.per_phase}, maxPhase={self.max_phase}, order={self.order}}}>" - - -@pytest.mark.parametrize("dtype_x", list(dtypes) + ["bfloat16"]) -def test_empty_kernel(dtype_x, device='cuda'): - SIZE = 128 - - @triton.jit - def kernel(X, SIZE: tl.constexpr): - pass - check_type_supported(dtype_x) - x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x) - kernel[(1, )](x, SIZE=SIZE, num_warps=4) - - -# generic test functions -def _test_unary(dtype_x, expr, numpy_expr=None, device='cuda'): - check_type_supported(dtype_x) # early return if dtype_x is not supported - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(Z, X, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - z = GENERATE_TEST_HERE - tl.store(Z + off, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) - # inputs - x = numpy_random(SIZE, dtype_str=dtype_x) - if 'log' in expr: - x = np.abs(x) + 0.01 - # reference result - z_ref = eval(expr if numpy_expr is None else numpy_expr) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - z_tri = to_triton(np.empty_like(z_ref), device=device, dst_type=dtype_x) - kernel[(1, )](z_tri, x_tri, SIZE=SIZE, num_warps=4) - # compare - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - - -def _binary_op_dtype_override(a: str, b: str) -> Optional[np.dtype]: - """ - Given two dtype strings, returns the numpy dtype Triton thinks binary - operations on the two types should return. Returns None if the return value - matches numpy. This is generally needed because Triton and pytorch return - narrower floating point types than numpy in mixed operations, and because - Triton follows C/C++ semantics around mixed signed/unsigned operations, and - numpy/pytorch do not. 
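    For example, mixing float16 with any integer type stays np.float16 here
    instead of widening the way numpy would, and ('int32', 'uint32') resolves
    to np.uint32 (the unsigned type wins, as in C). The lookup key is the
    alphabetically ordered pair, so argument order does not matter:

        >>> _binary_op_dtype_override('uint32', 'int32')
        <class 'numpy.uint32'>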
- """ - overrides = { - ('float16', 'int16'): np.float16, - ('float16', 'int32'): np.float16, - ('float16', 'int64'): np.float16, - ('float16', 'uint16'): np.float16, - ('float16', 'uint32'): np.float16, - ('float16', 'uint64'): np.float16, - ('int8', 'uint8'): np.uint8, - ('int8', 'uint16'): np.uint16, - ('int8', 'uint32'): np.uint32, - ('int8', 'uint64'): np.uint64, - ('int16', 'uint16'): np.uint16, - ('int16', 'uint32'): np.uint32, - ('int16', 'uint64'): np.uint64, - ('int32', 'uint32'): np.uint32, - ('int32', 'uint64'): np.uint64, - ('int64', 'uint64'): np.uint64, - } - key = (a, b) if a < b else (b, a) - return overrides.get(key) - - -def _test_binary(dtype_x, dtype_y, expr, numpy_expr=None, mode_x='real', mode_y='real', device='cuda', y_low=None, y_high=None): - check_type_supported(dtype_x) # early return if dtype_x is not supported - check_type_supported(dtype_y) - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(Z, X, Y, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - y = tl.load(Y + off) - z = GENERATE_TEST_HERE - tl.store(Z + off, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) - # inputs - rs = RandomState(17) - x = numpy_random(SIZE, dtype_str=dtype_x, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype_y, rs=rs, low=y_low, high=y_high) - if mode_x == 'nan': - x[:] = float('nan') - if mode_y == 'nan': - y[:] = float('nan') - # reference result - z_ref = eval(expr if numpy_expr is None else numpy_expr) - dtype_z = _binary_op_dtype_override(dtype_x, dtype_y) - if dtype_z is not None: - z_ref = z_ref.astype(dtype_z) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - y_tri = to_triton(y, device=device, dst_type=dtype_y) - z_tri = to_triton(np.empty(SIZE, dtype=z_ref.dtype), device=device) - kernel[(1, )](z_tri, x_tri, y_tri, SIZE=SIZE, num_warps=4) - np.testing.assert_allclose(z_ref, to_numpy(z_tri), err_msg=expr, rtol=0.01) - - -def _mod_operation_ill_conditioned(dtype_x, dtype_y) -> bool: - # The result of x % y is ill-conditioned if x % y is much smaller than x. - # pytorch/CUDA has slightly different (probably better) rounding on - # remainders than stock LLVM. We currently don't expect to match it - # bit-for-bit. - return (dtype_x, dtype_y) in [ - ('int32', 'bfloat16'), - ('int32', 'float16'), - ('int32', 'float32'), - ('int64', 'bfloat16'), - ('int64', 'float16'), - ('int64', 'float32'), - ('int64', 'float64'), - ('uint16', 'bfloat16'), - ('uint16', 'float16'), - ('uint16', 'float32'), - ('uint32', 'bfloat16'), - ('uint32', 'float16'), - ('uint32', 'float32'), - ('uint64', 'bfloat16'), - ('uint64', 'float16'), - ('uint64', 'float32'), - ('uint64', 'float64'), - ] - -# --------------- -# test binary ops -# --------------- - - -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['+', '-', '*', '/', '%'] - for dtype_x in dtypes_with_bfloat16 - for dtype_y in dtypes_with_bfloat16 -]) -def test_bin_op(dtype_x, dtype_y, op, device='cuda'): - expr = f' x {op} y' - if op == '%' and dtype_x in int_dtypes + uint_dtypes and dtype_y in int_dtypes + uint_dtypes: - # LLVM has 'numpy.fmod', not 'numpy.remainder', semantics on integer remainders. - numpy_expr = 'np.fmod(x, y)' - elif op in ('/', '%') and dtype_x in ('int16', 'float16', 'bfloat16') and dtype_y in ('int16', 'float16', 'bfloat16'): - # Triton promotes 16-bit floating-point / and % to 32-bit because there - # are no native div or FRem operations on float16. 
Since we have to - # convert anyway, we may as well take the accuracy bump. - numpy_expr = f'x.astype(np.float32) {op} y.astype(np.float32)' - elif (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - numpy_expr = None - if op == '%' and _mod_operation_ill_conditioned(dtype_x, dtype_y): - with pytest.raises(AssertionError, match='Not equal to tolerance'): - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - elif (op in ('%', '/') and - ((dtype_x in int_dtypes and dtype_y in uint_dtypes) or - (dtype_x in uint_dtypes and dtype_y in int_dtypes))): - with pytest.raises(triton.CompilationError) as exc_info: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - assert re.match('Cannot use .* because they have different signedness', str(exc_info.value.__cause__)) - else: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -@pytest.mark.parametrize("dtype_x, dtype_y", - [(dtype_x, dtype_y) for dtype_x in int_dtypes for dtype_y in int_dtypes] + - [(dtype_x, dtype_y) for dtype_x in uint_dtypes for dtype_y in uint_dtypes] - ) -def test_floordiv(dtype_x, dtype_y, device='cuda'): - # Triton has IEEE, not numpy/torch, semantics for %, and those carry - # through to //, so we have to use a nonstandard expression to get a - # reference result for //. - expr = 'x // y' - numpy_expr = '((x - np.fmod(x, y)) / y)' - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -def test_unsigned_name_mangling(device='cuda'): - # Test that uint32 and int32 are mangled differently by the compiler - SIZE = 128 - # define the kernel / launch-grid - - @triton.jit - def kernel(O1, O2, X, Y, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - y = tl.load(Y + off) - out1 = tl.abs(x) # uint32 -> nop - out2 = tl.abs(-y) # int32 -> should have an effect - tl.store(O1 + off, out1) - tl.store(O2 + off, out2) - - dtype_x = 'uint32' - dtype_y = 'int32' - # inputs - rs = RandomState(17) - x = numpy_random(SIZE, dtype_str=dtype_x, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype_y, rs=rs) - # reference result - expect = (np.abs(x), np.abs(-y)) - # triton result - x_tri = to_triton(x, device=device, dst_type=dtype_x) - y_tri = to_triton(y, device=device, dst_type=dtype_y) - actual = tuple( - to_triton(np.empty_like(e), device=device) - for e in expect - ) - kernel[(1, )](actual[0], actual[1], x_tri, y_tri, SIZE=SIZE, num_warps=4) - - # Bitwise op, so expect exact equality - assert (expect[0] == to_numpy(actual[0])).all() - assert (expect[1] == to_numpy(actual[1])).all() - - -# --------------- -# test bitwise ops -# --------------- -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['&', '|', '^'] - for dtype_x in dtypes + dtypes_with_bfloat16 - for dtype_y in dtypes + dtypes_with_bfloat16 -]) -def test_bitwise_op(dtype_x, dtype_y, op, device='cuda'): - expr = f'x {op} y' - if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - 
numpy_expr = None - if 'float' in dtype_x + dtype_y: - with pytest.raises(triton.CompilationError) as exc_info: - _test_binary(dtype_x, dtype_y, expr, numpy_expr='np.array([])', device=device) - # The CompilationError must have been caused by a C++ exception with this text. - assert re.match('invalid operands of type', str(exc_info.value.__cause__)) - else: - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) - - -@pytest.mark.parametrize("dtype_x, dtype_y, op", [ - (dtype_x, dtype_y, op) - for op in ['<<', '>>'] - for dtype_x in int_dtypes + uint_dtypes - for dtype_y in int_dtypes + uint_dtypes -]) -def test_shift_op(dtype_x, dtype_y, op, device='cuda'): - expr = f'x {op} y' - bw = max(_bitwidth(dtype_x), _bitwidth(dtype_y)) - if dtype_x.startswith('int'): - dtype_z = f'int{bw}' - else: - dtype_z = f'uint{bw}' - numpy_expr = f'x.astype(np.{dtype_z}) {op} y.astype(np.{dtype_z})' - _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device, y_low=0, y_high=65) - - -# --------------- -# test compare ops -# --------------- -ops = ['==', '!=', '>', '<', '>=', '<='] - - -@pytest.mark.parametrize("dtype_x, dtype_y, op, mode_x, mode_y", - # real - [ - (dtype_x, dtype_y, op, 'real', 'real') - for op in ops - for dtype_x in dtypes - for dtype_y in dtypes - ] + - # NaNs - [('float32', 'float32', op, mode_x, mode_y) - for op in ops - for mode_x, mode_y in [('nan', 'real'), - ('real', 'nan'), - ('nan', 'nan')] - - ]) -def test_compare_op(dtype_x, dtype_y, op, mode_x, mode_y, device='cuda'): - expr = f'x {op} y' - if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): - numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' - elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): - numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' - else: - numpy_expr = None - _test_binary(dtype_x, dtype_y, expr, numpy_expr, mode_x=mode_x, mode_y=mode_y, device=device) - - -# --------------- -# test broadcast -# --------------- -@pytest.mark.parametrize("dtype", dtypes_with_bfloat16) -def test_broadcast(dtype): - @triton.jit - def broadcast_kernel(x_ptr, y_ptr, y_broadcasted_ptr, M: tl.constexpr, N: tl.constexpr): - offset1 = tl.arange(0, M) - offset2 = tl.arange(0, N) - x = tl.load(x_ptr + N * offset1[:, None] + offset2[None, :]) - y = tl.load(y_ptr + offset2) - _, y_broadcasted = tl.broadcast(x, y) - tl.store(y_broadcasted_ptr + N * offset1[:, None] + offset2[None, :], y_broadcasted) - - M = 32 - N = 64 - rs = RandomState(17) - x = numpy_random((M, N), dtype_str=dtype, rs=rs) - y = numpy_random(N, dtype_str=dtype, rs=rs) - _, y_broadcasted_np = np.broadcast_arrays(x, y) - - x_tri = to_triton(x, device='cuda', dst_type=dtype) - y_tri = to_triton(y, device='cuda', dst_type=dtype) - y_broadcasted_tri = to_triton(np.empty((M, N), dtype=y_broadcasted_np.dtype), device='cuda', dst_type=dtype) - - broadcast_kernel[(1,)](x_tri, y_tri, y_broadcasted_tri, M=M, N=N) - assert (y_broadcasted_np == to_numpy(y_broadcasted_tri)).all() - - -# ---------------- -# test expand_dims -# ---------------- -def test_expand_dims(): - @triton.jit - def expand_dims_kernel(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, 0) - tl.static_assert(t.shape == [1, N]) - - t = tl.expand_dims(offset1, 1) - tl.static_assert(t.shape == [N, 1]) - - t = tl.expand_dims(offset1, -1) - tl.static_assert(t.shape == [N, 1]) - - t = tl.expand_dims(offset1, -2) - tl.static_assert(t.shape == [1, 
N]) - - t = tl.expand_dims(offset1, (0, -1)) - tl.static_assert(t.shape == [1, N, 1]) - - t = tl.expand_dims(offset1, (0, 1, 3)) - tl.static_assert(t.shape == [1, 1, N, 1]) - - t = tl.expand_dims(offset1, (-4, 2, -1)) - tl.static_assert(t.shape == [1, N, 1, 1]) - - t = tl.expand_dims(offset1, (3, 1, 2)) - tl.static_assert(t.shape == [N, 1, 1, 1]) - - N = 32 - dummy_tensor = torch.empty((), device="cuda") - expand_dims_kernel[(1,)](dummy_tensor, N) - - -def test_expand_dims_error_cases(): - @triton.jit - def dim_out_of_range1(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, -2) - t = tl.expand_dims(offset1, -3) - - @triton.jit - def dim_out_of_range2(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, 1) - t = tl.expand_dims(offset1, 2) - - @triton.jit - def duplicate_dim1(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, (0, 0)) - - @triton.jit - def duplicate_dim2(dummy, N: tl.constexpr): - offset1 = tl.arange(0, N) - - t = tl.expand_dims(offset1, (0, -3)) - - N = 32 - dummy_tensor = torch.empty((), device="cuda") - - with pytest.raises(triton.CompilationError, match="invalid axis -3"): - dim_out_of_range1[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match="invalid axis 2"): - dim_out_of_range2[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match=r"duplicate axes, normalized axes = \[0, 0\]"): - duplicate_dim1[(1,)](dummy_tensor, N) - - with pytest.raises(triton.CompilationError, match=r"duplicate axes, normalized axes = \[0, 0\]"): - duplicate_dim2[(1,)](dummy_tensor, N) - - -# --------------- -# test where -# --------------- -@pytest.mark.parametrize("dtype", dtypes_with_bfloat16 + ["*int32"]) -def test_where(dtype): - select_ptrs = False - if dtype == "*int32": - dtype = "int64" - select_ptrs = True - check_type_supported(dtype) - - @triton.jit - def where_kernel(cond_ptr, a_ptr, b_ptr, output_ptr, n_elements, - BLOCK_SIZE: tl.constexpr, - TEST_POINTERS: tl.constexpr, - TEST_SCALAR_POINTERS: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - decide = tl.load(cond_ptr + offsets, mask=mask) - if TEST_SCALAR_POINTERS: - ptr = tl.where(tl.load(cond_ptr), a_ptr, b_ptr) - output = tl.load(ptr + offsets, mask=mask) - else: - if TEST_POINTERS: - a = tl.load(a_ptr + offsets, mask=mask).to(tl.pi32_t) - b = tl.load(b_ptr + offsets, mask=mask).to(tl.pi32_t) - else: - a = tl.load(a_ptr + offsets, mask=mask) - b = tl.load(b_ptr + offsets, mask=mask) - output = tl.where(decide, a, b) - tl.store(output_ptr + offsets, output, mask=mask) - - SIZE = 1_000 - rs = RandomState(17) - cond = numpy_random(SIZE, 'bool', rs) - x = numpy_random(SIZE, dtype_str=dtype, rs=rs) - y = numpy_random(SIZE, dtype_str=dtype, rs=rs) - z = np.where(cond, x, y) - - cond_tri = to_triton(cond, device='cuda') - x_tri = to_triton(x, device='cuda', dst_type=dtype) - y_tri = to_triton(y, device='cuda', dst_type=dtype) - z_tri = to_triton(np.empty(SIZE, dtype=z.dtype), device='cuda', dst_type=dtype) - - grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']),) - where_kernel[grid](cond_tri, x_tri, y_tri, z_tri, SIZE, BLOCK_SIZE=1024, TEST_POINTERS=select_ptrs, TEST_SCALAR_POINTERS=False) - assert (z == to_numpy(z_tri)).all() - if select_ptrs: - where_kernel[grid](cond_tri, x_tri, y_tri, z_tri, SIZE, BLOCK_SIZE=1024, TEST_POINTERS=select_ptrs, TEST_SCALAR_POINTERS=True) - z = np.where(cond[0], x, y) - assert (z 
== to_numpy(z_tri)).all() - - -def test_where_broadcast(): - @triton.jit - def where_kernel(cond_ptr, a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): - xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] - yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] - - mask = tl.load(cond_ptr + yoffsets) - vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) - res = tl.where(mask, vals, 0.) - tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) - - @triton.jit - def where_scalar_condition(a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): - xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] - yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] - mask = 0 - vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) - res = tl.where(mask, vals, 0.) - tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) - - SIZE = 32 - dtype = 'float32' - rs = RandomState(17) - x = numpy_random((SIZE, SIZE), dtype_str=dtype, rs=rs) - mask = numpy_random(SIZE, 'bool', rs=rs) - z = np.where(mask, x, 0) - cond_tri = to_triton(mask, device="cuda") - x_tri = to_triton(x, device='cuda', dst_type=dtype) - z_tri = to_triton(np.empty((SIZE, SIZE), dtype=z.dtype), device='cuda', dst_type=dtype) - where_kernel[(1,)](cond_tri, x_tri, z_tri, SIZE) - assert (z == to_numpy(z_tri)).all() - where_scalar_condition[(1,)](x_tri, z_tri, SIZE) - z = np.where(0, x, 0) - assert (z == to_numpy(z_tri)).all() - -# --------------- -# test unary ops -# --------------- - - -@pytest.mark.parametrize("dtype_x, expr", [ - (dtype_x, ' -x') for dtype_x in dtypes_with_bfloat16 -] + [ - (dtype_x, ' ~x') for dtype_x in int_dtypes -]) -def test_unary_op(dtype_x, expr, device='cuda'): - _test_unary(dtype_x, expr, device=device) - -# ---------------- -# test math ops -# ---------------- - - -@pytest.mark.parametrize("dtype_x, expr", [(dtype_x, expr) for dtype_x in ["float32", "float64"] for expr in ['exp', 'log', 'cos', 'sin']]) -def test_math_op(dtype_x, expr, device='cuda'): - _test_unary(dtype_x, f'tl.{expr}(x)', f'np.{expr}(x) ', device=device) - -# ---------------- -# test abs -# ---------------- - - -@pytest.mark.parametrize("dtype_x", [ - (dtype_x) - for dtype_x in dtypes_with_bfloat16 -]) -def test_abs(dtype_x, device='cuda'): - _test_unary(dtype_x, 'tl.abs(x)', 'np.abs(x) ', device=device) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -def test_abs_f8(in_dtype): - - @triton.jit - def abs_kernel(Z, X, SIZE: tl.constexpr): - off = tl.arange(0, SIZE) - x = tl.load(X + off) - z = tl.abs(x) - tl.store(Z + off, z) - - f8_tensor = torch.tensor(range(-128, 128), dtype=torch.int8, device='cuda') - # f32_to_f8 doesn't handle nan, so we make sure f8_tensor doesn't contain any nan - all_exp_ones = (f8_tensor & 0b01111100) == 128 - 2**in_dtype.fp_mantissa_width - f8_tensor[all_exp_ones] = 0 - f8 = triton.reinterpret(f8_tensor, in_dtype) - n_elements = f8_tensor.numel() - out_f8 = torch.empty_like(f8_tensor) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - abs_kernel[(1,)](f8, triton.reinterpret(out_f8, in_dtype), n_elements) - - f32_tensor = convert_float_to_float32(f8_tensor, in_dtype) - expect = f32_tensor.abs() - actual_f8 = convert_float_to_float32(out_f8, in_dtype) - torch.testing.assert_allclose(expect, actual_f8) - - -# ---------------- -# test indexing -# ---------------- - - -def make_ptr_str(name, shape): - rank = len(shape) - offsets = [] - stride = 1 - for i in reversed(range(rank)): - idx = ', '.join([':' if ii == i else 'None' for ii in range(rank)]) - offsets += [f'tl.arange(0, {shape[i]})[{idx}]*{stride}'] - stride *= shape[i] 
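    # Illustrative example of the string this helper builds (assuming the
    # 32x32 shapes used below): make_ptr_str('X', [32, 32]) returns
    #   "X + tl.arange(0, 32)[None, :]*1 + tl.arange(0, 32)[:, None]*32"
    # i.e. a row-major offset expression, built innermost dimension first.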
- return f"{name} + {' + '.join(offsets)}" - - -# TODO: handle `%4 = triton_gpu.convert_layout %3 : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>`` -@pytest.mark.parametrize("expr, dtype_str", [ - (f'x[{s}]', d) - for s in ['None, :', ':, None', - 'None, :, :', - ':, :, None'] - for d in ['int32', 'uint32', 'uint16'] -]) -def test_index1d(expr, dtype_str, device='cuda'): - rank_x = expr.count(':') - rank_y = expr.count(',') + 1 - shape_x = [32 for _ in range(rank_x)] - shape_z = [32 for _ in range(rank_y)] - shape_z_rank_mismatch = [32 for _ in range(rank_y + 1)] - shape_z_dim_mismatch = [64 for _ in range(rank_y)] - - # Triton kernel - @triton.jit - def kernel(Z, X, SIZE: tl.constexpr): - m = tl.arange(0, SIZE) - n = tl.arange(0, SIZE) - x = tl.load(X_PTR_EXPR) - z = GENERATE_TEST_HERE - tl.store(Z_PTR_EXPR, z) - - def generate_kernel(shape_x, shape_z): - to_replace = { - 'X_PTR_EXPR': make_ptr_str('X', shape_x), - 'Z_PTR_EXPR': make_ptr_str('Z', shape_z), - 'GENERATE_TEST_HERE': expr, - } - return patch_kernel(kernel, to_replace) - - kernel_match = generate_kernel(shape_x, shape_z) - kernel_dim_mismatch = generate_kernel(shape_x, shape_z_dim_mismatch) - kernel_rank_mismatch = generate_kernel(shape_x, shape_z_rank_mismatch) - - # torch result - x = numpy_random(shape_x, dtype_str=dtype_str) - y = np.zeros(shape_z, dtype=getattr(np, dtype_str)) - z_ref = eval(expr) + y - # triton result - z_tri = to_triton(np.empty_like(z_ref), device=device) - x_tri = to_triton(x) - kernel_match[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) - # compare - assert (z_ref == to_numpy(z_tri)).all() - - def catch_compilation_error(kernel): - try: - kernel[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) - except triton.CompilationError as e: - np.testing.assert_(True) - except BaseException: - np.testing.assert_(False) - - catch_compilation_error(kernel_dim_mismatch) - catch_compilation_error(kernel_rank_mismatch) - - -# --------------- -# test tuples -# --------------- - - -@triton.jit -def tuples_fn(a, b): - return a + b, \ - a - b, \ - a * b - - -def test_tuples(): - device = 'cuda' - - @triton.jit - def with_fn(X, Y, A, B, C): - x = tl.load(X) - y = tl.load(Y) - a, b, c = tuples_fn(x, y) - tl.store(A, a) - tl.store(B, b) - tl.store(C, c) - - @triton.jit - def without_fn(X, Y, A, B, C): - x = tl.load(X) - y = tl.load(Y) - a, b, c = x + y, x - y, x * y - tl.store(A, a) - tl.store(B, b) - tl.store(C, c) - - x = torch.tensor([1.3], device=device, dtype=torch.float32) - y = torch.tensor([1.9], device=device, dtype=torch.float32) - a_tri = torch.tensor([0], device=device, dtype=torch.float32) - b_tri = torch.tensor([0], device=device, dtype=torch.float32) - c_tri = torch.tensor([0], device=device, dtype=torch.float32) - for kernel in [with_fn, without_fn]: - kernel[(1, )](x, y, a_tri, b_tri, c_tri, num_warps=1) - a_ref, b_ref, c_ref = x + y, x - y, x * y - assert a_tri == a_ref - assert b_tri == b_ref - assert c_tri == c_ref - - -@triton.jit(noinline=True) -def noinline_simple_fn(x, y, Z): - z = x + y - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_call_graph_fn1(x): - return x + 1 - - -@triton.jit(noinline=True) -def noinline_call_graph_fn2(y): - return y + 2 - - -@triton.jit(noinline=True) -def noinline_call_graph_fn(x, y, Z): - t0 = noinline_call_graph_fn1(x) - t1 = noinline_call_graph_fn2(y) - z = t0 + t1 - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_shared_fn(x, y, Z): - offs = tl.arange(0, 16)[:, None] * 16 + 
tl.arange(0, 16)[None, :] - z = tl.load(Z + offs) - z = tl.dot(z, z) + x + y - tl.store(Z + offs, z) - - -@triton.jit(noinline=True) -def noinline_dynamic_fn(x, y, Z): - if x >= 1: - x = noinline_call_graph_fn1(x) - else: - x = noinline_call_graph_fn2(x) - if y >= 2: - y = noinline_call_graph_fn2(y) - else: - y = noinline_call_graph_fn1(y) - z = x + y - tl.store(Z, z) - - -@triton.jit(noinline=True) -def noinline_call_multi_values_fn(x, y): - return x + 1, y + 2 - - -@triton.jit(noinline=True) -def noinline_multi_values_fn(x, y, Z): - x, y = noinline_call_multi_values_fn(x, y) - z = x + y - tl.store(Z, z) - - -@pytest.mark.parametrize("mode", ["simple", "call_graph", "shared", "dynamic", "multi_values"]) -def test_noinline(mode): - device = 'cuda' - - @triton.jit - def kernel(X, Y, Z): - x = tl.load(X) - y = tl.load(Y) - GENERATE_TEST_HERE(x, y, Z) - - func_name = f'noinline_{mode}_fn' - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': func_name}) - x = torch.tensor([1.0], device=device, dtype=torch.float32) - y = torch.tensor([2.0], device=device, dtype=torch.float32) - if mode == "shared": - z = torch.ones((16, 16), device=device, dtype=torch.float32) - else: - z = torch.tensor([0.0], device=device, dtype=torch.float32) - kernel[(1,)](x, y, z, num_warps=1) - if mode == "simple": - assert torch.equal(z, x + y) - elif mode == "call_graph" or mode == "dynamic" or mode == "multi_values": - assert torch.equal(z, x + 1 + y + 2) - elif mode == "shared": - ref = torch.full((16, 16), 16, device=device, dtype=torch.float32) - assert torch.equal(z, ref + x + y) - - -# --------------- -# test atomics -# --------------- -@pytest.mark.parametrize("op, dtype_x_str, mode", itertools.chain.from_iterable([ - [ - ('add', 'float16', mode), - ('add', 'uint32', mode), ('add', 'int32', mode), ('add', 'float32', mode), - ('max', 'uint32', mode), ('max', 'int32', mode), ('max', 'float32', mode), - ('min', 'uint32', mode), ('min', 'int32', mode), ('min', 'float32', mode), - ] - for mode in ['all_neg', 'all_pos', 'min_neg', 'max_pos']])) -def test_atomic_rmw(op, dtype_x_str, mode, device='cuda'): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - if dtype_x_str == 'float16': - pytest.skip("Only test atomic float16 ops on devices with sm >= 70") - n_programs = 5 - - # triton kernel - @triton.jit - def kernel(X, Z): - pid = tl.program_id(0) - x = tl.load(X + pid) - old = GENERATE_TEST_HERE - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.atomic_{op}(Z, x)'}) - numpy_op = {'add': np.sum, 'max': np.max, 'min': np.min}[op] - max_neutral = float('-inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).min - min_neutral = float('inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).max - neutral = {'add': 0, 'max': max_neutral, 'min': min_neutral}[op] - - # triton result - rs = RandomState(17) - x = np.array([2**i for i in range(n_programs)], dtype=getattr(np, dtype_x_str)) - if mode == 'all_neg': - x = -np.abs(x) - if mode == 'all_pos': - x = np.abs(x) - if mode == 'min_neg': - idx = rs.randint(n_programs, size=(1, )).item() - x[idx] = -np.max(np.abs(x)) - 1 - if mode == 'max_pos': - idx = rs.randint(n_programs, size=(1, )).item() - x[idx] = np.max(np.abs(x)) + 1 - x_tri = to_triton(x, device=device) - - z_tri = to_triton(np.array([neutral], dtype=getattr(np, dtype_x_str)), device=device) - kernel[(n_programs, )](x_tri, z_tri) - # torch result - z_ref = numpy_op(x).astype(getattr(np, dtype_x_str)) - # compare - exact = op not in 
['add'] - if exact: - assert z_ref.item() == to_numpy(z_tri).item() - else: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - - -def test_atomic_rmw_predicate(device="cuda"): - @triton.jit - def kernel(X): - val = tl.program_id(0) - if val < 64: - tl.atomic_max(X, val) - x = torch.zeros((1,), device=device, dtype=torch.int32) - kernel[(4096,)](x) - assert x.item() == 63 - - -@pytest.mark.parametrize("shape, axis", - [(shape, axis) for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32)] for axis in [0, 1]]) -def test_tensor_atomic_rmw(shape, axis, device="cuda"): - shape0, shape1 = shape - # triton kernel - - @triton.jit - def kernel(Z, X, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): - off0 = tl.arange(0, SHAPE0) - off1 = tl.arange(0, SHAPE1) - x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :]) - z = tl.sum(x, axis=AXIS) - if AXIS == 1: - tl.atomic_add(Z + off0, z) - else: - tl.atomic_add(Z + off1, z) - rs = RandomState(17) - x = numpy_random((shape0, shape1), dtype_str="float32", rs=rs) - # reference result - z_ref = np.sum(x, axis=axis, keepdims=False) - # triton result - x_tri = to_triton(x, device=device) - z_shape = (shape0, ) if axis == 1 else (shape1, ) - z_tri = to_triton(np.zeros(z_shape, dtype="float32"), device=device) - kernel[(1,)](z_tri, x_tri, axis, shape0, shape1) - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4) - - -def test_tensor_atomic_rmw_block(device="cuda"): - shape = (8, 8) - - @triton.jit - def kernel(X, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): - off0 = tl.arange(0, SHAPE0) - off1 = tl.arange(0, SHAPE1) - offs = off0[:, None] * SHAPE1 + off1[None, :] - val = offs.to(tl.float32) - x = X + offs - tl.atomic_min(x, val) - x = torch.ones((8, 8), device=device, dtype=torch.float32) - kernel[(2,)](x, shape[0], shape[1]) - assert torch.min(x).item() == 0.0 - - -def test_atomic_cas(): - # 1. make sure that atomic_cas changes the original value (Lock) - @triton.jit - def change_value(Lock): - tl.atomic_cas(Lock, 0, 1) - - Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) - change_value[(1,)](Lock) - - assert (Lock[0] == 1) - - # 2. only one block enters the critical section - @triton.jit - def serialized_add(data, Lock): - ptrs = data + tl.arange(0, 128) - while tl.atomic_cas(Lock, 0, 1) == 1: - pass - - tl.store(ptrs, tl.load(ptrs) + 1.0) - - # release lock - tl.atomic_xchg(Lock, 0) - - Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) - data = torch.zeros((128,), device='cuda', dtype=torch.float32) - ref = torch.full((128,), 64.0) - serialized_add[(64,)](data, Lock) - np.testing.assert_allclose(to_numpy(data), to_numpy(ref)) - - -# --------------- -# test cast -# --------------- - - -@pytest.mark.parametrize("dtype_x, dtype_z, bitcast", [ - (dtype_x, dtype_z, False) - for dtype_x in dtypes - for dtype_z in dtypes -] + [ - ('float32', 'bfloat16', False), - ('bfloat16', 'float32', False), - ('float32', 'int32', True), - ('float32', 'int1', False), -] + [ - (f'uint{x}', f'int{x}', True) for x in [8, 16, 32, 64] -] + [ - (f'int{x}', f'uint{x}', True) for x in [8, 16, 32, 64] -]) -def test_cast(dtype_x, dtype_z, bitcast, device='cuda'): - # bfloat16 on cc < 80 will not be tested - check_type_supported(dtype_x) - check_type_supported(dtype_z) - - # This is tricky because numpy doesn't have bfloat, and torch doesn't have uints. 
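    # Illustration of the workaround, using the x0 values chosen just below:
    # the two backends cover each other's gaps, e.g.
    #   torch.tensor([43.5], dtype=torch.bfloat16)  # numpy has no bfloat16 dtype
    #   np.array([43], dtype=np.uint64)             # torch has no uint64 dtype
    # so bfloat16 casts are checked against torch, and everything else against numpy.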
- x0 = 43 if dtype_x in int_dtypes else 43.5 - if dtype_x in float_dtypes and dtype_z == 'int1': - x0 = 0.5 - if dtype_x.startswith('bfloat'): - x_tri = torch.tensor([x0], dtype=getattr(torch, dtype_x), device=device) - else: - x = np.array([x0], dtype=getattr(np, dtype_x)) - x_tri = to_triton(x) - - # triton kernel - @triton.jit - def kernel(X, Z, BITCAST: tl.constexpr): - x_ptr = X + tl.arange(0, 1) - z_ptr = Z + tl.arange(0, 1) - x = tl.load(x_ptr) - z = x.to(Z.dtype.element_ty, bitcast=BITCAST) - tl.store(z_ptr, z) - - dtype_z_np = dtype_z if dtype_z != 'int1' else 'bool_' - # triton result - if dtype_z.startswith('bfloat'): - z_tri = torch.empty((1,), dtype=getattr(torch, dtype_z), device=device) - else: - z_tri = to_triton(np.empty((1, ), dtype=getattr(np, dtype_z_np)), device=device) - kernel[(1, )](x_tri, z_tri, BITCAST=bitcast) - # torch result - if dtype_z.startswith('bfloat') or dtype_x.startswith('bfloat'): - assert bitcast is False - z_ref = x_tri.to(z_tri.dtype) - assert z_tri == z_ref - else: - if bitcast: - z_ref = x.view(getattr(np, dtype_z_np)) - else: - z_ref = x.astype(getattr(np, dtype_z_np)) - assert to_numpy(z_tri) == z_ref - - -@pytest.mark.parametrize("dtype_str", list(torch_dtypes)) -def test_store_constant(dtype_str): - check_type_supported(dtype_str) - - """Tests that boolean True is stored as 1""" - @triton.jit - def kernel(output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - output = GENERATE_TEST_HERE - tl.store(output_ptr + offsets, output, mask=mask) - - triton_dtype_str = 'uint8' if dtype_str == 'bool' else dtype_str - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.zeros([BLOCK_SIZE], dtype=tl.{triton_dtype_str}) + 1'}) - block_size = 128 - ref = torch.ones([block_size], dtype=getattr(torch, dtype_str), device='cuda') - output = torch.zeros([block_size], dtype=getattr(torch, dtype_str), device='cuda') - kernel[(1,)](output, block_size, BLOCK_SIZE=block_size) - - assert torch.all(output == ref) - - -def test_load_store_same_ptr(): - @triton.jit() - def kernel(in_out_ptr): - pid = tl.program_id(axis=0) - x = tl.load(in_out_ptr + pid) - out = x * 2 - tl.store(in_out_ptr + pid, out) - - for _ in range(1000): - x = torch.ones((65536,), device="cuda", dtype=torch.float32) - kernel[(65536,)](x, num_warps=32) - assert torch.all(x == 2) - - -def convert_float_to_float32(fp: torch.tensor, dtype=None): - if not dtype: - dtype = getattr(tl, torch_dtype_name(fp.dtype)) - - fp = fp.view(getattr(torch, f"int{dtype.primitive_bitwidth}")) - exp_width = dtype.primitive_bitwidth - dtype.fp_mantissa_width - 1 - exp_bias = 2 ** (exp_width - 1) - 1 - sign = ((fp >> (dtype.primitive_bitwidth - 1)) & 0x01).int() - exp = ((fp >> dtype.fp_mantissa_width) & ((1 << exp_width) - 1)).int() - frac = (fp & ((1 << dtype.fp_mantissa_width) - 1)).int() - - output = torch.where(exp == 0, - # subnormal - ((-1.0) ** sign) * (2.0 ** (1 - exp_bias)) * (frac / (2.0 ** dtype.fp_mantissa_width)), - # normal - ((-1.0) ** sign) * (2.0 ** (exp - exp_bias)) * (1.0 + frac / (2.0 ** dtype.fp_mantissa_width))).float() - - extended_exp = ((1 << (tl.float32.primitive_bitwidth - tl.float32.fp_mantissa_width - 1)) - 1) << tl.float32.fp_mantissa_width - # special cases, exp is 0b11..1 - if dtype == tl.float8e4: - # float8e4m3 does not have infinities - output[fp == torch.tensor(0b01111111, dtype=torch.int8)] = torch.nan - output[fp == torch.tensor(0b11111111, dtype=torch.int8)] = torch.nan - else: 
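        # exp == all ones encodes inf (frac == 0) or nan (frac != 0). The branch
        # below splices the original sign and mantissa bits next to a float32
        # exponent of all ones (extended_exp) so these special values are
        # reproduced bit-exactly instead of going through the arithmetic path.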
- output = torch.where(exp == (1 << exp_width) - 1, - ((sign << (tl.float32.primitive_bitwidth - 1)) | extended_exp | (frac << (tl.float32.fp_mantissa_width - dtype.fp_mantissa_width))).view(torch.float32), - output) - return output - - -@pytest.mark.parametrize("in_dtype", [torch.float16, torch.bfloat16]) -def test_convert_float16_to_float32(in_dtype): - """Tests that check convert_float_to_float32 function""" - check_type_supported(in_dtype) - - f16_input = torch.tensor(range(-int(2 ** (16 - 1)), int(2 ** (16 - 1))), dtype=torch.int16).view(in_dtype) - f32_output = convert_float_to_float32(f16_input) - - nan = f16_input.isnan() - assert torch.all(f32_output[nan].isnan()) - inf = f16_input.isinf() - assert torch.all(f32_output[inf].isinf()) - other = torch.logical_not(torch.logical_or(nan, inf)) - assert torch.all(f16_input[other] == f32_output[other]) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16, torch.float32]) -def test_f8_xf16_roundtrip(in_dtype, out_dtype): - """Tests that converting an f8 to f16 and back to f8 doesn't change its value""" - check_type_supported(out_dtype) - - @triton.jit - def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - input = tl.load(input_ptr + offsets, mask=mask) - output = input - tl.store(output_ptr + offsets, output, mask=mask) - - f8_tensor = torch.tensor(range(-128, 128), dtype=torch.int8, device='cuda') - # f32_to_f8 doesn't handle nan, so we make sure f8_tensor doesn't contain any nan - all_exp_ones = (f8_tensor & 0b01111100) == 128 - 2**in_dtype.fp_mantissa_width - f8_tensor[all_exp_ones] = 0 - f8 = triton.reinterpret(f8_tensor, in_dtype) - n_elements = f8_tensor.numel() - xf16 = torch.empty_like(f8_tensor, dtype=out_dtype) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - copy_kernel[grid](f8, xf16, n_elements, BLOCK_SIZE=1024) - - # exponent_mask = 0b01111100 for float8e5 - # exponent_mask = 0b01111000 for float8e4 - exponent_mask = 0b01111111 ^ ((1 << in_dtype.fp_mantissa_width) - 1) - normal = torch.logical_and((f8_tensor & exponent_mask) != 0, (f8_tensor & exponent_mask) != exponent_mask) - ref16 = convert_float_to_float32(f8_tensor, in_dtype) - # WARN: currently only normal float8s are handled - assert torch.all(xf16[normal] == ref16[normal]) - - f8_output_tensor = torch.empty_like(xf16, dtype=torch.int8) - f8_output = triton.reinterpret(f8_output_tensor, in_dtype) - copy_kernel[grid](xf16, f8_output, n_elements, BLOCK_SIZE=1024) - - assert torch.all(f8_tensor == f8_output_tensor) - - -@pytest.mark.parametrize("in_dtype", [tl.float8e4, tl.float8e5]) -@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) -def test_f16_to_f8_rounding(in_dtype, out_dtype): - """Takes all float16s, converts them to float8 and back to float16. Checks that the absolute - error is the minimum over all float8. 
- Or the same explanation a bit mathier: - for all f16 |f16 - fromf8(tof8(f16))| == min over all f8 |f16 - fromf8(f8)|""" - @triton.jit - def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - input = tl.load(input_ptr + offsets, mask=mask) - output = input - tl.store(output_ptr + offsets, output, mask=mask) - - i16_input = torch.tensor(range(-int(2 ** (16 - 1)), int(2 ** (16 - 1))), dtype=torch.int16, device='cuda') - f16_input = i16_input.view(out_dtype) - n_elements = f16_input.numel() - f8_output_tensor = torch.empty_like(f16_input, dtype=torch.int8) - f8_output = triton.reinterpret(f8_output_tensor, in_dtype) - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - copy_kernel[grid](f16_input, f8_output, n_elements, BLOCK_SIZE=1024) - - f16_output = torch.empty_like(f16_input, dtype=out_dtype) - copy_kernel[grid](f8_output, f16_output, n_elements, BLOCK_SIZE=1024) - - abs_error = torch.abs(f16_input - f16_output) - - all_f8_vals_tensor = torch.tensor(range(2 ** 8), dtype=torch.uint8, device='cuda') - all_f8_vals = triton.reinterpret(all_f8_vals_tensor, in_dtype) - all_f8_vals_in_f16 = torch.empty_like(all_f8_vals_tensor, dtype=out_dtype) - copy_kernel[grid](all_f8_vals, all_f8_vals_in_f16, n_elements=256, BLOCK_SIZE=1024) - - all_finite_f8_vals_in_f16 = all_f8_vals_in_f16[ - torch.isfinite(all_f8_vals_in_f16) - ] - - min_error = torch.min( - torch.abs( - f16_input.reshape((-1, 1)) - - all_finite_f8_vals_in_f16.reshape((1, -1)) - ), - dim=1, - )[0] - - # WARN: only normalized numbers are handled - f8_normal_min = 1 << in_dtype.fp_mantissa_width # 0b00001000 for float8e4 - f8_normal_max = 0b01111110 if in_dtype == tl.float8e4 else 0b01111011 - f16_min, f16_max, f16_max_minus_1 = convert_float_to_float32(torch.tensor([f8_normal_min, f8_normal_max, f8_normal_max - 1], dtype=torch.int8), in_dtype) - assert torch.all(torch.isfinite(f16_min)) - assert torch.all(torch.isfinite(f16_max)) - thres_error = f16_max - f16_max_minus_1 - mismatch = torch.logical_and( - torch.logical_or(abs_error != min_error, abs_error > thres_error), torch.logical_and(torch.isfinite(f16_input), torch.logical_and(torch.abs(f16_input) <= f16_max, torch.abs(f16_input) >= f16_min)) - ) - assert torch.all( - torch.logical_not(mismatch) - ), f"f16_input[mismatch]={f16_input[mismatch]} f16_output[mismatch]={f16_output[mismatch]} abs_error[mismatch]={abs_error[mismatch]} min_error[mismatch]={min_error[mismatch]}" - - -# --------------- -# test reduce -# --------------- - - -def get_reduced_dtype(dtype_str, op): - if op in ('argmin', 'argmax'): - return 'int32' - if dtype_str in ['int8', 'uint8', 'int16', 'uint16']: - return 'int32' - if dtype_str == 'bfloat16': - return 'float32' - return dtype_str - - -@pytest.mark.parametrize("op, dtype_str, shape", - [(op, dtype, shape) - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for dtype in dtypes_with_bfloat16 - for shape in [32, 64, 128, 512]]) -def test_reduce1d(op, dtype_str, shape, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, Z, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - tl.store(Z, GENERATE_TEST_HERE) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'}) - # input - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random((shape,), 
dtype_str=dtype_str, rs=rs) - x_tri = to_triton(x, device=device) - numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, - 'argmin': np.argmin, 'argmax': np.argmax}[op] - # numpy result - z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str - z_tri_dtype_str = z_dtype_str - if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': - z_dtype_str = 'float32' - z_ref = numpy_op(x).astype(getattr(np, z_dtype_str)) - # trunc mantissa for a fair comparison of accuracy - z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') - z_tri_dtype_str = 'bfloat16' - else: - z_ref = numpy_op(x).astype(getattr(np, z_dtype_str)) - # triton result - z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs), - device=device, dst_type=z_tri_dtype_str) - kernel[(1,)](x_tri, z_tri, BLOCK=shape) - z_tri = to_numpy(z_tri) - # compare - if op == 'sum': - np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) - else: - if op in ('argmin', 'argmax'): - # argmin and argmax can have multiple valid indices. - # so instead we compare the values pointed by indices - np.testing.assert_equal(x[z_ref], x[z_tri]) - else: - np.testing.assert_equal(z_ref, z_tri) - - -# TODO: [Qingyi] Fix argmin / argmax -reduce_configs1 = [ - (op, dtype, (1, 1024), axis) for dtype in dtypes_with_bfloat16 - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for axis in [1] -] - - -# shape (128, 256) and (32, 1024) are not enabled on sm86 because the required shared memory -# exceeds the limit of 99KB -reduce2d_shapes = [(2, 32), (4, 32), (4, 128)] -# TODO: fix and uncomment -# , (32, 64), (64, 128)] -if 'V100' in torch.cuda.get_device_name(0): - reduce2d_shapes += [(128, 256) and (32, 1024)] - - -reduce_configs2 = [ - (op, 'float32', shape, axis) - for op in ['min', 'max', 'sum', 'argmin', 'argmax'] - for shape in reduce2d_shapes - for axis in [0, 1] -] - - -@pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2) -def test_reduce2d(op, dtype_str, shape, axis, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr): - range_m = tl.arange(0, BLOCK_M) - range_n = tl.arange(0, BLOCK_N) - x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :]) - z = GENERATE_TEST_HERE - if AXIS == 1: - tl.store(Z + range_m, z) - else: - tl.store(Z + range_n, z) - - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'}) - # input - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random(shape, dtype_str=dtype_str, rs=rs) - x_tri = to_triton(x) - numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, - 'argmin': np.argmin, 'argmax': np.argmax}[op] - z_dtype_str = get_reduced_dtype(dtype_str, op) - z_tri_dtype_str = z_dtype_str - # numpy result - if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': - z_dtype_str = 'float32' - z_tri_dtype_str = 'bfloat16' - z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) - # trunc mantissa for a fair comparison of accuracy - z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') - else: - z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) - # triton result - z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs), - device=device, dst_type=z_tri_dtype_str) - kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis) - z_tri = 
to_numpy(z_tri) - # compare - if op == 'sum': - np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) - else: - if op in ('argmin', 'argmax'): - # argmin and argmax can have multiple valid indices. - # so instead we compare the values pointed by indices - z_ref_index = np.expand_dims(z_ref, axis=axis) - z_tri_index = np.expand_dims(z_tri, axis=axis) - z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis) - z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis) - np.testing.assert_equal(z_ref_value, z_tri_value) - else: - np.testing.assert_equal(z_ref, z_tri) - - -layouts = [ - BlockedLayout([1, 4], [8, 4], [4, 1], [1, 0]), - BlockedLayout([1, 4], [8, 4], [4, 1], [0, 1]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M, N", [[128, 16], [128, 128], [32, 128]]) -@pytest.mark.parametrize("src_layout", layouts) -@pytest.mark.parametrize("axis", [0, 1]) -def test_reduce_layouts(M, N, src_layout, axis, device='cuda'): - rdims_2d = f"1x{N}" if axis == 0 else f"{M}x1" - rdims_1d = f"{N}" if axis == 0 else f"{M}" - store_range = "%7" if axis == 0 else "%1" - ir = f""" - #blocked = #triton_gpu.blocked<{{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}}> - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel_0d1d2c3d4c(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: i32 {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>> - %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>>) -> tensor<{M}x1xi32, #blocked> - %2 = tt.splat %arg1 : (i32) -> tensor<{M}x1xi32, #blocked> - %3 = arith.muli %1, %2 : tensor<{M}x1xi32, #blocked> - %4 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x1x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<{M}x1x!tt.ptr, #blocked>, tensor<{M}x1xi32, #blocked> - %6 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>> - %7 = tt.expand_dims %6 {{axis = 0 : i32}} : (tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>>) -> tensor<1x{N}xi32, #blocked> - %8 = tt.broadcast %5 : (tensor<{M}x1x!tt.ptr, #blocked>) -> tensor<{M}x{N}x!tt.ptr, #blocked> - %9 = tt.broadcast %7 : (tensor<1x{N}xi32, #blocked>) -> tensor<{M}x{N}xi32, #blocked> - %10 = tt.addptr %8, %9 : tensor<{M}x{N}x!tt.ptr, #blocked>, tensor<{M}x{N}xi32, #blocked> - %11 = tt.splat %arg2 : (!tt.ptr) -> tensor<{rdims_2d}x!tt.ptr, #blocked> - %12 = tt.addptr %11, {store_range} : tensor<{rdims_2d}x!tt.ptr, #blocked>, tensor<{rdims_2d}xi32, #blocked> - %13 = tt.load %10 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xf32, #blocked> - %14 = triton_gpu.convert_layout %13 : (tensor<{M}x{N}xf32, #blocked>) -> tensor<{M}x{N}xf32, #src> - %15 = "tt.reduce"(%14) ({{ - ^bb0(%arg3: f32, %arg4: f32): - %16 = "triton_gpu.cmpf"(%arg3, %arg4) {{predicate = 2 : i64}} : (f32, f32) -> i1 - %17 = arith.select %16, %arg3, %arg4 : f32 - tt.reduce.return %17 : f32 - }}) {{axis = {axis} : i32}} : (tensor<{M}x{N}xf32, #src>) -> tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #src}}>> - %18 = triton_gpu.convert_layout %15 : (tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #src}}>>) -> tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #blocked}}>> - %19 
= tt.expand_dims %18 {{axis = {axis} : i32}} : (tensor<{rdims_1d}xf32, #triton_gpu.slice<{{dim = {axis}, parent = #blocked}}>>) -> tensor<{rdims_2d}xf32, #blocked> - tt.store %12, %19 {{cache = 1 : i32, evict = 1 : i32}} : tensor<{rdims_2d}xf32, #blocked> - tt.return - }} - }} - """ - - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, N)).astype('float32') - x = (x.view('uint32') & np.uint32(0xffffe000)).view('float32') - - if axis == 0: - z = np.zeros((1, N)).astype('float32') - else: - z = np.zeros((M, 1)).astype('float32') - - x_tri = torch.tensor(x, device=device) - z_tri = torch.tensor(z, device=device) - - pgm = kernel[(1, 1, 4)](x_tri, x_tri.stride(0), z_tri) - - z_ref = np.max(x, axis=axis, keepdims=True) - - np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M", [32, 64, 128, 256]) -@pytest.mark.parametrize("src_layout", layouts) -def test_store_op(M, src_layout, device='cuda'): - ir = f""" - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %1 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %2 = tt.addptr %1, %0 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = 1, parent = #src}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %3 = tt.load %2 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}xf32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %4 = tt.expand_dims %3 {{axis = 1 : i32}} : (tensor<{M}xf32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xf32, #src> - %5 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %6 = tt.expand_dims %5 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src> - %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<{M}x1x!tt.ptr, #src> - %8 = tt.addptr %7, %6 : tensor<{M}x1x!tt.ptr, #src>, tensor<{M}x1xi32, #src> - tt.store %8, %4 : tensor<{M}x1xf32, #src> - tt.return - }} - }} - """ - - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - store_kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, 1)).astype('float32') - y = np.zeros((M, 1), dtype='float32') - x_tri = torch.tensor(x, device=device) - y_tri = torch.tensor(y, device=device) - - pgm = store_kernel[(1, 1, 1)](x_tri, y_tri) - y_ref = x - np.testing.assert_allclose(y_ref, y_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]) -] - - -@pytest.mark.parametrize("M", [64, 128, 256]) -@pytest.mark.parametrize("src_layout", layouts) -@pytest.mark.parametrize("dst_layout", layouts) -@pytest.mark.parametrize("src_dim", [0, 1]) -@pytest.mark.parametrize("dst_dim", [0, 1]) -def test_convert1d(M, 
src_layout, dst_layout, src_dim, dst_dim, device='cuda'): - ir = f""" - #dst = {dst_layout} - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @kernel(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %0 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %2 = tt.addptr %0, %1 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %3 = tt.load %2 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>> - %4 = tt.splat %arg1 : (!tt.ptr) -> tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %5 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %6 = tt.addptr %4, %5 : tensor<{M}x!tt.ptr, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>>, tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - %7 = triton_gpu.convert_layout %3 : (tensor<{M}xi32, #triton_gpu.slice<{{dim = {src_dim}, parent = #src}}>>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - tt.store %6, %7 : tensor<{M}xi32, #triton_gpu.slice<{{dim = {dst_dim}, parent = #dst}}>> - tt.return - }} - }} - """ - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, )).astype('int32') - y = np.zeros((M, ), dtype='int32') - x_tri = torch.tensor(x, device=device) - y_tri = torch.tensor(y, device=device) - pgm = kernel[(1, 1, 1)](x_tri, y_tri) - y_ref = x - np.testing.assert_allclose(y_ref, y_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -@triton.jit -def _welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): - delta = mean_2 - mean_1 - new_weight = weight_1 + weight_2 - w2_over_w = weight_2 / new_weight - return ( - mean_1 + delta * w2_over_w, - m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, - new_weight, - ) - - -layouts = [ - BlockedLayout([1, 4], [1, 32], [4, 1], [1, 0]), - BlockedLayout([1, 4], [1, 32], [2, 2], [1, 0]), - BlockedLayout([1, 4], [1, 32], [1, 4], [1, 0]), - BlockedLayout([1, 4], [8, 4], [2, 2], [0, 1]) -] - - -@pytest.mark.parametrize("M, N", [[128, 128], [256, 128], [256, 256], [128, 256]]) -@pytest.mark.parametrize("src_layout", layouts) -def test_chain_reduce(M, N, src_layout, device='cuda'): - ir = f""" - #src = {src_layout} - module attributes {{"triton_gpu.num-warps" = 4 : i32}} {{ - tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr {{tt.divisibility = 16 : i32}}) {{ - %cst = arith.constant dense<{N}> : tensor<{M}x1xi32, #src> - %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %1 = tt.expand_dims %0 {{axis = 1 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> tensor<{M}x1xi32, #src> - %2 = arith.muli %1, %cst : tensor<{M}x1xi32, #src> - %3 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> - %4 = tt.expand_dims %3 {{axis = 0 : i32}} : (tensor<{N}xi32, 
#triton_gpu.slice<{{dim = 0, parent = #src}}>>) -> tensor<1x{N}xi32, #src> - %5 = tt.broadcast %2 : (tensor<{M}x1xi32, #src>) -> tensor<{M}x{N}xi32, #src> - %6 = tt.broadcast %4 : (tensor<1x{N}xi32, #src>) -> tensor<{M}x{N}xi32, #src> - %7 = arith.addi %5, %6 : tensor<{M}x{N}xi32, #src> - %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<{M}x{N}x!tt.ptr, #src> - %9 = tt.addptr %8, %7 : tensor<{M}x{N}x!tt.ptr, #src>, tensor<{M}x{N}xi32, #src> - %10 = tt.load %9 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}xi32, #src> - %11 = "tt.reduce"(%10) ({{ - ^bb0(%arg2: i32, %arg3: i32): - %13 = arith.addi %arg2, %arg3 : i32 - tt.reduce.return %13 : i32 - }}) {{axis = 1 : i32}} : (tensor<{M}x{N}xi32, #src>) -> tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> - %12 = "tt.reduce"(%11) ({{ - ^bb0(%arg2: i32, %arg3: i32): - %13 = arith.addi %arg2, %arg3 : i32 - tt.reduce.return %13 : i32 - }}) {{axis = 0 : i32}} : (tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>) -> i32 - tt.store %arg1, %12 {{cache = 1 : i32, evict = 1 : i32}} : i32 - tt.return - }} - }} - """ - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - - rs = RandomState(17) - x = rs.randint(0, 4, (M, N)).astype('int32') - - z = np.zeros((1,)).astype('int32') - - x_tri = torch.tensor(x, device=device) - z_tri = torch.tensor(z, device=device) - - pgm = kernel[(1, 1, 1)](x_tri, z_tri) - z_ref = np.sum(x) - - np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3) - - -def test_generic_reduction(device='cuda'): - - @triton.jit - def var_mean_kernel(X, out_mean, out_var, BLOCK: tl.constexpr): - xindex = tl.arange(0, BLOCK) - x = tl.load(X + xindex) - mean = x - m2 = tl.zeros_like(x) - weight = tl.full(x.shape, 1, x.dtype) - (mean, m2, weight) = tl.reduce((mean, m2, weight), 0, _welford_combine) - tl.store(out_mean, mean) - tl.store(out_var, m2 / weight) - - SIZE = 512 - x = torch.rand(SIZE, device=device) - out_mean = torch.empty((), device=device) - out_var = torch.empty((), device=device) - - var_mean_kernel[(1,)](x, out_mean, out_var, BLOCK=SIZE) - - expect_var, expect_mean = torch.var_mean(x, dim=0, correction=0) - torch.testing.assert_close(out_mean, expect_mean) - torch.testing.assert_close(out_var, expect_var) - - -# --------------- -# test permute -# --------------- - - -@pytest.mark.parametrize("dtype_str, shape, perm", - [(dtype, shape, perm) - # TODO: bfloat16 - for dtype in ['float16', 'float32'] - for shape in [(64, 64), (128, 128)] - for perm in [(1, 0)]]) -def test_permute(dtype_str, shape, perm, device='cuda'): - check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested - - # triton kernel - @triton.jit - def kernel(X, stride_xm, stride_xn, - Z, stride_zm, stride_zn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): - off_m = tl.arange(0, BLOCK_M) - off_n = tl.arange(0, BLOCK_N) - Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * stride_xn - Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn - tl.store(Zs, tl.load(Xs)) - # input - x = numpy_random(shape, dtype_str=dtype_str) - # triton result - z_tri = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) - z_tri_contiguous = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) - x_tri = to_triton(x, device=device, dst_type=dtype_str) - pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), - z_tri, z_tri.stride(1), z_tri.stride(0), - BLOCK_M=shape[0], 
BLOCK_N=shape[1]) - pgm_contiguous = kernel[(1, 1)](x_tri, x_tri.stride(1), x_tri.stride(0), - z_tri_contiguous, z_tri_contiguous.stride(0), z_tri_contiguous.stride(1), - BLOCK_M=shape[0], BLOCK_N=shape[1]) - # numpy result - z_ref = x.transpose(*perm) - # compare - np.testing.assert_allclose(to_numpy(z_tri), z_ref) - np.testing.assert_allclose(to_numpy(z_tri_contiguous), z_ref) - # parse ptx to make sure ld/st are vectorized - ptx = pgm.asm['ptx'] - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - ptx = pgm_contiguous.asm['ptx'] - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - -# --------------- -# test dot -# --------------- - - -@pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, in_dtype, out_dtype", - [(*shape, 4, False, False, epilogue, allow_tf32, in_dtype, out_dtype) - for shape in [(64, 64, 64), (16, 16, 16)] - for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot'] - for allow_tf32 in [True, False] - for in_dtype, out_dtype in [('float16', 'float16'), - ('float16', 'float32'), - ('float32', 'float32')] - if not (allow_tf32 and (in_dtype in ['float16']))] + - - [(*shape_nw, col_a, col_b, 'none', allow_tf32, in_dtype, out_dtype) - for shape_nw in [[128, 256, 32, 8], - [128, 16, 32, 4], - [32, 128, 64, 4], - [128, 128, 64, 4], - [64, 128, 128, 4], - [32, 128, 64, 2], - [64, 64, 32, 4], - [32, 32, 128, 16], - [128, 128, 64, 2], - [64, 128, 128, 2]] - for allow_tf32 in [True] - for col_a in [True, False] - for col_b in [True, False] - for in_dtype, out_dtype in [('int8', 'int8'), - ('float16', 'float16'), - ('float16', 'float32'), - ('float32', 'float32')]]) -def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, in_dtype, out_dtype, device='cuda'): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - if capability[0] < 8: - if in_dtype == 'int8': - pytest.skip("Only test int8 on devices with sm >= 80") - elif in_dtype == 'float32' and allow_tf32: - pytest.skip("Only test tf32 on devices with sm >= 80") - if capability[0] == 7: - if (M, N, K, num_warps) == (128, 256, 32, 8): - pytest.skip("shared memory out of resource") - if out_dtype == 'float16': - # TODO: support out_dtype=float16 for tl.dot on V100 - pytest.skip("Only test out_dtype=float16 on devices with sm >=80") - - torch.backends.cuda.matmul.allow_tf32 = allow_tf32 - - # triton kernel - @triton.jit - def kernel(X, stride_xm, stride_xk, - Y, stride_yk, stride_yn, - W, stride_wn, stride_wl, - Z, stride_zm, stride_zn, - out_dtype: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - ADD_MATRIX: tl.constexpr, ADD_ROWS: tl.constexpr, ADD_COLS: tl.constexpr, - ALLOW_TF32: tl.constexpr, - DO_SOFTMAX: tl.constexpr, CHAIN_DOT: tl.constexpr, - COL_A: tl.constexpr, COL_B: tl.constexpr): - off_m = tl.arange(0, BLOCK_M) - off_n = tl.arange(0, BLOCK_N) - off_l = tl.arange(0, BLOCK_N) - off_k = tl.arange(0, BLOCK_K) - Xs = X + off_m[:, None] * stride_xm + off_k[None, :] * stride_xk - Ys = Y + off_k[:, None] * stride_yk + off_n[None, :] * stride_yn - Ws = W + off_n[:, None] * stride_wn + off_l[None, :] * stride_wl - Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn - x = tl.load(Xs) - y = tl.load(Ys) - z = tl.dot(x, y, allow_tf32=ALLOW_TF32, out_dtype=out_dtype) - if ADD_MATRIX: - z += tl.load(Zs) - if ADD_ROWS: - ZRs = Z + off_m * stride_zm - z += tl.load(ZRs)[:, None] - if ADD_COLS: - ZCs = Z + 
off_n * stride_zn - z += tl.load(ZCs)[None, :] - if DO_SOFTMAX: - max = tl.max(z, 1) - z = z - max[:, None] - num = tl.exp(z.to(tl.float32)).to(max.dtype) - den = tl.sum(num, 1) - z = num / den[:, None] - if CHAIN_DOT: - w = tl.load(Ws) - z = tl.dot(z.to(w.dtype), w, out_dtype=out_dtype) - tl.store(Zs, z) - # input - rs = RandomState(17) - if col_a: - x = numpy_random((K, M), dtype_str=in_dtype, rs=rs).T - else: - x = numpy_random((M, K), dtype_str=in_dtype, rs=rs) - if col_b: - y = numpy_random((N, K), dtype_str=in_dtype, rs=rs).T - else: - y = numpy_random((K, N), dtype_str=in_dtype, rs=rs) - w = numpy_random((N, N), dtype_str=in_dtype, rs=rs) - if 'int' not in in_dtype: - x *= .1 - y *= .1 - if in_dtype == 'float32' and allow_tf32: - x = (x.view('uint32') & np.uint32(0xffffe000)).view('float32') - y = (y.view('uint32') & np.uint32(0xffffe000)).view('float32') - w = (w.view('uint32') & np.uint32(0xffffe000)).view('float32') - x_tri = to_triton(x, device=device) - y_tri = to_triton(y, device=device) - w_tri = to_triton(w, device=device) - # triton result - if out_dtype == 'int8': - z = 1 + numpy_random((M, N), dtype_str='int32', rs=rs) - else: - z = 1 + numpy_random((M, N), dtype_str=in_dtype, rs=rs) * .1 - - z_tri = to_triton(z, device=device) - if epilogue == 'trans': - z_tri = torch.as_strided(z_tri, (M, N), z_tri.stride()[::-1]) - - if out_dtype == 'int8': - out_dtype = tl.int8 - elif out_dtype == 'float16' and epilogue != 'softmax': - # TODO: for out_dtype == 'float16' and epilogue == 'softmax', it will - # fail with the following error: 'llvm.fmul' op requires the same type - # for all operands and results - out_dtype = tl.float16 - else: - out_dtype = tl.float32 - - pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), - y_tri, y_tri.stride(0), y_tri.stride(1), - w_tri, w_tri.stride(0), w_tri.stride(1), - z_tri, z_tri.stride(0), z_tri.stride(1), - out_dtype, - COL_A=col_a, COL_B=col_b, - BLOCK_M=M, BLOCK_K=K, BLOCK_N=N, - ADD_MATRIX=epilogue == 'add-matrix', - ADD_ROWS=epilogue == 'add-rows', - ADD_COLS=epilogue == 'add-cols', - DO_SOFTMAX=epilogue == 'softmax', - CHAIN_DOT=epilogue == 'chain-dot', - ALLOW_TF32=allow_tf32, - num_warps=num_warps) - # torch result - if in_dtype == 'int8': - z_ref = np.matmul(x.astype(np.float32), - y.astype(np.float32())).astype(np.int32) - else: - z_ref = np.matmul(x, y) - - if epilogue == 'add-matrix': - z_ref += z - if epilogue == 'add-rows': - z_ref += z[:, 0][:, None] - if epilogue == 'add-cols': - z_ref += z[0, :][None, :] - if epilogue == 'softmax': - num = np.exp(z_ref - np.max(z_ref, axis=-1, keepdims=True)) - denom = np.sum(num, axis=-1, keepdims=True) - z_ref = num / denom - if epilogue == 'chain-dot': - z_ref = np.matmul(z_ref, w) - # compare - # print(z_ref[:,0], z_tri[:,0]) - if in_dtype == 'float32': - # XXX: Somehow there's a larger difference when we use float32 - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3) - elif out_dtype == tl.float16: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3) - else: - np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) - # make sure ld/st are vectorized - ptx = pgm.asm['ptx'] - if (K > 16 or N > 16 or M > 16) and (M * N // (num_warps * 32) >= 4): - # XXX: skip small sizes because they are not vectorized - assert 'ld.global.v4' in ptx - assert 'st.global.v4' in ptx - if in_dtype == 'float32' and allow_tf32: - assert 'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' in ptx - elif in_dtype == 'float32' and allow_tf32: - assert 
'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' not in ptx - elif in_dtype == 'int8': - assert 'mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32' in ptx - elif out_dtype == tl.float16: - assert 'mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16' in ptx - - -@pytest.mark.parametrize("dtype_str", int_dtypes + float_dtypes + ['bfloat16']) -def test_full(dtype_str): - dtype = getattr(torch, dtype_str) - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - @triton.jit - def kernel_static(out): - a = GENERATE_TEST_HERE - out_ptr = out + tl.arange(0, 128)[:] - tl.store(out_ptr, a) - - @triton.jit - def kernel_dynamic(out, val, dtype: tl.constexpr): - a = tl.full((128,), val, dtype) - out_ptr = out + tl.arange(0, 128)[:] - tl.store(out_ptr, a) - - kernel_static_patched = patch_kernel(kernel_static, {'GENERATE_TEST_HERE': f"tl.full((128,), 2, tl.{dtype_str})"}) - out_static = torch.zeros((128), dtype=dtype, device="cuda") - kernel_static_patched[(1,)](out_static) - out_dynamic = torch.zeros((128), dtype=dtype, device="cuda") - kernel_dynamic[(1,)](out_dynamic, 2, getattr(triton.language, dtype_str)) - assert torch.all(out_static == 2) - assert torch.all(out_dynamic == 2) - - -@pytest.mark.parametrize("literal, dtype_str", - [(1e+50, "f64"), (1e+10, "f32"), (1.0, "f32"), - ('float("inf")', "f32"), ('float("-inf")', "f32"), - ('float("nan")', "f32"), ('float("-nan")', "f32"), - (0., "f32"), - (5, "i32"), (2**40, "i64"),]) -def test_constexpr(literal, dtype_str): - @triton.jit - def kernel(out_ptr): - val = GENERATE_TEST_HERE - tl.store(out_ptr.to(tl.pointer_type(val.dtype)), val) - - kernel_patched = patch_kernel(kernel, {'GENERATE_TEST_HERE': f"{literal}"}) - out = torch.zeros((1,), dtype=torch.float32, device="cuda") - h = kernel_patched[(1,)](out) - assert re.search(r"arith.constant .* : " + dtype_str, h.asm["ttir"]) is not None - -# TODO: uncomment once DotOperandEncoding::getElemsPerThread is implemented -# @pytest.mark.parametrize("dtype_str", ['float32', 'float16']) -# def test_dot_without_load(dtype_str): -# @triton.jit -# def _kernel(out): -# a = GENERATE_TEST_HERE -# b = GENERATE_TEST_HERE -# c = tl.dot(a, b) -# out_ptr = out + tl.arange(0, 32)[:, None] * 32 + tl.arange(0, 32)[None, :] -# tl.store(out_ptr, c) - -# kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.full((32, 32), 1.0, tl.{dtype_str})"}) -# a = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# b = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# out_ref = torch.matmul(a, b) -# out = torch.zeros((32, 32), dtype=getattr(torch, dtype_str), device="cuda") -# kernel[(1,)](out) -# assert torch.all(out == out_ref) - -# --------------- -# test arange -# --------------- - - -@pytest.mark.parametrize("start", [0, 1, 7, 16]) -def test_arange(start, device='cuda'): - BLOCK = 128 - z_tri = torch.empty(BLOCK, dtype=torch.int32, device=device) - - @triton.jit - def _kernel(z, BLOCK: tl.constexpr, - START: tl.constexpr, END: tl.constexpr): - off = tl.arange(0, BLOCK) - val = tl.arange(START, END) - tl.store(z + off, val) - _kernel[(1,)](z_tri, START=start, END=start + BLOCK, BLOCK=BLOCK) - z_ref = torch.arange(start, BLOCK + start, dtype=torch.int32, device=device) - np.testing.assert_allclose(to_numpy(z_tri), to_numpy(z_ref)) - -# --------------- -# test load -# --------------- - - -@pytest.mark.parametrize("dtype_str, size, size_diff", [(dtype_str, size, size_diff) for dtype_str in torch_dtypes for size in [128, 512] for size_diff in [0, 1, 2, 3, 4]]) 
-def test_masked_load(dtype_str, size, size_diff, device='cuda'): - dtype = getattr(torch, dtype_str) - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - input_size = size - size_diff - output_size = size - if dtype_str == 'bool': - input = torch.randint(0, 2, (input_size,), dtype=dtype, device=device) - elif dtype_str in int_dtypes or dtype_str in uint_dtypes: - input = torch.randint(0, 127, (input_size,), dtype=dtype, device=device) - else: - input = torch.rand(input_size, dtype=dtype, device=device) - output = torch.zeros((output_size,), dtype=dtype, device=device) - - @triton.jit - def _kernel(in_ptr, out_ptr, in_size: tl.constexpr, out_size: tl.constexpr): - in_offsets = tl.arange(0, out_size) - # Load inputs. - x = GENERATE_TEST_HERE - # Store output - output_offsets = tl.arange(0, out_size) - tl.store(out_ptr + output_offsets, x) - - mask_str = "mask=in_offsets < in_size, other=1" if size_diff > 0 else "None" - kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.load(in_ptr + in_offsets, {mask_str})"}) - kernel[(1,)](input, output, input_size, output_size) - - reference_out = torch.cat((input, torch.ones((size_diff,), dtype=dtype, device=device))) - # print((output - reference_out).nonzero()) - torch.testing.assert_allclose(output, reference_out) - -# Testing masked loads with an intermate copy to shared memory run. - - -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) -def test_masked_load_shared_memory(dtype, device='cuda'): - check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested - - M = 32 - N = 32 - K = 16 - - in1 = torch.rand((M, K), dtype=dtype, device=device) - in2 = torch.rand((K, N), dtype=dtype, device=device) - out = torch.zeros((M, N), dtype=dtype, device=device) - - @triton.jit - def _kernel(in1_ptr, in2_ptr, output_ptr, - in_stride, in2_stride, out_stride, - in_numel, in2_numel, out_numel, - M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - - M_offsets = tl.arange(0, M) - N_offsets = tl.arange(0, N) - K_offsets = tl.arange(0, K) - - in_offsets = M_offsets[:, None] * in_stride + K_offsets[None, :] - in2_offsets = K_offsets[:, None] * in2_stride + N_offsets[None, :] - - # Load inputs. - x = tl.load(in1_ptr + in_offsets, mask=in_offsets < M * K) - w = tl.load(in2_ptr + in2_offsets, mask=in2_offsets < K * N) - - # Without a dot product the memory doesn't get promoted to shared. 
- o = tl.dot(x, w, out_dtype=tl.float32) - - # Store output - output_offsets = M_offsets[:, None] * out_stride + N_offsets[None, :] - tl.store(output_ptr + output_offsets, o, mask=output_offsets < M * N) - - pgm = _kernel[(1,)](in1, in2, out, - in1.stride()[0], - in2.stride()[0], - out.stride()[0], - in1.numel(), - in2.numel(), - out.numel(), - M=M, N=N, K=K) - - reference_out = torch.matmul(in1, in2) - torch.testing.assert_allclose(out, reference_out, atol=1e-2, rtol=0) - - -@pytest.mark.parametrize("cache", ["", ".ca", ".cg"]) -def test_load_cache_modifier(cache): - src = torch.empty(128, device='cuda') - dst = torch.empty(128, device='cuda') - - @triton.jit - def _kernel(dst, src, CACHE: tl.constexpr): - offsets = tl.arange(0, 128) - x = tl.load(src + offsets, cache_modifier=CACHE) - tl.store(dst + offsets, x) - - pgm = _kernel[(1,)](dst, src, CACHE=cache) - ptx = pgm.asm['ptx'] - if cache == '': - assert 'ld.global.ca' not in ptx - assert 'ld.global.cg' not in ptx - if cache == '.cg': - assert 'ld.global.cg' in ptx - assert 'ld.global.ca' not in ptx - if cache == '.ca': - assert 'ld.global.ca' in ptx - assert 'ld.global.cg' not in ptx - - -@pytest.mark.parametrize("N", [16, 10, 11, 1024]) -def test_vectorization(N): - src = torch.empty(1024, device='cuda') - dst = torch.empty(1024, device='cuda') - - @triton.jit - def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - pgm = _kernel[(1,)](dst, src, N=N, BLOCK_SIZE=src.shape[0]) - ptx = pgm.asm["ptx"] - if N % 16 == 0: - assert "ld.global.v4.b32" in ptx - else: - assert "ld.global.b32" in ptx - # np.testing.assert_allclose(dst, src[:N]) - - -@pytest.mark.parametrize("has_hints", [False, True]) -def test_vectorization_hints(has_hints): - src = torch.empty(1024, device='cuda') - dst = torch.empty(1024, device='cuda') - off = torch.zeros(1, device='cuda', dtype=torch.int32) - - @triton.jit - def _kernel(dst, src, off, N, BLOCK_SIZE: tl.constexpr, HINT: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - offsets = offsets + tl.load(off) - if HINT: - tl.max_contiguous(tl.multiple_of(offsets, 1024), 1024) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - pgm = _kernel[(1,)](dst, src, off, N=1024, BLOCK_SIZE=src.shape[0], HINT=has_hints) - ptx = pgm.asm["ptx"] - if has_hints: - assert "ld.global.v4.b32" in ptx - else: - assert "ld.global.v4.b32" not in ptx - -# --------------- -# test store -# --------------- - -# --------------- -# test if -# --------------- - -# --------------- -# test for -# --------------- - -# --------------- -# test while -# --------------- - -# --------------- -# test default -# --------------- -# TODO: can't be local to test_default - - -@triton.jit -def _impl(value=10): - return value - - -def test_default(): - value = 5 - ret0 = torch.zeros(1, dtype=torch.int32, device='cuda') - ret1 = torch.zeros(1, dtype=torch.int32, device='cuda') - - @triton.jit - def _kernel(ret0, ret1, value): - tl.store(ret0, _impl()) - tl.store(ret1, _impl(value)) - - _kernel[(1,)](ret0, ret1, value) - assert ret0.item() == 10 - assert ret1.item() == value - -# --------------- -# test noop -# ---------------- - - -def test_noop(device='cuda'): - @triton.jit - def kernel(x): - pass - x = to_triton(numpy_random((1,), dtype_str='int32'), device=device) - kernel[(1, )](x) - - 
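For reference while reviewing this deletion: the test_vectorization_hints case above checks that the tl.max_contiguous / tl.multiple_of hints alone are enough for the backend to emit vectorized ld.global.v4.b32 loads. Below is a minimal standalone sketch of that same hint pattern, assuming a simple 1-D copy kernel; the kernel name, tensor sizes, and launch grid are illustrative assumptions and are not taken from the deleted file.

import torch
import triton
import triton.language as tl

@triton.jit
def copy_kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Divisibility / contiguity hints, mirroring the deleted test: with these
    # in place the test expects 'ld.global.v4.b32' in the generated PTX.
    tl.max_contiguous(tl.multiple_of(offsets, 1024), 1024)
    x = tl.load(src + offsets, mask=offsets < N)
    tl.store(dst + offsets, x, mask=offsets < N)

src = torch.randn(1024, device='cuda')
dst = torch.empty_like(src)
copy_kernel[(1,)](dst, src, 1024, BLOCK_SIZE=1024)

Without the hint line, the deleted test asserts that the vectorized form does not appear, since the compiler cannot prove the offsets are aligned and contiguous on its own.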
-@pytest.mark.parametrize("device", ['cuda', 'cpu', 'cpu_pinned']) -def test_pointer_arguments(device): - @triton.jit - def kernel(x): - pass - pin_memory = 'pinned' in device - x = torch.empty(1024, device=device.split('_')[0], pin_memory=pin_memory) - if device == "cpu": - with pytest.raises(ValueError): - kernel[(1,)](x) - else: - kernel[(1, )](x) - - -@pytest.mark.parametrize("value, value_type", [ - (-1, 'i32'), (0, 'i32'), (-2**31, 'i32'), (2**31 - 1, 'i32'), - (2**31, 'i64'), (2**32 - 1, 'i64'), (2**32, 'i64'), (2**63 - 1, 'i64'), - (-2**63, 'i64'), (2**63, 'u64'), (2**64 - 1, 'u64') -]) -def test_value_specialization(value: int, value_type: str, device='cuda') -> None: - spec_type = None - - def cache_hook(*args, **kwargs): - nonlocal spec_type - spec_type = kwargs["compile"]["signature"][0] - JITFunction.cache_hook = cache_hook - - @triton.jit - def kernel(VALUE, X): - pass - - x = torch.tensor([3.14159], device='cuda') - pgm = kernel[(1, )](value, x) - - JITFunction.cache_hook = None - assert spec_type == value_type - -# -------------------- -# value specialization -# -------------------- - - -@pytest.mark.parametrize( - "value, overflow", - [(2**64 - 1, False), (2**64, True), (-2**63, False), (-2**63 - 1, True)] -) -def test_value_specialization_overflow(value: int, overflow: bool, device='cuda') -> None: - - @triton.jit - def kernel(VALUE, X): - pass - - x = torch.tensor([3.14159], device='cuda') - - if overflow: - with pytest.raises(OverflowError): - kernel[(1, )](value, x) - else: - kernel[(1, )](value, x) - - -# ---------------- -# test constexpr -# ---------------- - -@pytest.mark.parametrize("op", ['+', '-', '*', '/', '%', '<', '>', '<<', '>>', '&', '^', '|']) -@pytest.mark.parametrize("is_lhs_constexpr", [False, True]) -@pytest.mark.parametrize("is_rhs_constexpr", [True, False]) -def test_bin_op_constexpr(op, is_lhs_constexpr, is_rhs_constexpr): - - @triton.jit - def kernel(Z, X, Y): - x = tl.load(X) - y = tl.load(Y) - z = GENERATE_TEST_HERE - tl.store(Z, z) - - if op in ['<<', '>>', '&', '^', '|']: # int op - x_str = "3" if is_lhs_constexpr else "x" - y_str = "4" if is_rhs_constexpr else "y" - x = numpy_random((1,), dtype_str="int32") - y = numpy_random((1,), dtype_str="int32") - else: - x_str = "3.14" if is_lhs_constexpr else "x" - y_str = "4.13" if is_rhs_constexpr else "y" - x = numpy_random((1,), dtype_str="float32") - y = numpy_random((1,), dtype_str="float32") - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f"{x_str} {op} {y_str}"}) - z = np.array(eval(f"{x_str} {op} {y_str}")) - x_tri = to_triton(x) - y_tri = to_triton(y) - z_tri = to_triton(np.empty((1,), dtype=z.dtype)) - kernel[(1,)](z_tri, x_tri, y_tri) - np.testing.assert_allclose(z, to_numpy(z_tri)) - - -def test_constexpr_shape(): - - @triton.jit - def kernel(X): - off = tl.arange(0, 128 + 128) - tl.store(X + off, off) - - x_tri = to_triton(np.empty((256, ), dtype=np.int32)) - kernel[(1,)](x_tri) - np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256)) - - -def test_constexpr_scalar_shape(): - - @triton.jit - def kernel(X, s): - off = tl.arange(0, 256) - val = off % (256 // s) - tl.store(X + off, val) - - x_tri = to_triton(np.empty((256, ), dtype=np.int32)) - kernel[(1,)](x_tri, 32) - np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256) % 8) - -# ------------- -# test call -# ------------- - - -@triton.jit -def val_multiplier(val, i): - return val * i - - -@triton.jit(noinline=True) -def val_multiplier_noinline(val, i): - return val * i - - -@triton.jit -def vecmul_kernel(ptr, n_elements, 
rep, type: tl.constexpr): - pid = tl.program_id(axis=0) - offsets = pid * 128 + tl.arange(0, 128) - mask = offsets < n_elements - vec = tl.load(ptr + offsets, mask=mask) - for i in range(1, rep): - if type == "inline": - vec = val_multiplier(vec, i) - else: - vec = val_multiplier_noinline(vec, i) - tl.store(ptr + offsets, vec, mask=mask) - - -@pytest.mark.parametrize("type", ["inline", "noinline"]) -def test_call(type): - - @triton.jit - def kernel(ptr, n_elements, num1, num2, type: tl.constexpr): - vecmul_kernel(ptr, n_elements, num1, type) - vecmul_kernel(ptr, n_elements, num2, type) - - size = 1024 - rand_val = numpy_random((size,), dtype_str="float32") - rand_val_tri = to_triton(rand_val, device='cuda') - err_msg = "" - try: - kernel[(size // 128,)](rand_val_tri, size, 3, 5, type) - except Exception as e: - err_msg = str(e) - - if type == "noinline": - assert err_msg is not "" - else: - ans = rand_val * 1 * 2 * 1 * 2 * 3 * 4 - np.testing.assert_equal(to_numpy(rand_val_tri), ans) - -# ------------- -# test if -# ------------- - - -@pytest.mark.parametrize("if_type", ["if", "if_exp", "if_and"]) -def test_if(if_type): - - @triton.jit - def kernel(Cond, XTrue, XFalse, Ret, IfType: tl.constexpr, BoolVar: tl.constexpr): - pid = tl.program_id(0) - cond = tl.load(Cond) - if IfType == "if": - if pid % 2 == 0: - tl.store(Ret, tl.load(XTrue)) - else: - tl.store(Ret, tl.load(XFalse)) - elif IfType == "if_exp": - tl.store(Ret, tl.load(XTrue)) if pid % 2 else tl.store(Ret, tl.load(XFalse)) - elif IfType == "if_and": - if BoolVar and pid % 2 == 0: - tl.store(Ret, tl.load(XTrue)) - else: - tl.store(Ret, tl.load(XFalse)) - - cond = torch.ones(1, dtype=torch.int32, device='cuda') - x_true = torch.tensor([3.14], dtype=torch.float32, device='cuda') - x_false = torch.tensor([1.51], dtype=torch.float32, device='cuda') - ret = torch.empty(1, dtype=torch.float32, device='cuda') - kernel[(1,)](cond, x_true, x_false, ret, if_type, True) - assert torch.equal(ret, x_true) - - -def test_num_warps_pow2(): - dst = torch.empty(128, device='cuda') - - @triton.jit - def _kernel(dst): - pass - - with pytest.raises(AssertionError, match='must be a power of 2'): - _kernel[(1,)](dst=dst, num_warps=3) - _kernel[(1,)](dst=dst, num_warps=1) - _kernel[(1,)](dst=dst, num_warps=2) - _kernel[(1,)](dst=dst, num_warps=4) - -# ------------- -# test extern -# ------------- - - -@pytest.mark.parametrize("dtype_str, expr, lib_path", - [('int32', 'math.ffs', ''), - ('float32', 'math.log2', ''), - ('float32', 'math.scalbn', ''), - ('float32', 'math.pow', tl.math.libdevice_path()), - ('float64', 'math.pow_dtype', tl.math.libdevice_path()), - ('float64', 'math.norm4d', '')]) -def test_math_tensor(dtype_str, expr, lib_path): - - @triton.jit - def kernel(X, Y, BLOCK: tl.constexpr): - x = tl.load(X + tl.arange(0, BLOCK)) - y = GENERATE_TEST_HERE - tl.store(Y + tl.arange(0, BLOCK), y) - - shape = (128, ) - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random(shape, dtype_str=dtype_str, rs=rs) - - if expr == 'math.log2': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.broadcast_to(tl.{expr}(5.0), x.shape)'}) - y_ref = np.log2(5.0) - elif expr == 'math.ffs': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x)'}) - y_ref = np.zeros(shape, dtype=x.dtype) - for i in range(shape[0]): - y_ref[i] = (int(x[i]) & int(-x[i])).bit_length() - elif expr == 'math.scalbn': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, 2)'}) - y_ref = x * pow(2, 2) - elif 
expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.math.pow(x, 0.5)'}) - y_ref = np.power(x, 0.5) - elif expr == 'math.pow': - # numpy does not allow negative factors in power, so we use abs() - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, x)'}) - y_ref = np.power(x, x) - elif expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, 0.5)'}) - y_ref = np.power(x, 0.5) - elif expr == 'math.norm4d': - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{expr}(x, x, x, x)'}) - y_ref = np.sqrt(4 * np.power(x, 2)) - - x_tri = to_triton(x) - # triton result - y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') - kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) - # compare - if expr == 'math.ffs': - np.testing.assert_equal(y_ref, to_numpy(y_tri)) - else: - np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) - - -@pytest.mark.parametrize("dtype_str, expr, lib_path", - [('float32', 'math.pow', ''), - ('float64', 'math.pow_dtype', ''), - ('float64', 'math.pow', tl.math.libdevice_path())]) -def test_math_scalar(dtype_str, expr, lib_path): - - @triton.jit - def kernel(X, Y, BLOCK: tl.constexpr): - x = X - y = GENERATE_TEST_HERE - tl.store(Y + tl.arange(0, BLOCK), y) - - shape = (128, ) - rs = RandomState(17) - # limit the range of integers so that the sum does not overflow - x = numpy_random((1,), dtype_str=dtype_str, rs=rs) - y_ref = np.zeros(shape, dtype=x.dtype) - - # numpy does not allow negative factors in power, so we use abs() - if expr == 'math.pow': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, x)'}) - y_ref[:] = np.power(x, x) - elif expr == 'math.pow_dtype': - x = np.abs(x) - kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.math.pow(x, 0.5)'}) - y_ref[:] = np.power(x, 0.5) - - # triton result - x_tri = to_triton(x)[0].item() - y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') - kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) - # compare - np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) - -# ----------------------- -# test control flow -# ----------------------- - - -@pytest.mark.parametrize("lo, hi, iv", [(2**35, 2**35 + 20, 1), (2**35, 2**35 + 20, 2), (2**35, 2**35 + 20, 3), - (15, -16, -1), (15, -16, -2), (15, -16, -3), - (-18, -22, -1), (22, 18, -1)]) -def test_for_iv(lo, hi, iv): - - @triton.jit - def kernel(Out, lo, hi, iv: tl.constexpr): - acc = 0 - acc = acc.to(tl.int64) - for i in range(lo, hi, iv): - acc += i - tl.store(Out, acc) - - lo = 2**35 - hi = 2**35 + 20 - out = to_triton(np.zeros((1,), dtype=np.int64), device='cuda') - kernel[(1,)](out, lo, hi, iv) - assert out[0] == sum(range(lo, hi, iv)) - - -def test_if_else(): - - @triton.jit - def kernel(Cond, TrueVal, FalseVal, Out): - if tl.load(Cond): - val = tl.load(TrueVal) - else: - val = tl.load(FalseVal) - tl.store(Out, val) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - true_val = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - false_val = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') - cond = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - # True - cond[0] = True - kernel[(1,)](cond, true_val, false_val, out) - assert to_numpy(out)[0] == true_val[0] - # False - cond[0] = False - kernel[(1,)](cond, true_val, 
false_val, out) - assert to_numpy(out)[0] == false_val[0] - - -@pytest.mark.parametrize("mode", ["dynamic", "static"]) -def test_if_return(mode): - - @triton.jit - def kernel(ExitEarly, Out, cond: tl.constexpr, mode: tl.constexpr): - if mode == "dynamic": - if tl.load(ExitEarly): - tl.store(Out, 0) - return - else: - if cond: - tl.store(Out, 0) - return - tl.store(Out, 1) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - exit_early = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - # exit early path taken - exit_early[0] = 1 - kernel[(1,)](exit_early, out, True, mode) - assert to_numpy(out)[0] == 0 - # exit early path not taken - exit_early[0] = 0 - kernel[(1,)](exit_early, out, False, mode) - assert to_numpy(out)[0] == 1 - - -@triton.jit -def add_fn(x): - return x + 1 - - -@triton.jit(noinline=True) -def add_fn_noinline(x): - return x + 1 - - -@triton.jit -def add_fn_return(x, pid): - if pid == 0: - return x + 1 - else: - return x + 2 - - -@triton.jit -def add_fn_expr(Out, x): - tl.store(Out, x) - - -@triton.jit -def add_fn_static_cond(x, cond: tl.constexpr): - if cond == "": - return x - else: - return x + 1 - - -@pytest.mark.parametrize("call_type", ["attribute", "attribute_jit", - "jit", "jit_if", "jit_ifexp", "jit_expr", - "jit_static_cond", "jit_noinline", "jit_extern"]) -def test_if_call(call_type): - @triton.jit - def kernel(Out, call_type: tl.constexpr): - pid = tl.program_id(0) - o = tl.load(Out) - if call_type == "attribute": - # call attribute - if pid == 0: - a = o - a = a.to(tl.int32).to(tl.int32) + 1 - o = a - elif call_type == "attribute_jit": - # call attribute and jit function - if pid == 0: - a = o - a = tl.load(Out + add_fn(a) - 1).to(tl.int32) + 1 - o = a - elif call_type == "jit": - if pid == 0: - # regular function call - a = o - a = add_fn(a) - o = a - elif call_type == "jit_if": - # function without end_if block - if pid == 0: - a = o - a = add_fn_return(a, pid) - o = a - elif call_type == "jit_ifexp": - # ifexp expression - if pid == 0: - a = o - a = add_fn(a) if pid == 0 else add_fn_return(a, pid) - o = a - elif call_type == "jit_expr": - # call without return - if pid == 0: - a = o + 1 - add_fn_expr(Out, a) - o = a - elif call_type == "jit_static_cond": - if pid == 0: - a = o + 1 - add_fn_static_cond(o, call_type) - o = a - elif call_type == "jit_noinline": - if pid == 0: - a = o + 1 - add_fn_noinline(a) - o = a - elif call_type == "jit_extern": - if pid == 0: - a = o + 1 - tl.cdiv(a, a) - o = a - - tl.store(Out, o) - - out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - kernel[(1,)](out, call_type) - assert to_numpy(out)[0] == 1 - - -@pytest.mark.parametrize("_cond1", [True, False]) -@pytest.mark.parametrize("_cond2", [True, False]) -@pytest.mark.parametrize("_cond3", [True, False]) -def test_nested_if_else_return(_cond1, _cond2, _cond3): - - @triton.jit - def kernel(Cond1, Cond2, Cond3, Val1, Val2, Val3, Out): - val = 0 - if tl.load(Cond1): - if tl.load(Cond2): - val = tl.load(Val1) - else: - return - else: - if tl.load(Cond3): - val = tl.load(Val2) - else: - val = tl.load(Val3) - tl.store(Out, val) - - out = to_triton(np.full((1,), -1, dtype=np.int32), device='cuda') - cond1 = to_triton(np.full((1,), _cond1, dtype=np.int32), device='cuda') - cond2 = to_triton(np.full((1,), _cond2, dtype=np.int32), device='cuda') - cond3 = to_triton(np.full((1,), _cond3, dtype=np.int32), device='cuda') - val1 = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - val2 = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') 
- val3 = to_triton(np.full((1,), 3, dtype=np.int32), device='cuda') - kernel[(1,)](cond1, cond2, cond3, val1, val2, val3, out) - targets = { - (True, True, True): val1[0], - (True, True, False): val1[0], - (True, False, True): out[0], - (True, False, False): out[0], - (False, True, True): val2[0], - (False, True, False): val3[0], - (False, False, True): val2[0], - (False, False, False): val3[0], - } - assert out[0] == targets[(_cond1, _cond2, _cond3)] - - -def test_while(): - - @triton.jit - def kernel(InitI, Bound, CutOff, OutI, OutJ): - init_i = tl.load(InitI) - curr_i = init_i - j = 0 - while curr_i == init_i and j < tl.load(Bound): - curr_i = curr_i + (j == tl.load(CutOff)) - j += 1 - tl.store(OutI, curr_i) - tl.store(OutJ, j) - - out_i = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - out_j = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') - init_i = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') - bound = to_triton(np.full((1,), 10, dtype=np.int32), device='cuda') - cut_off = to_triton(np.full((1,), 5, dtype=np.int32), device='cuda') - kernel[(1,)](init_i, bound, cut_off, out_i, out_j) - assert out_i[0] == init_i[0] + 1 - assert out_j[0] == cut_off[0] + 1 - -# def test_for_if(): - -# @triton.jit -# def kernel(bound, cutoff, M, N): -# m = 0 -# n = 0 -# for i in range(bound): -# if i > cutoff: -# m = m + 1 -# else: -# n = n + 1 -# tl.store(M, m) -# tl.store(N, n) - -# m = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') -# n = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') -# kernel[(1,)](10, 7, m, n) -# print(m[0]) -# print(n[0]) - -# ----------------------- -# test extra -# ----------------------- - - -def test_globaltimer(): - - @triton.jit - def kernel(Out1, Out2): - start = tl.extra.cuda.globaltimer() - off = tl.arange(0, 128) - for i in range(100): - tl.store(Out1 + off, tl.load(Out1 + off) + 1) - end = tl.extra.cuda.globaltimer() - tl.store(Out2, end - start) - - out1 = to_triton(np.zeros((128,), dtype=np.int64), device='cuda') - out2 = to_triton(np.zeros((1,), dtype=np.int64), device='cuda') - h = kernel[(1,)](out1, out2) - assert out2[0] > 0 - # 2 inlined globaltimers + one extra in the wrapper extern function - assert h.asm["ptx"].count("%globaltimer") == 3 - - -def test_smid(): - - @triton.jit - def kernel(Out): - tl.store(Out + tl.program_id(0), tl.extra.cuda.smid()) - - out = to_triton(np.zeros((1024,), dtype=np.int32), device='cuda') - h = kernel[(out.shape[0],)](out) - assert out.sort()[0].unique().shape[0] > 0 - assert h.asm["ptx"].count("%smid") == 2 - -# ----------------------- -# test layout conversions -# ----------------------- -# TODO: backend should be tested separately - - -layouts = [ - # MmaLayout(version=1, warps_per_cta=[1, 4]), - MmaLayout(version=(2, 0), warps_per_cta=[1, 4]), - # MmaLayout(version=1, warps_per_cta=[4, 1]), - MmaLayout(version=(2, 0), warps_per_cta=[4, 1]), - BlockedLayout([1, 8], [2, 16], [4, 1], [1, 0]), - BlockedLayout([1, 4], [4, 8], [2, 2], [1, 0]), - BlockedLayout([1, 1], [1, 32], [2, 2], [1, 0]), - BlockedLayout([8, 1], [16, 2], [1, 4], [0, 1]), - BlockedLayout([4, 1], [8, 4], [2, 2], [0, 1]), - BlockedLayout([1, 1], [32, 1], [2, 2], [0, 1]), - BlockedLayout([4, 4], [1, 32], [4, 1], [1, 0]) -] - -intermediate_layouts = [ - None, - SharedLayout(1, 1, 1, [1, 0]), - SharedLayout(4, 2, 4, [1, 0]), - SharedLayout(2, 2, 4, [1, 0]), -] - - -@pytest.mark.parametrize("shape", [(128, 128)]) -@pytest.mark.parametrize("dtype", ['float16']) -@pytest.mark.parametrize("src_layout", layouts) 
-@pytest.mark.parametrize("interm_layout", intermediate_layouts) -@pytest.mark.parametrize("dst_layout", layouts) -def test_convert2d(dtype, shape, src_layout, interm_layout, dst_layout, device='cuda'): - if str(src_layout) == str(dst_layout): - pytest.skip() - if 'mma' in str(src_layout) and 'mma' in str(dst_layout): - pytest.skip() - - layouts = f""" - #src = {src_layout} - #dst = {dst_layout} - """ if interm_layout is None else f""" - #src = {src_layout} - #interm = {interm_layout} - #dst = {dst_layout} - """ - - conversion = f""" - %12 = triton_gpu.convert_layout %9 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #dst> - %13 = triton_gpu.convert_layout %11 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #dst> - """ if interm_layout is None else f""" - %15 = triton_gpu.convert_layout %9 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #interm> - %16 = triton_gpu.convert_layout %15 : (tensor<128x128xi32, #interm>) -> tensor<128x128xi32, #src> - %17 = triton_gpu.convert_layout %11 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #interm> - %18 = triton_gpu.convert_layout %17 : (tensor<128x128xf16, #interm>) -> tensor<128x128xf16, #src> - - %12 = triton_gpu.convert_layout %16 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #dst> - %13 = triton_gpu.convert_layout %18 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #dst> - """ - - ir = layouts + """ - module attributes {"triton_gpu.num-warps" = 4 : i32} { - tt.func public @kernel_0d1d(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { - %cst = arith.constant dense<128> : tensor<128x1xi32, #src> - %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>> - %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>> - %2 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #src> - %4 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>>) -> tensor<128x1xi32, #src> - %5 = arith.muli %4, %cst : tensor<128x1xi32, #src> - %6 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>>) -> tensor<1x128xi32, #src> - %7 = tt.broadcast %6 : (tensor<1x128xi32, #src>) -> tensor<128x128xi32, #src> - %8 = tt.broadcast %5 : (tensor<128x1xi32, #src>) -> tensor<128x128xi32, #src> - %9 = arith.addi %8, %7 : tensor<128x128xi32, #src> - %10 = tt.addptr %2, %9 : tensor<128x128x!tt.ptr, #src>, tensor<128x128xi32, #src> - %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf16, #src> - %3 = tt.splat %arg1 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #dst> - """ + conversion + """ - %14 = tt.addptr %3, %12 : tensor<128x128x!tt.ptr, #dst>, tensor<128x128xi32, #dst> - tt.store %14, %13 : tensor<128x128xf16, #dst> - tt.return - } -} -""" - - x = to_triton(numpy_random(shape, dtype_str=dtype)) - z = torch.empty_like(x) - - # write the IR to a temporary file using mkstemp - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: - f.write(ir) - f.flush() - kernel = triton.compile(f.name) - kernel[(1, 1, 1)](x.data_ptr(), z.data_ptr()) - - assert torch.equal(z, x) - - -def test_load_scalar_with_mask(): - @triton.jit - def kernel(Input, Index, Out, N: int): - index = tl.load(Index) - scalar = tl.load(Input + index, mask=index < N, other=0) - tl.store(Out, scalar, mask=index < N) - Index = torch.tensor([0], dtype=torch.int32, device='cuda') - 
Input = torch.tensor([0], dtype=torch.int32, device='cuda') - Out = torch.empty_like(Index, device='cuda') - kernel[(1,)](Input, Index, Out, Index.numel()) - assert Out.data[0] == 0 - - -# This test is used to test our own PTX codegen for float16 and int16 conversions -# maybe delete it later after ptxas has been fixed -@pytest.mark.parametrize("dtype_str", ['float16', 'int16']) -def test_ptx_cast(dtype_str): - @triton.jit - def kernel(in_ptr0, out_ptr2, xnumel, rnumel, dtype: tl.constexpr, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr): - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None] - xmask = xindex < xnumel - rbase = tl.arange(0, RBLOCK)[None, :] - x0 = xindex - _tmp4 = (tl.zeros([XBLOCK, RBLOCK], dtype) - 10000).to(dtype) - for roffset in range(0, rnumel, RBLOCK): - rindex = roffset + rbase - rmask = rindex < rnumel - r1 = rindex - tmp0 = tl.load(in_ptr0 + (r1 + (197 * x0)), rmask & xmask).to(dtype) - tmp1 = 2 - tmp2 = tmp0 * tmp1 - tmp3 = tmp2.to(dtype) - tmp5 = _tmp4 < tmp3 - _tmp4 = tl.where(rmask & xmask & tmp5, tmp3, _tmp4) - tl.store(out_ptr2 + (r1 + (197 * x0) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), _tmp4, rmask & xmask) - - torch.manual_seed(123) - if dtype_str == 'int16': - torch_dtype = torch.int16 - triton_dtype = tl.int32 - else: - torch_dtype = torch.float16 - triton_dtype = tl.float32 - - s0 = 4 - buf11 = -torch.ones((6 * s0, 197, 197), device='cuda', dtype=torch_dtype) - buf14 = -torch.ones((s0, 6, 197, 197), device='cuda', dtype=torch_dtype) - kernel[(4728,)](buf11, buf14, 1182 * s0, 197, triton_dtype, 1, 256, num_warps=2) - assert buf14.to(torch.float32).mean() == -2.0 diff --git a/python/test/unit/language/test_random.py b/python/test/unit/language/test_random.py deleted file mode 100644 index 39ae59e35a8d..000000000000 --- a/python/test/unit/language/test_random.py +++ /dev/null @@ -1,198 +0,0 @@ -import numpy as np -import pytest -import scipy.stats -import torch - -import triton -import triton.language as tl - -##################################### -# Reference Philox Implementation -##################################### - - -class PhiloxConfig: - def __init__(self, PHILOX_ROUND_A, PHILOX_ROUND_B, PHILOX_KEY_A, PHILOX_KEY_B, DTYPE): - self.PHILOX_ROUND_A = np.array(PHILOX_ROUND_A, dtype=DTYPE) - self.PHILOX_ROUND_B = np.array(PHILOX_ROUND_B, dtype=DTYPE) - self.PHILOX_KEY_A = np.array(PHILOX_KEY_A, dtype=DTYPE) - self.PHILOX_KEY_B = np.array(PHILOX_KEY_B, dtype=DTYPE) - self.DTYPE = DTYPE - - -# This is better for GPU -PHILOX_32 = PhiloxConfig( - PHILOX_KEY_A=0x9E3779B9, - PHILOX_KEY_B=0xBB67AE85, - PHILOX_ROUND_A=0xD2511F53, - PHILOX_ROUND_B=0xCD9E8D57, - DTYPE=np.uint32, -) - -# This is what numpy implements -PHILOX_64 = PhiloxConfig( - PHILOX_KEY_A=0x9E3779B97F4A7C15, - PHILOX_KEY_B=0xBB67AE8584CAA73B, - PHILOX_ROUND_A=0xD2E7470EE14C6C93, - PHILOX_ROUND_B=0xCA5A826395121157, - DTYPE=np.uint64, -) - - -class CustomPhilox4x: - def __init__(self, seed, config): - self._config = config - seed = self._into_pieces(seed) - self._key = np.array(seed[:2], dtype=self._dtype) - self._counter = np.array((0, 0) + seed[2:], dtype=self._dtype) - - @property - def _dtype(self): - return self._config.DTYPE - - def _into_pieces(self, n, pad=4): - res = [] - while len(res) < pad: - res.append(np.array(n, dtype=self._dtype)) - n >>= (np.dtype(self._dtype).itemsize * 8) - assert n == 0 - return tuple(res) - - def _multiply_low_high(self, a, b): - low = a * b - high = int(a) * int(b) - high = np.array(high >> (np.dtype(self._dtype).itemsize * 
8), dtype=self._dtype) - return low, high - - def _single_round(self, counter, key): - lo0, hi0 = self._multiply_low_high(self._config.PHILOX_ROUND_A, counter[0]) - lo1, hi1 = self._multiply_low_high(self._config.PHILOX_ROUND_B, counter[2]) - ret0 = hi1 ^ counter[1] ^ key[0] - ret1 = lo1 - ret2 = hi0 ^ counter[3] ^ key[1] - ret3 = lo0 - return np.array([ret0, ret1, ret2, ret3], dtype=self._dtype) - - def _raise_key(self, key): - pk = [self._config.PHILOX_KEY_A, self._config.PHILOX_KEY_B] - return key + np.array(pk, dtype=self._dtype) - - def random_raw(self): - counter = self._counter - key = self._key - for _ in range(10): - counter = self._single_round(counter, key) - key = self._raise_key(key) - self.advance(1) - return counter - - def advance(self, n_steps): - self._counter[0] += n_steps - assert self._counter[0] < 2**32, "FIXME: doesn't work for large offsets" - - -class CustomPhilox(CustomPhilox4x): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.buffer = [] - - def random_raw(self): - if len(self.buffer) == 0: - self.buffer = list(super().random_raw())[::-1] - return int(self.buffer.pop()) - - -##################################### -# Unit Tests -##################################### - -BLOCK = 1024 - -# test generation of random uint32 - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in ['10', '4,53', '10000'] - for seed in [0, 42, 124, 54, 0xffffffff, 0xdeadbeefcafeb0ba]] - ) -def test_randint(size, seed, device='cuda'): - size = list(map(int, size.split(','))) - - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.randint(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.int32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - out_tri = x.cpu().numpy().astype(np.uint32).flatten().tolist() - # reference result - gen = CustomPhilox4x(seed, config=PHILOX_32) - out_ref = [gen.random_raw()[0] for _ in out_tri] - assert out_tri == out_ref - -# test uniform PRNG - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in [1000000] - for seed in [0, 42, 124, 54]] - ) -def test_rand(size, seed, device='cuda'): - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.rand(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.float32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - assert all((x >= 0) & (x <= 1)) - assert scipy.stats.kstest(x.tolist(), 'uniform', args=(0, 1)).statistic < 0.01 - -# test normal PRNG - - -@pytest.mark.parametrize('size, seed', - [(size, seed) for size in [1000000] - for seed in [0, 42, 124, 54]] - ) -def test_randn(size, seed, device='cuda'): - @triton.jit - def kernel(X, N, seed): - offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) - rand = tl.randn(seed, offset) - tl.store(X + offset, rand, mask=offset < N) - # triton result - x = torch.empty(size, dtype=torch.float32, device=device) - N = x.numel() - grid = (triton.cdiv(N, BLOCK),) - kernel[grid](x, N, seed) - assert abs(x.mean()) < 1e-2 - assert abs(x.std() - 1) < 1e-2 - - -# tl.rand() should never produce >=1.0 - -def test_rand_limits(): - @triton.jit - def kernel(input, output, n: tl.constexpr): - idx = tl.arange(0, n) - x = tl.load(input + idx) - y = tl.random.uint32_to_uniform_float(x) - tl.store(output + idx, y) 
- - min_max_int32 = torch.tensor([ - torch.iinfo(torch.int32).min, - torch.iinfo(torch.int32).max, - ], dtype=torch.int32, device='cuda') - output = torch.empty(2, dtype=torch.float32, device='cuda') - kernel[(1,)](min_max_int32, output, 2) - - assert output[0] == output[1] - assert 1.0 - torch.finfo(torch.float32).eps <= output[0].item() < 1.0 diff --git a/python/test/unit/language/test_subprocess.py b/python/test/unit/language/test_subprocess.py deleted file mode 100644 index 3263166d8c28..000000000000 --- a/python/test/unit/language/test_subprocess.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import subprocess -import sys - -import pytest - -dir_path = os.path.dirname(os.path.realpath(__file__)) -print_path = os.path.join(dir_path, "print_helper.py") -assert_path = os.path.join(dir_path, "assert_helper.py") - -# TODO: bfloat16 after LLVM-15 -func_types = ["device_assert", "assert", "static_assert", "no_debug"] -nested_types = [(caller, callee) for caller in ["true", "false", "none"] for callee in ["true", "false", "none"]] -torch_types = ["int8", "uint8", "int16", "int32", "long", "float16", "float32", "float64"] - - -@pytest.mark.parametrize("func_type, data_type", - [("device_print", data_type) for data_type in torch_types] + [("print", "int32"), ("static_print", "int32")]) -def test_print(func_type: str, data_type: str): - proc = subprocess.Popen([sys.executable, print_path, func_type, data_type], stdout=subprocess.PIPE, shell=False) - outs, _ = proc.communicate() - outs = outs.split() - new_lines = set() - for line in outs: - try: - value = line - if func_type != "static_print": - value = int(float(line)) - new_lines.add(value) - except Exception as e: - print(e) - if func_type != "static_print": - for i in range(128): - assert i in new_lines - assert len(new_lines) == 128 - else: - assert len(new_lines) == 1 - - -@pytest.mark.parametrize("func_type", func_types) -def test_assert(func_type: str): - os.environ["TRITON_DEBUG"] = "1" - proc = subprocess.Popen([sys.executable, assert_path, func_type], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False) - _, errs = proc.communicate() - errs = errs.splitlines() - num_errs = 0 - for err in errs: - if "x != 0" in err.decode("utf-8"): - num_errs += 1 - os.environ["TRITON_DEBUG"] = "0" - if func_type != "static_assert": - assert num_errs == 127 - else: - assert num_errs == 0 - - -@pytest.mark.parametrize("caller_type, callee_type", nested_types) -def test_assert_nested(caller_type, callee_type): - proc = subprocess.Popen([sys.executable, assert_path, caller_type, callee_type], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False) - _, errs = proc.communicate() - errs = errs.splitlines() - num_errs = 0 - for err in errs: - if "x != 0" in err.decode("utf-8"): - num_errs += 1 - if caller_type == "none": - if callee_type == "true": - assert num_errs == 127 - else: - assert num_errs == 0 - elif caller_type == "true": - if callee_type == "false": - assert num_errs == 0 - else: - assert num_errs == 127 - elif caller_type == "false": - if callee_type == "true": - assert num_errs == 127 - else: - assert num_errs == 0 diff --git a/python/test/unit/operators/test_blocksparse.py b/python/test/unit/operators/test_blocksparse.py deleted file mode 100644 index 5f94cd8b31bf..000000000000 --- a/python/test/unit/operators/test_blocksparse.py +++ /dev/null @@ -1,219 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -def sparsify_tensor(x, mask, block): - ret = torch.empty((x.size(0), mask.sum(), block, block), 
dtype=x.dtype, device=x.device) - for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))): - ret[:, idx, :, :] = x[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] - return ret - - -def make_pair(shape, device="cuda", alpha=1e-2, beta=0., trans=False, data=None, dtype=torch.float32): - if data is None: - data = torch.randn(shape, dtype=torch.float32, requires_grad=True, device=device) - ref_ret = data - ref_ret = ref_ret * alpha + beta - ref_ret = ref_ret.half().to(dtype) - if trans: - ref_ret = ref_ret.t().requires_grad_() - ref_ret = ref_ret.detach().requires_grad_() - tri_ret = ref_ret.clone().detach().requires_grad_() - return ref_ret, tri_ret - - -def mask_tensor(x, mask, block, value=0): - ret = x.clone() - for h, i, j in zip(*(mask == 0).nonzero(as_tuple=True)): - ret[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] = value - return ret - - -@pytest.mark.parametrize("MODE", ["sdd", "dds", "dsd"]) -@pytest.mark.parametrize("TRANS_A", [False, True]) -@pytest.mark.parametrize("TRANS_B", [False, True]) -@pytest.mark.parametrize("BLOCK", [16, 32, 64]) -@pytest.mark.parametrize("DTYPE", [torch.float16]) -def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=3, H=2, M=512, N=384, K=256): - seed = 0 - torch.manual_seed(seed) - is_sdd = MODE == "sdd" - is_dsd = MODE == "dsd" - is_dds = MODE == "dds" - do_sparsify = lambda x: sparsify_tensor(x, layout, BLOCK) - do_mask = lambda x: mask_tensor(x, layout, BLOCK) - # create inputs - # create op - a_shape = (Z, H, K, M) if TRANS_A else (Z, H, M, K) - b_shape = (Z, H, N, K) if TRANS_B else (Z, H, K, N) - c_shape = (Z, H, M, N) - shape = { - "sdd": (M, N), - "dsd": (a_shape[2], a_shape[3]), - "dds": (b_shape[2], b_shape[3]), - }[MODE] - layout = torch.randint(2, (H, shape[0] // BLOCK, shape[1] // BLOCK)) - layout[1, 2, :] = 0 - layout[1, :, 1] = 0 - # create data - a_ref, a_tri = make_pair(a_shape, alpha=.1, dtype=DTYPE) - b_ref, b_tri = make_pair(b_shape, alpha=.1, dtype=DTYPE) - dc_ref, dc_tri = make_pair(c_shape, dtype=DTYPE) - # compute [torch] - dc_ref = do_mask(dc_ref) if is_sdd else dc_ref - a_ref = do_mask(a_ref) if is_dsd else a_ref - b_ref = do_mask(b_ref) if is_dds else b_ref - a_ref.retain_grad() - b_ref.retain_grad() - c_ref = torch.matmul(a_ref.transpose(2, 3) if TRANS_A else a_ref, - b_ref.transpose(2, 3) if TRANS_B else b_ref) - c_ref.backward(dc_ref) - c_ref = do_sparsify(c_ref) if is_sdd else c_ref - da_ref = do_sparsify(a_ref.grad) if is_dsd else a_ref.grad - db_ref = do_sparsify(b_ref.grad) if is_dds else b_ref.grad - # triton result - dc_tri = do_sparsify(dc_tri) if is_sdd else dc_tri - a_tri = do_sparsify(a_tri) if is_dsd else a_tri - b_tri = do_sparsify(b_tri) if is_dds else b_tri - a_tri.retain_grad() - b_tri.retain_grad() - op = triton.ops.blocksparse.matmul(layout, BLOCK, MODE, trans_a=TRANS_A, trans_b=TRANS_B, device="cuda") - c_tri = op(a_tri, b_tri) - c_tri.backward(dc_tri) - da_tri = a_tri.grad - db_tri = b_tri.grad - # compare - torch.testing.assert_allclose(c_ref, c_tri) - torch.testing.assert_allclose(da_ref, da_tri) - torch.testing.assert_allclose(db_ref, db_tri) - - -configs = [ - (16, 256), - (32, 576), - (64, 1871), - (128, 2511), -] - - -@pytest.mark.parametrize("is_dense", [False, True]) -@pytest.mark.parametrize("BLOCK, WIDTH", configs) -def test_softmax(BLOCK, WIDTH, is_dense, Z=2, H=2, is_causal=True, scale=0.4): - # set seed - torch.random.manual_seed(0) - Z, H, M, N = 2, 3, WIDTH, WIDTH - # initialize layout - # make sure each row has at least one non-zero element - 
layout = torch.randint(2, (H, M // BLOCK, N // BLOCK)) - if is_dense: - layout[:] = 1 - else: - layout[1, 2, :] = 0 - layout[1, :, 1] = 0 - # initialize data - a_shape = (Z, H, M, N) - a_ref, a_tri = make_pair(a_shape) - dout_ref, dout_tri = make_pair(a_shape) - # compute [torch] - a_ref = mask_tensor(a_ref, layout, BLOCK, value=float("-inf")) - a_ref.retain_grad() - at_mask = torch.ones((M, N), device="cuda") - if is_causal: - at_mask = torch.tril(at_mask) - M = at_mask[None, None, :, :] + torch.zeros_like(a_ref) - a_ref[M == 0] = float("-inf") - out_ref = torch.softmax(a_ref * scale, -1) - out_ref.backward(dout_ref) - out_ref = sparsify_tensor(out_ref, layout, BLOCK) - da_ref = sparsify_tensor(a_ref.grad, layout, BLOCK) - # compute [triton] - a_tri = sparsify_tensor(a_tri, layout, BLOCK) - a_tri.retain_grad() - dout_tri = sparsify_tensor(dout_tri, layout, BLOCK) - op = triton.ops.blocksparse.softmax(layout, BLOCK, device="cuda", is_dense=is_dense) - out_tri = op(a_tri, scale=scale, is_causal=is_causal) - out_tri.backward(dout_tri) - da_tri = a_tri.grad - # compare - torch.testing.assert_allclose(out_tri, out_ref) - torch.testing.assert_allclose(da_tri, da_ref) - - -@pytest.mark.parametrize("block", [16, 32, 64]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) -def test_attention_fwd_bwd( - block, - dtype, - input_scale=1.0, - scale=1 / 8.0, - n_ctx=256, - batch_size=2, - n_heads=2, -): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - - # inputs - qkv_shape = (batch_size, n_heads, n_ctx, 64) - qkvs = [ - torch.nn.Parameter(input_scale * torch.randn(qkv_shape), requires_grad=True).to(dtype).cuda() for _ in range(3) - ] - - # Triton: - n_blocks = n_ctx // block - layout = torch.tril(torch.ones([n_heads, n_blocks, n_blocks], dtype=torch.long)) - query, key, value = [x.clone() for x in qkvs] - query.retain_grad() - key.retain_grad() - value.retain_grad() - attn_out = triton_attention(layout, block, query=query, key=key, value=value, scale=scale) - # ad hoc loss - loss = (attn_out ** 2).mean() - loss.backward() - grads = [query.grad, key.grad, value.grad] - - # Torch version: - torch_q, torch_k, torch_v = [x.clone() for x in qkvs] - attn_mask = torch.ones([n_ctx, n_ctx], device="cuda", dtype=dtype) - attn_mask = torch.tril(attn_mask, diagonal=0) - attn_mask = 1e6 * (-1 + (attn_mask.reshape((1, 1, n_ctx, n_ctx)).cuda())) - torch_q.retain_grad() - torch_k.retain_grad() - torch_v.retain_grad() - scores = scale * torch.einsum("bhsd,bhtd->bhst", torch_q, torch_k) - scores = scores + attn_mask - probs = torch.softmax(scores, dim=-1) - torch_attn_out = torch.einsum("bhst,bhtd->bhsd", probs, torch_v) - # ad hoc loss - torch_loss = (torch_attn_out ** 2).mean() - torch_loss.backward() - torch_grads = [torch_q.grad, torch_k.grad, torch_v.grad] - - # comparison - # print(f"Triton loss {loss} and torch loss {torch_loss}. 
Also checking grads...") - torch.testing.assert_allclose(loss, torch_loss, atol=1e-3, rtol=0) - for g1, g2 in zip(grads, torch_grads): - torch.testing.assert_allclose(g1, g2) - - -@pytest.mark.parametrize("block", [16, 32, 64]) -def triton_attention( - layout, - block: int, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, -): - sparse_dot_sdd_nt = triton.ops.blocksparse.matmul(layout, block, "sdd", trans_a=False, trans_b=True, device=value.device) - sparse_dot_dsd_nn = triton.ops.blocksparse.matmul(layout, block, "dsd", trans_a=False, trans_b=False, device=value.device) - sparse_softmax = triton.ops.blocksparse.softmax(layout, block, device=value.device) - - w = sparse_dot_sdd_nt(query, key) - w = sparse_softmax(w, scale=scale, is_causal=True) - a = sparse_dot_dsd_nn(w, value) - return a diff --git a/python/test/unit/operators/test_cross_entropy.py b/python/test/unit/operators/test_cross_entropy.py deleted file mode 100644 index 20d57ece57a9..000000000000 --- a/python/test/unit/operators/test_cross_entropy.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize("M, N, dtype, mode", - [ - (M, N, dtype, mode) for M in [1024, 821] - for N in [512, 857, 1871, 2089, 8573, 31000] - for dtype in ['float16', 'float32'] - for mode in ['forward', 'backward'] - ] - ) -def test_op(M, N, dtype, mode): - capability = torch.cuda.get_device_capability() - if capability[0] < 8 and dtype == "bfloat16": - pytest.skip("Only test bfloat16 on devices with sm >= 80") - dtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[dtype] - # create inputs - x = torch.randn(M, N, dtype=dtype, device='cuda', requires_grad=True) - idx = 4 + torch.ones(M, dtype=torch.int64, device='cuda') - # forward pass - tt_y = triton.ops.cross_entropy(x, idx) - th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx) - if mode == 'forward': - torch.testing.assert_allclose(th_y, tt_y) - # backward pass - elif mode == 'backward': - dy = torch.randn_like(tt_y) - # triton backward - tt_y.backward(dy) - tt_dx = x.grad.clone() - # torch backward - x.grad.zero_() - th_y.backward(dy) - th_dx = x.grad.clone() - torch.testing.assert_allclose(th_dx, tt_dx) diff --git a/python/test/unit/operators/test_flash_attention.py b/python/test/unit/operators/test_flash_attention.py deleted file mode 100644 index c9d8babe342f..000000000000 --- a/python/test/unit/operators/test_flash_attention.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) -@pytest.mark.parametrize('dtype', [torch.float16, torch.bfloat16]) -def test_op(Z, H, N_CTX, D_HEAD, dtype): - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - pytest.skip("Flash attention only supported for compute capability < 80") - torch.manual_seed(20) - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - dout = torch.randn_like(q) - # reference implementation - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) - p = torch.matmul(q, k.transpose(2, 3)) * sm_scale - for z in range(Z): - for h in range(H): - p[:, :, M == 0] = 
float("-inf") - p = torch.softmax(p.float(), dim=-1).to(dtype) - # p = torch.exp(p) - ref_out = torch.matmul(p, v) - ref_out.backward(dout) - ref_dv, v.grad = v.grad.clone(), None - ref_dk, k.grad = k.grad.clone(), None - ref_dq, q.grad = q.grad.clone(), None - # # triton implementation - tri_out = triton.ops.attention(q, k, v, sm_scale) - # print(ref_out) - # print(tri_out) - tri_out.backward(dout) - tri_dv, v.grad = v.grad.clone(), None - tri_dk, k.grad = k.grad.clone(), None - tri_dq, q.grad = q.grad.clone(), None - # compare - atol = 1e-1 if dtype == torch.bfloat16 else 1e-2 - torch.testing.assert_allclose(ref_out, tri_out, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dv, tri_dv, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dk, tri_dk, atol=atol, rtol=0) - torch.testing.assert_allclose(ref_dq, tri_dq, atol=atol, rtol=0) diff --git a/python/test/unit/operators/test_inductor.py b/python/test/unit/operators/test_inductor.py deleted file mode 100644 index f7e2ce2aa7e0..000000000000 --- a/python/test/unit/operators/test_inductor.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def test_normalization_with_remat(): - - @triton.jit - def triton_(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, rnumel, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr): - xnumel = 512 - rnumel = 4096 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None] - xmask = xindex < xnumel - rbase = tl.arange(0, RBLOCK)[None, :] - x3 = xindex - x0 = xindex % 64 - tmp1 = tl.load(in_ptr0 + (x0), xmask) - tmp3 = tl.load(in_ptr1 + (x0), xmask) - tmp11 = tl.load(in_ptr2 + (x0), xmask) - tmp13 = tl.load(in_ptr3 + (x0), xmask) - _tmp17 = tl.zeros([XBLOCK, RBLOCK], tl.float32) + 0 - for roffset in range(0, rnumel, RBLOCK): - rindex = roffset + rbase - rmask = rindex < rnumel - r2 = rindex - tmp0 = tl.load(in_out_ptr0 + (r2 + (4096 * x3)), rmask & xmask, eviction_policy='evict_last', other=0) - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = tl.sqrt(tmp5) - tmp7 = 1 / tmp6 - tmp8 = 1.0 - tmp9 = tmp7 * tmp8 - tmp10 = tmp2 * tmp9 - tmp12 = tmp10 * tmp11 - tmp14 = tmp12 + tmp13 - _tmp17 = tl.where(rmask & xmask, _tmp17 + tmp14, _tmp17) - tl.store(in_out_ptr0 + (r2 + (4096 * x3) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp14, rmask & xmask) - tmp17 = tl.sum(_tmp17, 1)[:, None] - tmp18 = 4096.0 - tmp19 = tmp17 / tmp18 - tl.store(in_out_ptr1 + (x3 + tl.zeros([XBLOCK, 1], tl.int32)), tmp19, xmask) - - torch.manual_seed(123) - - buf14 = torch.rand(8, 64, 64, 64, device="cuda") - buf16 = torch.rand(8, 1, 64, device="cuda") - arg114_1 = torch.rand(64, device="cuda") - arg115_1 = torch.rand(64, device="cuda") - arg8_1 = torch.rand(64, device="cuda") - arg9_1 = torch.rand(64, device="cuda") - triton_[(512,)](buf14, buf16, arg114_1, arg115_1, arg8_1, arg9_1, 512, 4096, 1, 2048) - torch.testing.assert_allclose(buf16.mean().item(), buf14.mean().item(), atol=1e-7, rtol=0) - - -def test_avg_pool_bw(): - - @triton.jit - def triton_(in_ptr0, out_ptr0, XBLOCK: tl.constexpr): - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - x1 = (xindex // 8) % 8 - x0 = xindex % 8 - x2 = (xindex // 64) - x5 = xindex - tmp0 = (-1) + x1 - tmp1 = (-1) + x0 - tmp2 = 2 + x1 - tmp3 = 2 + x0 - tmp4 = 0 - tmp5 = tl.where(tmp0 != tmp0, tmp0, tl.where(tmp0 > tmp4, tmp0, tmp4)) - tmp6 = tl.where(tmp1 != tmp1, tmp1, tl.where(tmp1 > tmp4, tmp1, tmp4)) - tmp7 = 8 - tmp8 = tl.where(tmp2 != tmp2, tmp2, 
tl.where(tmp2 < tmp7, tmp2, tmp7)) - tmp9 = tl.where(tmp3 != tmp3, tmp3, tl.where(tmp3 < tmp7, tmp3, tmp7)) - tmp10 = tmp5 + tmp4 - tmp11 = tmp6 + tmp4 - tmp12 = 1 - tmp13 = tmp8 - tmp12 - tmp14 = tl.where(tmp10 != tmp10, tmp10, tl.where(tmp10 < tmp13, tmp10, tmp13)) - tmp15 = tmp9 - tmp12 - tmp16 = tl.where(tmp11 != tmp11, tmp11, tl.where(tmp11 < tmp15, tmp11, tmp15)) - tmp17 = tl.load(in_ptr0 + (tmp16 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp18 = tmp17 / 9 - tmp19 = tmp10 < tmp8 - tmp20 = tmp11 < tmp9 - tmp21 = tmp19 & tmp20 - tmp22 = 0.0 - tmp23 = tl.where(tmp21, tmp18, tmp22) - tmp24 = tmp6 + tmp12 - tmp25 = tl.where(tmp24 != tmp24, tmp24, tl.where(tmp24 < tmp15, tmp24, tmp15)) - tmp26 = tl.load(in_ptr0 + (tmp25 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp27 = tmp26 / 9 - tmp28 = tmp24 < tmp9 - tmp29 = tmp19 & tmp28 - tmp30 = tmp23 + tmp27 - tmp31 = tl.where(tmp29, tmp30, tmp23) - tmp32 = 2 - tmp33 = tmp6 + tmp32 - tmp34 = tl.where(tmp33 != tmp33, tmp33, tl.where(tmp33 < tmp15, tmp33, tmp15)) - tmp35 = tl.load(in_ptr0 + (tmp34 + (8 * tmp14) + (64 * x2)), None).to(tl.float32) - tmp36 = tmp35 / 9 - tmp37 = tmp33 < tmp9 - tmp38 = tmp19 & tmp37 - tmp39 = tmp31 + tmp36 - tmp40 = tl.where(tmp38, tmp39, tmp31) - tmp41 = tmp5 + tmp12 - tmp42 = tl.where(tmp41 != tmp41, tmp41, tl.where(tmp41 < tmp13, tmp41, tmp13)) - tmp43 = tl.load(in_ptr0 + (tmp16 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp44 = tmp43 / 9 - tmp45 = tmp41 < tmp8 - tmp46 = tmp45 & tmp20 - tmp47 = tmp40 + tmp44 - tmp48 = tl.where(tmp46, tmp47, tmp40) - tmp49 = tl.load(in_ptr0 + (tmp25 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp50 = tmp49 / 9 - tmp51 = tmp45 & tmp28 - tmp52 = tmp48 + tmp50 - tmp53 = tl.where(tmp51, tmp52, tmp48) - tmp54 = tl.load(in_ptr0 + (tmp34 + (8 * tmp42) + (64 * x2)), None).to(tl.float32) - tmp55 = tmp54 / 9 - tmp56 = tmp45 & tmp37 - tmp57 = tmp53 + tmp55 - tmp58 = tl.where(tmp56, tmp57, tmp53) - tmp59 = tmp5 + tmp32 - tmp60 = tl.where(tmp59 != tmp59, tmp59, tl.where(tmp59 < tmp13, tmp59, tmp13)) - tmp61 = tl.load(in_ptr0 + (tmp16 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp62 = tmp61 / 9 - tmp63 = tmp59 < tmp8 - tmp64 = tmp63 & tmp20 - tmp65 = tmp58 + tmp62 - tmp66 = tl.where(tmp64, tmp65, tmp58) - tmp67 = tl.load(in_ptr0 + (tmp25 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp68 = tmp67 / 9 - tmp69 = tmp63 & tmp28 - tmp70 = tmp66 + tmp68 - tmp71 = tl.where(tmp69, tmp70, tmp66) - tmp72 = tl.load(in_ptr0 + (tmp34 + (8 * tmp60) + (64 * x2)), None).to(tl.float32) - tmp73 = tmp72 / 9 - tmp74 = tmp63 & tmp37 - tmp75 = tmp71 + tmp73 - tmp76 = tl.where(tmp74, tmp75, tmp71) - tl.store(out_ptr0 + (x5 + tl.zeros([XBLOCK], tl.int32)), tmp76, None) - - inp = torch.ones(8, 2048, 8, 8, device="cuda", dtype=torch.half) - out = torch.ones_like(inp) * 3 - numel = inp.numel() - triton_[(numel // 1024,)](inp, out, 1024) - out_ref = torch.ones_like(inp) - out_ref[:, :, 1:7, 0::7] = 2 / 3 - out_ref[:, :, 0::7, 1:7] = 2 / 3 - out_ref[:, :, 0::7, 0::7] = 4 / 9 - torch.testing.assert_allclose(out, out_ref) diff --git a/python/test/unit/operators/test_matmul.py b/python/test/unit/operators/test_matmul.py deleted file mode 100644 index ec46445ae835..000000000000 --- a/python/test/unit/operators/test_matmul.py +++ /dev/null @@ -1,102 +0,0 @@ -import itertools - -import pytest -import torch - -import triton -import triton.ops - - -@pytest.mark.parametrize( - "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE", - itertools.chain( - *[ - [ - # 1 warp - 
(16, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (32, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 32, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (32, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 32, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (64, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - (16, 64, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), - # 2 warp - (64, 32, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 64, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (64, 32, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 64, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), - # 4 warp - (128, 64, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 128, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (128, 32, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), - (32, 128, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), - # 8 warp - (128, 256, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), - (256, 128, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), - (256, 128, 32, 1, 8, 2, None, None, None, AT, BT, DTYPE), - # split-k - (64, 64, 16, 2, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 64, 16, 4, 4, 2, None, None, None, AT, BT, DTYPE), - (64, 64, 16, 8, 4, 2, None, None, None, AT, BT, DTYPE), - # variable input - (128, 128, 32, 1, 4, 2, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 384, 128, 640, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 107, 233, 256, AT, BT, DTYPE), - (128, 128, 32, 1, 4, 2, 107, 233, 311, AT, BT, DTYPE), - ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] - ], - # n-stage - *[ - [ - (16, 16, 16, 1, 1, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (64, 32, 64, 1, 2, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 64, 16, 1, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (256, 128, 32, 1, 8, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (128, 128, 32, 1, 4, STAGES, 384, 128, 640, AT, BT, DTYPE), - # split-k - (64, 64, 16, 8, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), - (64, 64, 16, 8, 4, STAGES, 1024, 1024, 32, AT, BT, DTYPE), - ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] for STAGES in [2, 3, 4] - ] - ), -) -def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE): - capability = torch.cuda.get_device_capability() - if capability[0] < 7: - pytest.skip("Only test tl.dot() on devices with sm >= 70") - if capability[0] < 8 and DTYPE == "bfloat16": - pytest.skip("Only test bfloat16 on devices with sm >= 80") - if DTYPE == "bfloat16" and SPLIT_K != 1: - pytest.skip("bfloat16 matmuls don't allow split_k for now") - torch.manual_seed(0) - # nuke kernel decorators -- will set meta-parameters manually - kwargs = {'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K} - pre_hook = None if SPLIT_K == 1 else lambda nargs: nargs['C'].zero_() - configs = [triton.Config(kwargs=kwargs, num_warps=NWARP, num_stages=NSTAGE, pre_hook=pre_hook)] - kernel = triton.ops._matmul.kernel - kernel.configs = configs - # kernel.run = kernel.run.run.run - - # get matrix shape - M = BLOCK_M if M is None else M - N = BLOCK_N if N is 
None else N - K = BLOCK_K * SPLIT_K if K is None else K - # allocate/transpose inputs - DTYPE = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[DTYPE] - a = .1 * torch.randn((K, M) if AT else (M, K), device="cuda", dtype=DTYPE) - b = .1 * torch.randn((N, K) if BT else (K, N), device="cuda", dtype=DTYPE) - a = a.t() if AT else a - b = b.t() if BT else b - # run test - th_c = torch.matmul(a, b) - try: - tt_c = triton.ops.matmul(a, b) - torch.testing.assert_allclose(th_c, tt_c, atol=1e-2, rtol=0) - except triton.OutOfResources as e: - pytest.skip(str(e)) diff --git a/python/test/unit/runtime/test_autotuner.py b/python/test/unit/runtime/test_autotuner.py deleted file mode 100644 index c425a36697f6..000000000000 --- a/python/test/unit/runtime/test_autotuner.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def test_kwargs(): - N = 1024 - src = torch.empty(N, device='cuda') - dst = torch.empty(N, device='cuda') - - configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})] - - @triton.autotune(configs=configs, key=['N']) - @triton.jit - def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr): - offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - x = tl.load(src + offsets, mask=offsets < N) - tl.store(dst + offsets, x, mask=offsets < N) - grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']),) - _kernel[grid](dst, src, N) - _kernel[grid](dst=dst, src=src, N=N) diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py deleted file mode 100644 index e13921079992..000000000000 --- a/python/test/unit/runtime/test_cache.py +++ /dev/null @@ -1,208 +0,0 @@ -import os -import shutil - -import pytest -import torch - -import triton -import triton.language as tl -from triton.runtime.jit import JITFunction - -tmpdir = ".tmp" - - -@triton.jit -def function_1(i): - i = i + 1 - i = function_2(i) - return i - - -@triton.jit -def function_2(i): - i = i + 1 - return i - - -@triton.jit -def kernel(X, i, BLOCK: tl.constexpr): - i = i + 1 - i = function_1(i) - tl.store(X, i) - - -@triton.jit(do_not_specialize=["i"]) -def kernel_nospec(X, i, BLOCK: tl.constexpr): - i = i + 1 - i = function_1(i) - tl.store(X, i) - - -def apply_src_change(target, old, new): - kernel.hash = None - function_1.hash = None - function_2.hash = None - function_1.src = function_1.src.replace(old, new) - target.src = target.src.replace(old, new) - ret = target.cache_key - target.src = target.src.replace(new, old) - return ret - - -def test_nochange(): - baseline = kernel.cache_key - updated = apply_src_change(kernel, 'i + 1', 'i + 1') - assert baseline == updated - - -def test_toplevel_change(): - baseline = kernel.cache_key - updated = apply_src_change(kernel, 'i + 1', 'i + 2') - assert baseline != updated - - -def test_nested1_change(): - baseline = kernel.cache_key - updated = apply_src_change(function_1, 'i + 1', 'i + 2') - assert baseline != updated - - -def reset_tmp_dir(): - os.environ["TRITON_CACHE_DIR"] = tmpdir - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - - -def test_reuse(): - counter = 0 - - def inc_counter(*args, **kwargs): - nonlocal counter - counter += 1 - JITFunction.cache_hook = inc_counter - reset_tmp_dir() - x = torch.empty(1, dtype=torch.int32, device='cuda') - for i in range(10): - kernel[(1,)](x, 1, BLOCK=1024) - assert counter == 1 - - -@pytest.mark.parametrize('mode', ['enable', 'disable']) -def test_specialize(mode): - counter = 0 - - def 
inc_counter(*args, **kwargs): - nonlocal counter - counter += 1 - JITFunction.cache_hook = inc_counter - reset_tmp_dir() - x = torch.empty(1, dtype=torch.int32, device='cuda') - function = {'enable': kernel, 'disable': kernel_nospec}[mode] - target = {'enable': 3, 'disable': 1}[mode] - for i in [1, 2, 4, 8, 16, 32]: - function[(1,)](x, i, BLOCK=512) - assert counter == target - - -def test_constexpr_not_callable() -> None: - @triton.jit - def kernel(X, c: tl.constexpr): - tl.store(X, 2) - - x = torch.empty(1, dtype=torch.int32, device='cuda') - error = False - try: - kernel[(1, )](x, c="str") - except BaseException: - error = True - assert error is False - # try and catch - try: - kernel[(1, )](x, c=tl.abs) - except BaseException: - error = True - assert error is True - - -def test_jit_warmup_cache() -> None: - @triton.jit - def kernel_add(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, - tl.load(a + idx) + tl.load(b + idx)) - - args = [ - torch.randn(32, dtype=torch.float32, device="cuda"), - torch.randn(32, dtype=torch.float32, device="cuda"), - torch.randn(32, dtype=torch.float32, device="cuda"), - 32, - ] - assert len(kernel_add.cache) == 0 - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache) == 1 - kernel_add.warmup(*args, grid=(1,)) - assert len(kernel_add.cache) == 1 - kernel_add.warmup(*args, grid=(1,)) - assert len(kernel_add.cache) == 1 - - -def test_jit_debug() -> None: - @triton.jit - def kernel_add(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.device_assert(idx < 32, "idx < 32") - tl.store(o + idx, - tl.load(a + idx) + tl.load(b + idx)) - - device = torch.cuda.current_device() - assert len(kernel_add.cache[device]) == 0 - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 1 - kernel_add.debug = False - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 2 - kernel_add.debug = True - kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add.cache[device]) == 3 - bins = list(kernel_add.cache[device].values()) - assert bins[2].asm['ttir'] != bins[1].asm['ttir'] - - -@triton.jit -def add_fn(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx)) - - -def test_jit_noinline() -> None: - @triton.jit - def kernel_add_device(a, b, o, N: tl.constexpr): - add_fn(a, b, o, N) - - device = torch.cuda.current_device() - assert len(kernel_add_device.cache[device]) == 0 - kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add_device.cache[device]) == 1 - bins = list(kernel_add_device.cache[device].values()) - inline_ttir = bins[0].asm['ttir'] - add_fn.noinline = True - add_fn.hash = None - kernel_add_device.hash = None - kernel_add_device.cache[device].clear() - kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) - assert len(kernel_add_device.cache[device]) == 1 - bins = list(kernel_add_device.cache[device].values()) - noinline_ttir = bins[0].asm['ttir'] - assert inline_ttir != noinline_ttir - - -def test_memory_leak() -> None: - @triton.jit - def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr): - xnumel = 10 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (x0), xmask) - 
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask) diff --git a/python/test/unit/runtime/test_driver.py b/python/test/unit/runtime/test_driver.py deleted file mode 100644 index b63927d89bfa..000000000000 --- a/python/test/unit/runtime/test_driver.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys - -import triton - - -def test_is_lazy(): - from importlib import reload - reload(sys.modules["triton.runtime.driver"]) - reload(sys.modules["triton.runtime"]) - mod = sys.modules[triton.runtime.driver.__module__] - assert isinstance(triton.runtime.driver, getattr(mod, "LazyProxy")) - assert triton.runtime.driver._obj is None - utils = triton.runtime.driver.utils # noqa: F841 - assert issubclass(triton.runtime.driver._obj.__class__, getattr(mod, "DriverBase")) diff --git a/python/test/unit/runtime/test_launch.py b/python/test/unit/runtime/test_launch.py deleted file mode 100644 index 41c5431027cf..000000000000 --- a/python/test/unit/runtime/test_launch.py +++ /dev/null @@ -1,106 +0,0 @@ -import gc -# import importlib -# import os -# import sys -# import tempfile -# import textwrap -# import time -import tracemalloc - -import torch - -import triton -import triton.language as tl - -# from typing import Tuple - - -def test_memory_leak() -> None: - - @triton.jit - def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr): - xnumel = 10 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (x0), xmask) - tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask) - - tracemalloc.start() - try: - inp = torch.randn(10, device='cuda') - out = torch.randn(10, device='cuda') - kernel[(10,)](inp, out, 10, XBLOCK=16) - gc.collect() - begin, _ = tracemalloc.get_traced_memory() - for _ in range(100): - kernel[(10,)](inp, out, 10, XBLOCK=16) - gc.collect() - end, _ = tracemalloc.get_traced_memory() - assert end - begin < 1000 - finally: - tracemalloc.stop() - - -# LATENCY_THRESHOLD_US = 46 - -# def test_kernel_launch_latency() -> None: -# def define_kernel(kernel_name: str, num_tensor_args: int) -> str: -# arg_str = ",".join([f"arg{i}: torch.Tensor" for i in range(num_tensor_args)]) -# arg_str += ", n_elements: int, BLOCK_SIZE: tl.constexpr" -# func_str = f""" -# import torch - -# import triton -# import triton.language as tl - -# @triton.jit -# def {kernel_name}({arg_str}): -# pass -# """ -# with tempfile.NamedTemporaryFile(mode="w+t", suffix=".py", delete=False) as temp_file: -# temp_file.write(textwrap.dedent(func_str)) -# temp_file_path = temp_file.name - -# return temp_file_path - -# def import_kernel(file_path, kernel_name): -# directory, filename = os.path.split(file_path) -# module_name, _ = os.path.splitext(filename) -# sys.path.insert(0, directory) - -# module = importlib.import_module(module_name) -# kernel = getattr(module, kernel_name) -# return kernel - -# def empty(*kernel_args: Tuple[torch.Tensor]): -# first_arg = kernel_args[0] -# n_elements = first_arg.numel() -# grid = (triton.cdiv(n_elements, 1024),) -# device = torch.cuda.current_device() -# # Warmup -# empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device) -# torch.cuda.synchronize() -# # Measure launch overhead at steady state -# num_runs = 1000 -# start_time = time.time() -# for i in range(num_runs): -# empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device) -# end_time = time.time() -# latency_us = (end_time - start_time) / num_runs * 1e6 - -# assert latency_us < 
LATENCY_THRESHOLD_US, "Kernel launch time has increased!" - -# num_tensor_args = 40 -# kernel_name = 'empty_kernel' -# file_path = define_kernel(kernel_name, num_tensor_args) -# empty_kernel = import_kernel(file_path, kernel_name) - -# # Initialize random tensors for the empty_kernel -# torch.manual_seed(0) -# size = 1024 -# kernel_args = (torch.rand(size, device='cuda') for i in range(num_tensor_args)) - -# # Run empty, which would run empty_kernel internally -# empty(*kernel_args) diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py deleted file mode 100644 index 0e0d33c6fd21..000000000000 --- a/python/test/unit/runtime/test_subproc.py +++ /dev/null @@ -1,83 +0,0 @@ -import multiprocessing -import os -import shutil -from collections import namedtuple - -import torch - -import triton -import triton.language as tl - -tmpdir = ".tmp" - - -def reset_tmp_dir(): - os.environ["TRITON_CACHE_DIR"] = tmpdir - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - - -instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"]) - - -def compile_fn(config, cc): - @triton.jit - def kernel_sub(a, b, o, N: tl.constexpr): - idx = tl.arange(0, N) - tl.store(o + idx, tl.load(a + idx) - tl.load(b + idx) * 777) - triton.compile( - fn=kernel_sub, - signature={0: "*fp32", 1: "*fp32", 2: "*fp32"}, - device=0, - constants={3: 32}, - configs=[config], - warm_cache_only=True, - cc=cc, - ) - - -def test_compile_in_subproc() -> None: - major, minor = torch.cuda.get_device_capability(0) - cc = major * 10 + minor - config = instance_descriptor(tuple(range(4)), ()) - - multiprocessing.set_start_method('fork') - proc = multiprocessing.Process( - target=compile_fn, - args=(config, cc)) - proc.start() - proc.join() - assert proc.exitcode == 0 - - -def compile_fn_dot(config, cc): - @triton.jit - def kernel_dot(Z): - offs = tl.arange(0, 16)[:, None] * 16 + tl.arange(0, 16)[None, :] - z = tl.load(Z + offs) - z = tl.dot(z, z) - tl.store(Z + offs, z) - - triton.compile( - fn=kernel_dot, - signature={0: "*fp32"}, - device=0, - configs=[config], - warm_cache_only=True, - cc=cc, - ) - - -def test_compile_in_forked_subproc() -> None: - reset_tmp_dir() - major, minor = torch.cuda.get_device_capability(0) - cc = major * 10 + minor - config = instance_descriptor(tuple(range(1)), ()) - - assert multiprocessing.get_start_method() == 'fork' - proc = multiprocessing.Process( - target=compile_fn_dot, - args=(config, cc)) - proc.start() - proc.join() - assert proc.exitcode == 0 diff --git a/python/triton/_C/include b/python/triton/_C/include deleted file mode 120000 index b85a409837d1..000000000000 --- a/python/triton/_C/include +++ /dev/null @@ -1 +0,0 @@ -../../../include/ \ No newline at end of file diff --git a/python/triton/__init__.py b/python/triton/__init__.py deleted file mode 100644 index 14c9d61bdcb7..000000000000 --- a/python/triton/__init__.py +++ /dev/null @@ -1,68 +0,0 @@ -"""isort:skip_file""" -__version__ = '2.1.0' - -# --------------------------------------- -# Note: import order is significant here. - -# submodules -from .runtime import ( - autotune, - Config, - heuristics, - JITFunction, - KernelInterface, - reinterpret, - TensorWrapper, - OutOfResources, - MockTensor, -) -from .runtime.jit import jit -from .compiler import compile, CompilationError -from .debugger.debugger import program_ids_from_grid - -from . import language -from . 
import testing - -__all__ = [ - "autotune", - "cdiv", - "CompilationError", - "compile", - "Config", - "heuristics", - "impl", - "jit", - "JITFunction", - "KernelInterface", - "language", - "MockTensor", - "next_power_of_2", - "ops", - "OutOfResources", - "reinterpret", - "runtime", - "TensorWrapper", - "testing", - "program_ids_from_grid", -] - - -# ------------------------------------- -# misc. utilities that don't fit well -# into any specific module -# ------------------------------------- - -def cdiv(x, y): - return (x + y - 1) // y - - -def next_power_of_2(n): - """Return the smallest power of 2 greater than or equal to n""" - n -= 1 - n |= n >> 1 - n |= n >> 2 - n |= n >> 4 - n |= n >> 8 - n |= n >> 16 - n += 1 - return n diff --git a/python/triton/common/__init__.py b/python/triton/common/__init__.py deleted file mode 100644 index cc4d1e10cb49..000000000000 --- a/python/triton/common/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .build import _build - -__all__ = ["_build"] diff --git a/python/triton/common/build.py b/python/triton/common/build.py deleted file mode 100644 index 1bf019ce6d53..000000000000 --- a/python/triton/common/build.py +++ /dev/null @@ -1,116 +0,0 @@ -import contextlib -import functools -import io -import os -import shutil -import subprocess -import sys -import sysconfig - -import setuptools - - -# TODO: is_hip shouldn't be here -def is_hip(): - import torch - return torch.version.hip is not None - - -@functools.lru_cache() -def libcuda_dirs(): - locs = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[1:] - return [os.path.dirname(loc) for loc in locs] - - -@functools.lru_cache() -def rocm_path_dir(): - return os.getenv("ROCM_PATH", default="/opt/rocm") - - -@contextlib.contextmanager -def quiet(): - old_stdout, old_stderr = sys.stdout, sys.stderr - sys.stdout, sys.stderr = io.StringIO(), io.StringIO() - try: - yield - finally: - sys.stdout, sys.stderr = old_stdout, old_stderr - - -def _build(name, src, srcdir): - if is_hip(): - hip_lib_dir = os.path.join(rocm_path_dir(), "lib") - hip_include_dir = os.path.join(rocm_path_dir(), "include") - else: - cuda_lib_dirs = libcuda_dirs() - base_dir = os.path.join(os.path.dirname(__file__), os.path.pardir) - cuda_path = os.path.join(base_dir, "third_party", "cuda") - - cu_include_dir = os.path.join(cuda_path, "include") - triton_include_dir = os.path.join(os.path.dirname(__file__), "include") - cuda_header = os.path.join(cu_include_dir, "cuda.h") - triton_cuda_header = os.path.join(triton_include_dir, "cuda.h") - if not os.path.exists(cuda_header) and os.path.exists(triton_cuda_header): - cu_include_dir = triton_include_dir - suffix = sysconfig.get_config_var('EXT_SUFFIX') - so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix)) - # try to avoid setuptools if possible - cc = os.environ.get("CC") - if cc is None: - # TODO: support more things here. - clang = shutil.which("clang") - gcc = shutil.which("gcc") - cc = gcc if gcc is not None else clang - if cc is None: - raise RuntimeError("Failed to find C compiler. Please specify via CC environment variable.") - # This function was renamed and made public in Python 3.10 - if hasattr(sysconfig, 'get_default_scheme'): - scheme = sysconfig.get_default_scheme() - else: - scheme = sysconfig._get_default_scheme() - # 'posix_local' is a custom scheme on Debian. However, starting Python 3.10, the default install - # path changes to include 'local'. This change is required to use triton with system-wide python. 
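As an aside, the Debian scheme workaround described in the comment above can be condensed into a standalone helper. The sketch below is illustrative only (the name `python_include_dir` is invented here and is not part of the patch); it uses the same public/private `sysconfig` calls that the surrounding `_build` helper relies on, so the include-path resolution can be checked outside of an extension build:

    import sysconfig

    def python_include_dir() -> str:
        # Illustrative helper, not from the deleted file.
        # get_default_scheme() is public only from Python 3.10; earlier
        # interpreters expose the private _get_default_scheme() instead.
        if hasattr(sysconfig, "get_default_scheme"):
            scheme = sysconfig.get_default_scheme()
        else:
            scheme = sysconfig._get_default_scheme()
        # Debian's patched Python defaults to 'posix_local'; fall back to the
        # stock 'posix_prefix' scheme so the CPython headers are found.
        if scheme == "posix_local":
            scheme = "posix_prefix"
        return sysconfig.get_paths(scheme=scheme)["include"]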
- if scheme == 'posix_local': - scheme = 'posix_prefix' - py_include_dir = sysconfig.get_paths(scheme=scheme)["include"] - - if is_hip(): - ret = subprocess.check_call([cc, src, f"-I{hip_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", f"-L{hip_lib_dir}", "-lamdhip64", "-o", so]) - else: - cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so] - cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs] - ret = subprocess.check_call(cc_cmd) - - if ret == 0: - return so - # fallback on setuptools - extra_compile_args = [] - library_dirs = cuda_lib_dirs - include_dirs = [srcdir, cu_include_dir] - libraries = ['cuda'] - # extra arguments - extra_link_args = [] - # create extension module - ext = setuptools.Extension( - name=name, - language='c', - sources=[src], - include_dirs=include_dirs, - extra_compile_args=extra_compile_args + ['-O3'], - extra_link_args=extra_link_args, - library_dirs=library_dirs, - libraries=libraries, - ) - # build extension module - args = ['build_ext'] - args.append('--build-temp=' + srcdir) - args.append('--build-lib=' + srcdir) - args.append('-q') - args = dict( - name=name, - ext_modules=[ext], - script_args=args, - ) - with quiet(): - setuptools.setup(**args) - return so diff --git a/python/triton/compiler/__init__.py b/python/triton/compiler/__init__.py deleted file mode 100644 index 4d62eeec4ab1..000000000000 --- a/python/triton/compiler/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .compiler import CompiledKernel, compile -from .errors import CompilationError - -__all__ = ["compile", "CompiledKernel", "CompilationError"] diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py deleted file mode 100644 index eb57de82ee96..000000000000 --- a/python/triton/compiler/code_generator.py +++ /dev/null @@ -1,1086 +0,0 @@ -import ast -import inspect -import re -import sys -import warnings -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union - -from .. 
import language -from ..language import constexpr, tensor -# ideally we wouldn't need any runtime component -from ..runtime import JITFunction -from .errors import (CompilationError, CompileTimeAssertionFailure, - UnsupportedLanguageConstruct) -from triton._C.libtriton.triton import ir - - -def mangle_ty(ty): - if ty.is_ptr(): - return 'P' + mangle_ty(ty.element_ty) - if ty.is_int(): - SIGNED = language.dtype.SIGNEDNESS.SIGNED - prefix = 'i' if ty.int_signedness == SIGNED else 'u' - return prefix + str(ty.int_bitwidth) - if ty.is_fp8(): - return 'fp8' - if ty.is_fp16(): - return 'fp16' - if ty.is_bf16(): - return 'bf16' - if ty.is_fp32(): - return 'fp32' - if ty.is_fp64(): - return 'fp64' - if ty.is_block(): - elt = mangle_ty(ty.scalar) - shape = '_'.join(map(str, ty.shape)) - return f'{elt}S{shape}S' - if ty.is_void(): - return 'V' - assert False, "Unsupported type" - - -def mangle_fn(name, arg_tys, constants): - # doesn't mangle ret type, which must be a function of arg tys - mangled_arg_names = '_'.join([mangle_ty(ty) for ty in arg_tys]) - mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)]) - mangled_constants = mangled_constants.replace('.', '_d_') - mangled_constants = mangled_constants.replace("'", '_sq_') - # [ and ] are not allowed in LLVM identifiers - mangled_constants = mangled_constants.replace('[', '_').replace(']', '_') - ret = f'{name}__{mangled_arg_names}__{mangled_constants}' - return ret - - -def _is_triton_tensor(o: Any) -> bool: - return isinstance(o, tensor) - - -def _is_constexpr(o: Any) -> bool: - return isinstance(o, constexpr) - - -def _is_triton_scalar(o: Any) -> bool: - return _is_triton_tensor(o) and (not o.type.is_block() or o.type.numel == 1) - - -def _unwrap_if_constexpr(o: Any): - return o.value if isinstance(o, constexpr) else o - - -def _check_fn_args(node, fn, args): - if fn.noinline: - for idx, arg in enumerate(args): - if not _is_constexpr(arg) and not _is_triton_scalar(arg): - raise UnsupportedLanguageConstruct(fn.src, node, f'Function {fn.__name__} is marked noinline, but was called with non-scalar argument {fn.arg_names[idx]}:{arg}') - - -_condition_types = {bool, int, type(None)} # Python types accepted for conditionals inside kernels - - -class enter_sub_region: - def __init__(self, generator): - self.generator = generator - - def __enter__(self): - # record lscope & local_defs in the parent scope - self.liveins = self.generator.lscope.copy() - self.prev_defs = self.generator.local_defs.copy() - self.generator.local_defs = {} - self.insert_block = self.generator.builder.get_insertion_block() - self.insert_point = self.generator.builder.get_insertion_point() - return self.liveins, self.insert_block - - def __exit__(self, *args, **kwargs): - self.generator.builder.restore_insertion_point(self.insert_point) - self.generator.lscope = self.liveins - self.generator.local_defs = self.prev_defs - - -# Check if the given syntax node has an "early" return -class ContainsReturnChecker(ast.NodeVisitor): - def __init__(self, gscope): - self.gscope = gscope - - def _visit_stmts(self, body) -> bool: - for s in body: - if self.visit(s): - return True - return False - - def _visit_function(self, fn) -> bool: - # Currently we only support JITFunctions defined in the global scope - if isinstance(fn, JITFunction) and not fn.noinline: - fn_node = fn.parse() - return ContainsReturnChecker(self.gscope).visit(fn_node) - return False - - def generic_visit(self, node) -> bool: - ret = False - for _, value in ast.iter_fields(node): - if 
isinstance(value, list): - for item in value: - if isinstance(item, ast.AST): - ret = ret or self.visit(item) - elif isinstance(value, ast.AST): - ret = ret or self.visit(value) - return ret - - def visit_Attribute(self, node: ast.Attribute) -> bool: - # If the left part is a name, it's possible that - # we call triton native function or a jit function from another module. - # If the left part is not a name, it must return a tensor or a constexpr - # whose methods do not contain return statements - # e.g., (tl.load(x)).to(y) - # So we only check if the expressions within value have return or not - if isinstance(node.value, ast.Name): - if node.value.id in self.gscope: - value = self.gscope[node.value.id] - fn = getattr(value, node.attr) - return self._visit_function(fn) - return False - return self.visit(node.value) - - def visit_Name(self, node: ast.Name) -> bool: - if type(node.ctx) == ast.Store: - return False - if node.id in self.gscope: - fn = self.gscope[node.id] - return self._visit_function(fn) - return False - - def visit_Return(self, node: ast.Return) -> bool: - return True - - def visit_Assign(self, node: ast.Assign) -> bool: - # There couldn't be an early return - # x = ... - return False - - def visit_AugAssign(self, node: ast.AugAssign) -> bool: - # There couldn't be an early return - # x += ... - return False - - def visit_Module(self, node: ast.Module) -> bool: - return self._visit_stmts(node.body) - - def visit_FunctionDef(self, node: ast.FunctionDef) -> bool: - return self._visit_stmts(node.body) - - def visit_If(self, node: ast.If) -> bool: - # TODO: optimize the following case in which we actually don't have - # a return when static_cond is false: - # if dynamic_cond - # if static_cond - # func_with_return - # else - # func_without_return - ret = self._visit_stmts(node.body) - if node.orelse: - ret = ret or self._visit_stmts(node.orelse) - return ret - - def visit_IfExp(self, node: ast.IfExp) -> bool: - return self.visit(node.body) or self.visit(node.orelse) - - def visit_Call(self, node: ast.Call) -> bool: - return self.visit(node.func) - - -class CodeGenerator(ast.NodeVisitor): - def __init__(self, context, prototype, gscope, attributes, constants, function_name, - module=None, is_kernel=False, function_types: Optional[Dict] = None, - debug=False, noinline=False): - self.builder = ir.builder(context) - self.module = self.builder.create_module() if module is None else module - self.function_ret_types = {} if function_types is None else function_types - self.prototype = prototype - self.gscope = gscope - self.lscope = dict() - self.attributes = attributes - self.constants = constants - self.function_name = function_name - self.is_kernel = is_kernel - self.last_node = None - self.debug = debug - self.noinline = noinline - self.scf_stack = [] - self.last_ret_type = None - # SSA-construction - # name => language.tensor - self.local_defs: Dict[str, tensor] = {} - self.global_uses: Dict[str, tensor] = {} - self.dereference_name: Callable[[str], Any] = self._define_name_lookup() - - builtin_namespace: Dict[str, Any] = {_.__name__: _ for _ in (range, float, int, isinstance, getattr)} - builtin_namespace.update(( - ('print', language.core.device_print), - ('min', language.minimum), - )) - - def _define_name_lookup(self): - def local_lookup(name: str, absent): - value = self.lscope.get(name, absent) # this needs to be re-fetched from `self` every time, because it gets switched occasionally - if value is not absent and name not in self.local_defs: - self.global_uses[name] = value 
- return value - - absent_marker = object() - - def name_lookup(name: str) -> Any: - absent = absent_marker - for lookup_function in local_lookup, self.gscope.get, self.builtin_namespace.get: - value = lookup_function(name, absent) - if value is not absent: - return value - raise NameError(f'{name} is not defined') - - return name_lookup - - def set_value(self, name: str, - value: Union[tensor, constexpr]) -> None: - ''' This function: - called by visit_Assign() & visit_FunctionDef() to store left value (lvalue) - 1. record local defined name (FIXME: should consider control flow) - 2. store tensor in self.lvalue - ''' - self.lscope[name] = value - self.local_defs[name] = value - - # - # AST visitor - # - def visit_compound_statement(self, stmts): - for stmt in stmts: - ret_type = self.visit(stmt) - if ret_type is not None and isinstance(stmt, ast.Return): - self.last_ret_type = ret_type - - def visit_Module(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_List(self, node): - ctx = self.visit(node.ctx) - assert ctx is None - elts = [self.visit(elt) for elt in node.elts] - return elts - - # By design, only non-kernel functions can return - def visit_Return(self, node): - ret_value = self.visit(node.value) - # ret_block = self.builder.create_block() - # post_ret_block = self.builder.create_block() - # self.builder.create_branch(ret_block) - # self.builder.set_insertion_point_to_end(ret_block) - if ret_value is None: - self.builder.ret([]) - ret_ty = None - elif isinstance(ret_value, tuple): - ret_values = [language.core._to_tensor(v, self.builder) for v in ret_value] - ret_types = [v.type for v in ret_values] - self.builder.ret([v.handle for v in ret_values]) - ret_ty = tuple(ret_types) - else: - ret = language.core._to_tensor(ret_value, self.builder) - self.builder.ret([ret.handle]) - ret_ty = ret.type - # self.builder.create_branch(post_ret_block) - # self.builder.set_insertion_point_to_end(post_ret_block) - return ret_ty - - def visit_FunctionDef(self, node): - arg_names, kwarg_names = self.visit(node.args) - # initialize defaults - for i, default_value in enumerate(node.args.defaults): - arg_node = node.args.args[-i - 1] - annotation = arg_node.annotation - name = arg_node.arg - st_target = ast.Name(id=name, ctx=ast.Store()) - if annotation is None: - init_node = ast.Assign(targets=[st_target], value=default_value) - else: - init_node = ast.AnnAssign(target=st_target, value=default_value, annotation=annotation) - self.visit(init_node) - # initialize function - visibility = "public" if self.is_kernel else "private" - fn = self.builder.get_or_insert_function(self.module, self.function_name, self.prototype.to_ir(self.builder), visibility, self.noinline) - self.module.push_back(fn) - entry = fn.add_entry_block() - arg_values = [] - idx = 0 - for i, arg_name in enumerate(arg_names): - if i in self.constants: - cst = self.constants[i] - if not _is_constexpr(cst): - cst = constexpr(self.constants[i]) - arg_values.append(cst) - continue - else: - if i in self.attributes: - fn.set_arg_attr(idx, "tt.divisibility", self.attributes[i][1]) - arg_values.append(tensor(fn.args(idx), self.prototype.param_types[idx])) - idx += 1 - - insert_pt = self.builder.get_insertion_block() - for arg_name, arg_value in zip(arg_names, arg_values): - self.set_value(arg_name, arg_value) - self.builder.set_insertion_point_to_start(entry) - # visit function body - self.visit_compound_statement(node.body) - # finalize function - if self.last_ret_type is None: - self.builder.ret([]) - else: - # update 
return type - if isinstance(self.last_ret_type, tuple): - self.prototype.ret_types = list(self.last_ret_type) - fn.reset_type(self.prototype.to_ir(self.builder)) - else: - self.prototype.ret_types = [self.last_ret_type] - fn.reset_type(self.prototype.to_ir(self.builder)) - if insert_pt: - self.builder.set_insertion_point_to_end(insert_pt) - # Remove dead code - fn.finalize() - - def visit_arguments(self, node): - arg_names = [] - for arg in node.args: - arg_names += [self.visit(arg)] - kwarg_names = self.visit(node.kwarg) - return arg_names, kwarg_names - - def visit_arg(self, node): - ast.NodeVisitor.generic_visit(self, node) - return node.arg - - def visit_AnnAssign(self, node): - # extract attributes - annotation = self.visit(node.annotation) - target = self.visit(node.target) - value = self.visit(node.value) - # constexpr - if annotation == constexpr: - if target in self.lscope: - raise ValueError(f'{target} is already defined.' - f' constexpr cannot be reassigned.') - if not _is_constexpr(value): - value = constexpr(value) - self.lscope[target] = value - return self.lscope[target] - # default: call visit_Assign - return self.visit_Assign(node) - - def visit_Assign(self, node): - _names = [] - for target in node.targets: - _names += [self.visit(target)] - if len(_names) > 1: - raise UnsupportedLanguageConstruct(None, node, "simultaneous multiple assignment is not supported.") - names = _names[0] - values = self.visit(node.value) - if not isinstance(names, tuple): - names = [names] - if not isinstance(values, tuple): - values = [values] - native_nontensor_types = (language.dtype, ) - for name, value in zip(names, values): - # by default, constexpr are assigned into python variable - value = _unwrap_if_constexpr(value) - if not _is_triton_tensor(value) and \ - not isinstance(value, native_nontensor_types): - value = language.core._to_tensor(value, self.builder) - self.set_value(name, value) - - def visit_AugAssign(self, node): - name = node.target.id - lhs = ast.Name(id=name, ctx=ast.Load()) - rhs = ast.BinOp(lhs, node.op, node.value) - assign = ast.Assign(targets=[node.target], value=rhs) - self.visit(assign) - return self.dereference_name(name) - - def visit_Name(self, node): - if type(node.ctx) == ast.Store: - return node.id - return self.dereference_name(node.id) - - def visit_Store(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_Load(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_Tuple(self, node): - args = [self.visit(x) for x in node.elts] - return tuple(args) - - def _apply_binary_method(self, method_name, lhs, rhs): - # TODO: raise something meaningful if getattr fails below, esp for reverse method - if _is_triton_tensor(lhs): - return getattr(lhs, method_name)(rhs, _builder=self.builder) - if _is_triton_tensor(rhs): - reverse_method_name = re.sub(r"__(.*)__", r"__r\1__", method_name) - return getattr(rhs, reverse_method_name)(lhs, _builder=self.builder) - return getattr(lhs, method_name)(rhs) - - def visit_BinOp(self, node): - lhs = self.visit(node.left) - rhs = self.visit(node.right) - method_name = self._method_name_for_bin_op.get(type(node.op)) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST binary operator '{}' is not (currently) implemented.".format(node.op.__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_bin_op: Dict[Type[ast.operator], str] = { - ast.Add: '__add__', ast.Sub: '__sub__', ast.Mult: '__mul__', ast.Div: '__truediv__', - ast.FloorDiv: 
'__floordiv__', ast.Mod: '__mod__', ast.Pow: '__pow__', - ast.LShift: '__lshift__', ast.RShift: '__rshift__', ast.BitAnd: '__and__', ast.BitOr: '__or__', ast.BitXor: '__xor__', - } - - def visit_then_else_blocks(self, node, liveins, then_block, else_block): - # then block - self.builder.set_insertion_point_to_start(then_block) - self.visit_compound_statement(node.body) - then_block = self.builder.get_insertion_block() - then_defs = self.local_defs.copy() - # else block - else_defs = {} - if node.orelse: - self.builder.set_insertion_point_to_start(else_block) - self.lscope = liveins.copy() - self.local_defs = {} - self.visit_compound_statement(node.orelse) - else_defs = self.local_defs.copy() - else_block = self.builder.get_insertion_block() - - # update block arguments - names = [] - ret_types = [] - ir_ret_types = [] - # variables in livein whose value is updated in `if` - for name in liveins: - # check type - for defs, block_name in [(then_defs, 'then'), (else_defs, 'else')]: - if name in defs: - assert defs[name].type == liveins[name].type,\ - f'initial value for `{name}` is of type {liveins[name].type}, '\ - f'but the {block_name} block redefines it as {defs[name].type}' - if name in then_defs or name in else_defs: - names.append(name) - ret_types.append(then_defs[name].type if name in then_defs else else_defs[name].type) - ir_ret_types.append(then_defs[name].handle.get_type() if name in then_defs else else_defs[name].handle.get_type()) - # variable defined in then but not in else - if name in then_defs and name not in else_defs: - else_defs[name] = liveins[name] - # variable defined in else but not in then - if name in else_defs and name not in then_defs: - then_defs[name] = liveins[name] - # variables that are both in then and else but not in liveins - # TODO: could probably be cleaned up - for name in then_defs.keys() & else_defs.keys(): - if name in names: - continue - then_ty = then_defs[name].type - else_ty = else_defs[name].type - assert then_ty == else_ty,\ - f'mismatched type for {name} between then block ({then_ty}) '\ - f'and else block ({else_ty})' - names.append(name) - ret_types.append(then_ty) - ir_ret_types.append(then_defs[name].handle.get_type()) - - return then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types - - def visit_if_top_level(self, cond, node): - has_endif_block = True - with enter_sub_region(self) as sr: - liveins, ip_block = sr - then_block = self.builder.create_block() - else_block = self.builder.create_block() - # create basic-block after conditional - endif_block = self.builder.create_block() - # create branch - self.builder.set_insertion_point_to_end(ip_block) - self.builder.create_cond_branch(cond.handle, then_block, else_block) - # visit then and else blocks - then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types = \ - self.visit_then_else_blocks(node, liveins, then_block, else_block) - # then terminator - self.builder.set_insertion_point_to_end(then_block) - if then_block.has_return() and else_block.has_return(): - has_endif_block = False - endif_block.erase() - if not then_block.has_terminator() and has_endif_block: - self.builder.create_branch(endif_block, [then_defs[n].handle for n in names]) - # else terminator - self.builder.set_insertion_point_to_end(else_block) - if not else_block.has_terminator() and has_endif_block: - self.builder.create_branch(endif_block, [else_defs[n].handle for n in names]) - if has_endif_block: - for ty in ir_ret_types: - endif_block.add_argument(ty) - if has_endif_block: 
- # change block - self.builder.set_insertion_point_to_start(endif_block) - # update value - for i, name in enumerate(names): - new_tensor = language.core.tensor(endif_block.arg(i), ret_types[i]) - self.set_value(name, new_tensor) - - # TODO: refactor - def visit_if_scf(self, cond, node): - with enter_sub_region(self) as sr: - liveins, _ = sr - ip = self.builder.get_insertion_point() - then_block = self.builder.create_block() - else_block = self.builder.create_block() if node.orelse else None - then_defs, else_defs, then_block, else_block, names, ret_types, _ = \ - self.visit_then_else_blocks(node, liveins, then_block, else_block) - # create if op - self.builder.restore_insertion_point(ip) - if_op = self.builder.create_if_op([ty.to_ir(self.builder) for ty in ret_types], cond.handle, True) - then_block.merge_block_before(if_op.get_then_block()) - self.builder.set_insertion_point_to_end(if_op.get_then_block()) - if len(names) > 0: - self.builder.create_yield_op([then_defs[n].handle for n in names]) - if not node.orelse: - else_block = if_op.get_else_block() - else: - else_block.merge_block_before(if_op.get_else_block()) - self.builder.set_insertion_point_to_end(if_op.get_else_block()) - if len(names) > 0: - self.builder.create_yield_op([else_defs[n].handle for n in names]) - # update values - for i, name in enumerate(names): - new_tensor = language.core.tensor(if_op.get_result(i), ret_types[i]) - self.set_value(name, new_tensor) - - def visit_If(self, node): - cond = self.visit(node.test) - if _is_triton_tensor(cond): - cond = cond.to(language.int1, _builder=self.builder) - contains_return = ContainsReturnChecker(self.gscope).visit(node) - if self.scf_stack and contains_return: - raise UnsupportedLanguageConstruct(None, node, - "Cannot have `return` statements inside `while` or `for` statements in triton") - elif self.scf_stack or not contains_return: - self.visit_if_scf(cond, node) - else: - self.visit_if_top_level(cond, node) - else: - cond = _unwrap_if_constexpr(cond) - if type(cond) not in _condition_types: # not isinstance - we insist the real thing, no subclasses and no ducks - raise UnsupportedLanguageConstruct( - None, node, "`if` conditionals can only accept values of type {{{}}}, not objects of type {}".format( - ', '.join(_.__name__ for _ in _condition_types), type(cond).__name__)) - if cond: - self.visit_compound_statement(node.body) - else: - self.visit_compound_statement(node.orelse) - - def visit_IfExp(self, node): - cond = self.visit(node.test) - if _is_triton_tensor(cond): - cond = cond.to(language.int1, _builder=self.builder) - if _unwrap_if_constexpr(cond): - return self.visit(node.body) - else: - return self.visit(node.orelse) - - def visit_Pass(self, node): - pass - - def visit_Compare(self, node): - if not (len(node.comparators) == 1 and len(node.ops) == 1): - raise UnsupportedLanguageConstruct(None, node, "simultaneous multiple comparison is not supported") - lhs = _unwrap_if_constexpr(self.visit(node.left)) - rhs = _unwrap_if_constexpr(self.visit(node.comparators[0])) - if type(node.ops[0]) == ast.Is: - return constexpr(lhs is rhs) - if type(node.ops[0]) == ast.IsNot: - return constexpr(lhs is not rhs) - method_name = self._method_name_for_comp_op.get(type(node.ops[0])) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST comparison operator '{}' is not (currently) implemented.".format(node.ops[0].__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_comp_op: Dict[Type[ast.cmpop], str] = { - ast.Eq: 
'__eq__', ast.NotEq: '__ne__', ast.Lt: '__lt__', ast.LtE: '__le__', ast.Gt: '__gt__', ast.GtE: '__ge__' - } - - def visit_UnaryOp(self, node): - op = self.visit(node.operand) - fn = self._method_name_for_unary_op.get(type(node.op)) - if fn is None: - raise UnsupportedLanguageConstruct(None, node, "AST unary operator '{}' is not (currently) implemented.".format(node.op.__name__)) - if _is_triton_tensor(op): - return getattr(op, fn)(_builder=self.builder) - return getattr(op, fn)() - _method_name_for_unary_op: Dict[Type[ast.unaryop], str] = {ast.USub: '__neg__', ast.UAdd: '__pos__', ast.Not: '__not__', ast.Invert: '__invert__'} - - def visit_While(self, node): - with enter_sub_region(self) as sr: - liveins, insert_block = sr - - # loop body (the after region) - # loop_block = self.builder.create_block() - dummy = self.builder.create_block() - self.builder.set_insertion_point_to_start(dummy) - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - loop_defs = self.local_defs - - # collect loop-carried values - names = [] - ret_types = [] - init_args = [] - for name in loop_defs: - if name in liveins: - # We should not def new constexpr - assert _is_triton_tensor(loop_defs[name]) - assert _is_triton_tensor(liveins[name]) - assert loop_defs[name].type == liveins[name].type - # these are loop-carried values - names.append(name) - ret_types.append(loop_defs[name].type) - init_args.append(liveins[name]) - - self.builder.set_insertion_point_to_end(insert_block) - while_op = self.builder.create_while_op([ty.to_ir(self.builder) for ty in ret_types], - [arg.handle for arg in init_args]) - # merge the condition region - before_block = self.builder.create_block_with_parent(while_op.get_before(), - [ty.to_ir(self.builder) for ty in ret_types]) - self.builder.set_insertion_point_to_start(before_block) - for i, name in enumerate(names): - self.lscope[name] = language.core.tensor(before_block.arg(i), ret_types[i]) - self.local_defs[name] = self.lscope[name] - cond = self.visit(node.test) - self.builder.set_insertion_point_to_end(before_block) - # create ConditionOp: e.g., scf.condition(%cond) %arg0, %arg1, ... 
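For orientation, the kind of kernel this `visit_While` path lowers looks like the sketch below: `acc` is live-in to the loop and re-assigned inside it, so it becomes a loop-carried block argument of the `scf.while` op being built here, and (as the asserts in this method require) its type must be identical before and after each iteration. This is an untested illustration written against the `triton.jit`/`tl` API visible elsewhere in this patch, not code taken from it:

    import triton
    import triton.language as tl

    @triton.jit
    def _count_up(out_ptr, limit):
        # `acc` starts as a Python int, is promoted to an i32 tensor on
        # assignment, and is carried through the while loop unchanged in type,
        # which is what lets it map onto a typed scf.while block argument.
        acc = 0
        while acc < limit:
            acc += 1
        tl.store(out_ptr, acc)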
- self.builder.create_condition_op(cond.handle, [before_block.arg(i) for i in range(len(init_args))]) - # merge the loop body - after_block = self.builder.create_block_with_parent(while_op.get_after(), - [ty.to_ir(self.builder) for ty in ret_types]) - - # generate loop body - self.builder.set_insertion_point_to_start(after_block) - for i, name in enumerate(names): - self.lscope[name] = language.core.tensor(after_block.arg(i), ret_types[i]) - self.local_defs[name] = self.lscope[name] - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - loop_defs = self.local_defs - yields = [] - for name in loop_defs: - if name in liveins: - yields.append(loop_defs[name]) - self.builder.create_yield_op([y.handle for y in yields]) - - # update global uses in while_op - for i, name in enumerate(names): - after_block.replace_use_in_block_with(init_args[i].handle, after_block.arg(i)) - - # WhileOp defines new values, update the symbol table (lscope, local_defs) - for i, name in enumerate(names): - new_def = language.core.tensor(while_op.get_result(i), ret_types[i]) - self.lscope[name] = new_def - self.local_defs[name] = new_def - - for stmt in node.orelse: - assert False, "Not implemented" - ast.NodeVisitor.generic_visit(self, stmt) - - def visit_Subscript(self, node): - assert node.ctx.__class__.__name__ == "Load" - lhs = self.visit(node.value) - slices = self.visit(node.slice) - if _is_triton_tensor(lhs): - return lhs.__getitem__(slices, _builder=self.builder) - return lhs[slices] - - def visit_ExtSlice(self, node): - return [self.visit(dim) for dim in node.dims] - - def visit_For(self, node): - IteratorClass = self.visit(node.iter.func) - iter_args = [self.visit(arg) for arg in node.iter.args] - if IteratorClass == language.static_range: - iterator = IteratorClass(*iter_args) - static_range = range(iterator.start.value, - iterator.end.value, - iterator.step.value) - for i in static_range: - self.lscope[node.target.id] = constexpr(i) - self.visit_compound_statement(node.body) - for stmt in node.orelse: - ast.NodeVisitor.generic_visit(self, stmt) - return - - if IteratorClass is not range: - raise RuntimeError('Only `range` and `static_range` iterators are currently supported') - - # visit iterator arguments - # note: only `range` iterator is supported now - # collect lower bound (lb), upper bound (ub), and step - lb = iter_args[0] if len(iter_args) > 1 else self.visit(ast.Num(0)) - ub = iter_args[1] if len(iter_args) > 1 else self.visit(node.iter.args[0]) - step = iter_args[2] if len(iter_args) > 2 else self.visit(ast.Num(1)) - # handle negative constant step (not supported by scf.for in MLIR) - negative_step = False - if _is_constexpr(step) and step.value < 0: - step = constexpr(-step.value) - negative_step = True - lb, ub = ub, lb - lb = language.core._to_tensor(lb, self.builder) - ub = language.core._to_tensor(ub, self.builder) - step = language.core._to_tensor(step, self.builder) - # induction variable type - if not lb.dtype.is_int() or not ub.dtype.is_int() or not step.dtype.is_int(): - raise TypeError(f"For loop bounds and step must all be ints, are ({lb.dtype}, {ub.dtype}, {step.dtype})") - iv_type = language.semantic.integer_promote_impl(lb.dtype, ub.dtype) - iv_type = language.semantic.integer_promote_impl(iv_type, step.dtype) - iv_ir_type = iv_type.to_ir(self.builder) - iv_is_signed = iv_type.int_signedness == language.core.dtype.SIGNEDNESS.SIGNED - # lb/ub/step might be constexpr, we need to cast them to tensor - lb = lb.handle - ub = ub.handle - step = 
step.handle - # ForOp can only accept IndexType as lb/ub/step. Cast integer to Index - lb = self.builder.create_int_cast(lb, iv_ir_type, iv_is_signed) - ub = self.builder.create_int_cast(ub, iv_ir_type, iv_is_signed) - step = self.builder.create_int_cast(step, iv_ir_type, iv_is_signed) - # Create placeholder for the loop induction variable - iv = self.builder.create_undef(iv_ir_type) - self.set_value(node.target.id, language.core.tensor(iv, iv_type)) - - with enter_sub_region(self) as sr: - liveins, insert_block = sr - ip = self.builder.get_insertion_point() - - # create loop body block - block = self.builder.create_block() - self.builder.set_insertion_point_to_start(block) - # dry visit loop body - self.scf_stack.append(node) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - block.erase() - - # If a variable (name) is defined in both its parent & itself, then it's - # a loop-carried variable. (They must be of the same type) - init_args = [] - yields = [] - names = [] - for name in self.local_defs: - if name in liveins: - assert _is_triton_tensor(self.local_defs[name]), f'{name} is not tensor' - assert _is_triton_tensor(liveins[name]) - assert self.local_defs[name].type == liveins[name].type,\ - f'Loop-carried variable {name} has initial type {liveins[name].type} '\ - f'but is re-assigned to {self.local_defs[name].type} in loop! '\ - f'Please make sure that the type stays consistent.' - - names.append(name) - init_args.append(language.core._to_tensor(liveins[name], self.builder)) - yields.append(language.core._to_tensor(self.local_defs[name], self.builder)) - - # create ForOp - self.builder.restore_insertion_point(ip) - for_op = self.builder.create_for_op(lb, ub, step, [arg.handle for arg in init_args]) - - self.scf_stack.append(node) - self.builder.set_insertion_point_to_start(for_op.get_body(0)) - for i, name in enumerate(names): - self.set_value(name, language.core.tensor(for_op.get_body(0).arg(i + 1), yields[i].type)) - self.visit_compound_statement(node.body) - self.scf_stack.pop() - yields = [] - for name in self.local_defs: - if name in liveins: - yields.append(language.core._to_tensor(self.local_defs[name], self.builder)) - - # create YieldOp - if len(yields) > 0: - self.builder.create_yield_op([y.handle for y in yields]) - for_op_region = for_op.get_body(0).get_parent() - assert for_op_region.size() == 1, "We use SCF, so the loop body should only have one block" - - # update induction variable with actual value, and replace all uses - self.builder.set_insertion_point_to_start(for_op.get_body(0)) - iv = for_op.get_induction_var() - if negative_step: - iv = self.builder.create_sub(ub, iv) - iv = self.builder.create_add(iv, lb) - self.lscope[node.target.id].handle.replace_all_uses_with(iv) - self.set_value(node.target.id, language.core.tensor(iv, iv_type)) - - # update lscope & local_defs (ForOp defines new values) - for i, name in enumerate(names): - self.set_value(name, language.core.tensor(for_op.get_result(i), yields[i].type)) - - for stmt in node.orelse: - assert False, "Don't know what to do with else after for" - ast.NodeVisitor.generic_visit(self, stmt) - - def visit_Slice(self, node): - lower = self.visit(node.lower) - upper = self.visit(node.upper) - step = self.visit(node.step) - return slice(lower, upper, step) - - def visit_Index(self, node): - return self.visit(node.value) - - def visit_keyword(self, node) -> Tuple[str, Any]: - return node.arg, self.visit(node.value) - - def visit_Assert(self, node) -> Any: - if not self.debug: - return - test = 
self.visit(node.test) - msg = self.visit(node.msg) - # Convert assert to triton's device_assert which happens on the device - return language.core.device_assert(test, msg, _builder=self.builder) - - def call_JitFunction(self, fn: JITFunction, args, kwargs): - args = inspect.getcallargs(fn.fn, *args, **kwargs) - args = [args[name] for name in fn.arg_names] - args = [arg if _is_triton_tensor(arg) - else constexpr(arg) for arg in args] - # generate function def - attributes = dict() - constexprs = [i for i, arg in enumerate(args) if _is_constexpr(arg)] - constants = {i: args[i] for i in constexprs} - # generate call - args = [None if i in constexprs else arg for i, arg in enumerate(args)] - arg_vals = [arg.handle for arg in args if arg is not None] - arg_types = [arg.type for arg in args if arg is not None] - fn_name = mangle_fn(fn.__name__, arg_types, constants) - # generate function def if necessary - if not self.module.has_function(fn_name): - prototype = language.function_type([], arg_types) - gscope = sys.modules[fn.fn.__module__].__dict__ - # If the callee is not set, we use the same debug setting as the caller - debug = self.debug if fn.debug is None else fn.debug - generator = CodeGenerator(self.builder.context, prototype, gscope, attributes, constants, module=self.module, function_name=fn_name, function_types=self.function_ret_types, debug=debug, noinline=fn.noinline) - generator.visit(fn.parse()) - callee_ret_type = generator.last_ret_type - self.function_ret_types[fn_name] = callee_ret_type - else: - callee_ret_type = self.function_ret_types[fn_name] - symbol = self.module.get_function(fn_name) - call_op = self.builder.call(symbol, arg_vals) - if call_op.get_num_results() == 0 or callee_ret_type is None: - return None - elif call_op.get_num_results() == 1: - return tensor(call_op.get_result(0), callee_ret_type) - else: - # should return a tuple of tl.tensor - results = [] - for i in range(call_op.get_num_results()): - results.append(tensor(call_op.get_result(i), callee_ret_type[i])) - return tuple(results) - - def visit_Call(self, node): - fn = _unwrap_if_constexpr(self.visit(node.func)) - - static_implementation = self.statically_implemented_functions.get(fn) - if static_implementation is not None: - return static_implementation(self, node) - - kws = dict(self.visit(keyword) for keyword in node.keywords) - args = [self.visit(arg) for arg in node.args] - if fn is language.core.device_assert: # TODO: this should not be so hardcoded - if not self.debug: - return - if isinstance(fn, JITFunction): - _check_fn_args(node, fn, args) - return self.call_JitFunction(fn, args, kws) - if (hasattr(fn, '__self__') and _is_triton_tensor(fn.__self__)) or language.core.is_builtin(fn): - extra_kwargs = dict(_builder=self.builder) - sig = inspect.signature(fn) - if '_generator' in sig.parameters: - extra_kwargs['_generator'] = self - return fn(*args, **extra_kwargs, **kws) - if fn in self.builtin_namespace.values(): - args = map(_unwrap_if_constexpr, args) - return fn(*args, **kws) - - def visit_Constant(self, node): - return constexpr(node.value) - - def visit_BoolOp(self, node: ast.BoolOp): - if len(node.values) != 2: - raise UnsupportedLanguageConstruct(None, node, "chained boolean operators (A or B or C) are not supported; use parentheses to split the chain.") - lhs = self.visit(node.values[0]) - rhs = self.visit(node.values[1]) - method_name = self._method_name_for_bool_op.get(type(node.op)) - if method_name is None: - raise UnsupportedLanguageConstruct(None, node, "AST boolean operator '{}' is 
not (currently) implemented.".format(node.op.__name__)) - return self._apply_binary_method(method_name, lhs, rhs) - _method_name_for_bool_op: Dict[Type[ast.boolop], str] = {ast.And: 'logical_and', ast.Or: 'logical_or'} - - if sys.version_info < (3, 8): - def visit_NameConstant(self, node): - return constexpr(node.value) - - def visit_Num(self, node): - return constexpr(node.n) - - def visit_Str(self, node): - return constexpr(ast.literal_eval(node)) - - def visit_Attribute(self, node): - lhs = self.visit(node.value) - if _is_triton_tensor(lhs): - if node.attr == "T": - return language.semantic.trans(lhs, builder=self.builder) - return getattr(lhs, node.attr) - - def visit_Expr(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def visit_NoneType(self, node): - return None - - def visit_JoinedStr(self, node): - values = list(node.values) - for i, value in enumerate(values): - if isinstance(value, ast.Constant): - values[i] = str(value.value) - elif isinstance(value, ast.FormattedValue): - conversion_code = value.conversion - evaluated = self.visit(value.value) - if not _is_constexpr(evaluated): - raise UnsupportedLanguageConstruct( - None, node, "Cannot evaluate f-string containing non-constexpr conversion values, found conversion of type " + str(type(evaluated))) - values[i] = ("{}" if conversion_code < 0 else "{!" + chr(conversion_code) + "}").format(evaluated.value) - else: - raise AssertionError("encountered unexpected node of type {} in a JoinedStr node".format(type(value))) - return ''.join(values) - - def visit(self, node): - if node is not None: - self.last_node = node - with warnings.catch_warnings(): - # The ast library added visit_Constant and deprecated some other - # methods but we can't move to that without breaking Python 3.6 and 3.7. - warnings.simplefilter("ignore", DeprecationWarning) # python 3.9 - warnings.simplefilter("ignore", PendingDeprecationWarning) # python 3.8 - return super().visit(node) - - def generic_visit(self, node): - raise UnsupportedLanguageConstruct(None, node, "unsupported AST node type: {}".format(type(node).__name__)) - - def execute_static_print(self, node: ast.Call) -> None: - # TODO: too simplistic? Perhaps do something else with non-constexpr - - kws = {name: _unwrap_if_constexpr(value) for name, value in (self.visit(keyword) for keyword in node.keywords)} - args = [_unwrap_if_constexpr(self.visit(arg)) for arg in node.args] - print(*args, **kws) - - def execute_static_assert(self, node: ast.Call) -> None: - arg_count = len(node.args) - if not (0 < arg_count <= 2) or len(node.keywords): - raise TypeError("`static_assert` requires one or two positional arguments only") - - passed = self.visit(node.args[0]) - if not isinstance(passed, bool): - raise NotImplementedError("Assertion condition could not be determined at compile-time. 
Make sure that it depends only on `constexpr` values") - if not passed: - if arg_count == 1: - message = "" - else: - try: - message = self.visit(node.args[1]) - except Exception as e: - message = "" - - raise CompileTimeAssertionFailure(None, node, _unwrap_if_constexpr(message)) - return None - - statically_implemented_functions: Dict[object, Callable[[ast.Call], Any]] = { - language.core.static_assert: execute_static_assert, - language.core.static_print: execute_static_print, - } - - -def str_to_ty(name): - if name[0] == "*": - ty = str_to_ty(name[1:]) - return language.pointer_type(ty) - tys = { - "fp8e5": language.float8e5, - "fp8e4": language.float8e4, - "fp16": language.float16, - "bf16": language.bfloat16, - "fp32": language.float32, - "fp64": language.float64, - "i1": language.int1, - "i8": language.int8, - "i16": language.int16, - "i32": language.int32, - "i64": language.int64, - "u8": language.uint8, - "u16": language.uint16, - "u32": language.uint32, - "u64": language.uint64, - "B": language.int1, - } - return tys[name] - - -def kernel_suffix(signature, specialization): - # suffix format: - # <'c' if equal to 1><'d' if divisible by 16> - suffix = '' - for i, _ in enumerate(signature): - suffix += str(i) - if i in specialization.equal_to_1: - suffix += 'c' - if i in specialization.divisible_by_16: - suffix += 'd' - return suffix - - -def ast_to_ttir(fn, signature, specialization, constants, debug): - # canonicalize signature - if isinstance(signature, str): - signature = {k: v.strip() for k, v in enumerate(signature.split(","))} - context = ir.context() - context.load_triton() - # create kernel prototype - cst_key = lambda i: fn.arg_names.index(i) if isinstance(i, str) else i - constants = {cst_key(key): value for key, value in constants.items()} - # visit kernel AST - gscope = fn.__globals__.copy() - function_name = '_'.join([fn.__name__, kernel_suffix(signature.values(), specialization)]) - tys = list(signature.values()) - new_constants = {k: True if k in tys and tys[k] == "i1" else 1 for k in specialization.equal_to_1} - new_attrs = {k: ("multiple_of", 16) for k in specialization.divisible_by_16} - all_constants = constants.copy() - all_constants.update(new_constants) - arg_types = [str_to_ty(v) for k, v in signature.items() if k not in constants] - - prototype = language.function_type([], arg_types) - generator = CodeGenerator(context, prototype, gscope=gscope, constants=all_constants, - function_name=function_name, attributes=new_attrs, - is_kernel=True, debug=debug) - try: - generator.visit(fn.parse()) - except CompilationError as e: - if e.src is None: - e.set_source_code(fn.src) - raise - except Exception as e: - node = generator.last_node - if node is None: - raise - raise CompilationError(fn.src, node, repr(e)) from e - ret = generator.module - # module takes ownership of the context - ret.context = context - return ret diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py deleted file mode 100644 index 8c1dacbe12bd..000000000000 --- a/python/triton/compiler/compiler.py +++ /dev/null @@ -1,577 +0,0 @@ -from __future__ import annotations - -import functools -import hashlib -import json -import os -import re -import subprocess -import tempfile -from collections import namedtuple -from pathlib import Path -from typing import Any, Tuple - -import triton -import triton._C.libtriton.triton as _triton -from ..runtime import driver -# TODO: runtime.errors -from ..runtime.autotuner import OutOfResources -from ..runtime.cache import 
get_cache_manager -from ..tools.disasm import extract -from .code_generator import ast_to_ttir -from .make_launcher import make_stub - - -def inline_triton_ir(mod): - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_inliner_pass() - pm.run(mod) - return mod - - -def ttir_compute_capability_rewrite(mod, arch): - # For hardware without support, we must rewrite all load/store - # with block (tensor) pointers into tensors of pointers - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - if _is_cuda(arch): - pm.add_rewrite_tensor_pointer_pass(arch) - pm.run(mod) - return mod - - -def optimize_ttir(mod, arch): - mod = inline_triton_ir(mod) - mod = ttir_compute_capability_rewrite(mod, arch) - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_inliner_pass() - pm.add_triton_combine_pass() - pm.add_canonicalizer_pass() - pm.add_cse_pass() - pm.add_licm_pass() - pm.add_symbol_dce_pass() - pm.run(mod) - return mod - - -def ttir_to_ttgir(mod, num_warps): - pm = _triton.ir.pass_manager(mod.context) - pm.add_convert_triton_to_tritongpu_pass(num_warps) - pm.run(mod) - return mod - - -def optimize_ttgir(mod, num_stages, arch): - pm = _triton.ir.pass_manager(mod.context) - pm.enable_debug() - pm.add_tritongpu_coalesce_pass() - pm.add_tritongpu_remove_layout_conversions_pass() - if isinstance(arch, int): - pm.add_tritongpu_accelerate_matmul_pass(arch) - pm.add_tritongpu_remove_layout_conversions_pass() - pm.add_tritongpu_optimize_dot_operands_pass() - pm.add_tritongpu_pipeline_pass(num_stages) - pm.add_tritongpu_prefetch_pass() - pm.add_tritongpu_optimize_dot_operands_pass() - pm.add_tritongpu_remove_layout_conversions_pass() - pm.add_tritongpu_decompose_conversions_pass() - pm.add_tritongpu_reorder_instructions_pass() - pm.add_cse_pass() - pm.add_symbol_dce_pass() - pm.run(mod) - return mod - - -def _add_external_libs(mod, libs): - for name, path in libs.items(): - if len(name) == 0 or len(path) == 0: - return - _triton.add_external_libs(mod, list(libs.keys()), list(libs.values())) - - -def ttgir_to_llir(mod, extern_libs, arch): - if extern_libs: - _add_external_libs(mod, extern_libs) - # TODO: separate tritongpu_to_llvmir for different backends - if _is_cuda(arch): - return _triton.translate_triton_gpu_to_llvmir(mod, arch, False) - else: - return _triton.translate_triton_gpu_to_llvmir(mod, 0, True) - - -# PTX translation - -@functools.lru_cache() -def ptx_get_version(cuda_version) -> int: - ''' - Get the highest PTX version supported by the current CUDA driver. 
- ''' - assert isinstance(cuda_version, str) - major, minor = map(int, cuda_version.split('.')) - if major == 12: - return 80 + minor - if major == 11: - return 70 + minor - if major == 10: - return 63 + minor - raise RuntimeError("Triton only supports CUDA 10.0 or higher") - - -@functools.lru_cache() -def path_to_ptxas(): - base_dir = os.path.join(os.path.dirname(__file__), os.pardir) - paths = [ - os.environ.get("TRITON_PTXAS_PATH", ""), - os.path.join(base_dir, "third_party", "cuda", "bin", "ptxas") - ] - - for ptxas in paths: - if os.path.exists(ptxas) and os.path.isfile(ptxas): - result = subprocess.check_output([ptxas, "--version"], stderr=subprocess.STDOUT) - if result is not None: - version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) - if version is not None: - return ptxas, version.group(1) - raise RuntimeError("Cannot find ptxas") - - -def llir_to_ptx(mod: Any, arch: int, ptx_version: int = None) -> str: - ''' - Translate TritonGPU module to PTX code. - :param mod: a TritonGPU dialect module - :return: PTX code - ''' - if ptx_version is None: - _, cuda_version = path_to_ptxas() - ptx_version = ptx_get_version(cuda_version) - return _triton.translate_llvmir_to_ptx(mod, arch, ptx_version) - - -def ptx_to_cubin(ptx: str, arch: int): - ''' - Compile TritonGPU module to cubin. - :param ptx: ptx code - :param arch: compute capability - :return: str - ''' - ptxas, _ = path_to_ptxas() - return _triton.compile_ptx_to_cubin(ptx, ptxas, arch) - - -# AMDGCN translation - -def get_amdgcn_bitcode_paths(arch): - gpu_arch_agnostic_bitcode_libraries = ["opencl.bc", - "ocml.bc", - "ockl.bc", - "oclc_finite_only_off.bc", - "oclc_daz_opt_off.bc", - "oclc_correctly_rounded_sqrt_on.bc", - "oclc_unsafe_math_off.bc", - "oclc_wavefrontsize64_on.bc"] - - gfx_arch = arch[1] - gfx_arch_id = re.search('gfx(\\w+)', gfx_arch).group(1).strip() - - gpu_arch_specific_bitcode_library = 'oclc_isa_version_' + gfx_arch_id + ".bc" - bitcode_path_dir = os.path.join(Path(__file__).parent.resolve(), "third_party/rocm/lib/bitcode/") - - amdgcn_bitcode_paths = {} - i = 1 - for bc_lib in gpu_arch_agnostic_bitcode_libraries: - bc_path = bitcode_path_dir + bc_lib - if os.path.exists(bc_path): - amdgcn_bitcode_paths['library_' + str(i)] = bc_path - i += 1 - bc_gfx_path = bitcode_path_dir + gpu_arch_specific_bitcode_library - if os.path.exists(bc_gfx_path): - amdgcn_bitcode_paths['library_' + str(i)] = bc_gfx_path - - return amdgcn_bitcode_paths - - -def get_amdgpu_arch_fulldetails(): - """ - get the amdgpu full ISA details for compiling: - i.e., arch_triple: amdgcn-amd-amdhsa; arch_name: gfx906; arch_features: sramecc+:xnack- - """ - try: - # TODO: package rocm.cc with Triton - rocm_path_dir = os.getenv("ROCM_PATH", default="/opt/rocm") - rocminfo = subprocess.check_output(rocm_path_dir + '/bin/rocminfo').decode() - gfx_arch_details = re.search('amd.*', rocminfo).group(0).strip().split('--') - arch_triple = gfx_arch_details[0] - arch_name_features = gfx_arch_details[1].split(':') - arch_name = arch_name_features[0] - arch_features = "" - - if (len(arch_name_features) == 3): - arch_features = "+" + re.search('\\w+', arch_name_features[1]).group(0) + ","\ - "-" + re.search('\\w+', arch_name_features[2]).group(0) - return [arch_triple, arch_name, arch_features] - except BaseException: - return None - - -def llir_to_amdgcn_and_hsaco(mod: Any, gfx_arch: str, gfx_triple: str, gfx_features: str) -> Tuple[str, str]: - ''' - Translate TritonGPU module to HSACO code based on full
details of gpu architecture. - :param mod: a TritonGPU dialect module - :return: - - AMDGCN code - - Path to HSACO object - ''' - return _triton.translate_llvmir_to_hsaco(mod, gfx_arch, gfx_triple, gfx_features) - - -# ------------------------------------------------------------------------------ -# compiler -# ------------------------------------------------------------------------------ -def get_kernel_name(src: str, pattern: str) -> str: - ''' - Get kernel name from PTX code. - This Kernel name is required when launching the kernel. - ''' - # There is a name mangling in PTX codegen, so the original kernel names in Triton IR are not available in PTX/cubin. - assert src - for line in src.split('\n'): - line = line.strip() - if line.startswith(pattern): - return line.split()[-1] - - -def convert_type_repr(x): - match = re.search(r'!tt\.ptr<(.*)>', x) - if match is not None: - return '*' + convert_type_repr(match.group(1)) - return x - - -def make_hash(fn, arch, **kwargs): - if isinstance(fn, triton.runtime.JITFunction): - configs = kwargs["configs"] - signature = kwargs["signature"] - constants = kwargs.get("constants", dict()) - num_warps = kwargs.get("num_warps", 4) - num_stages = kwargs.get("num_stages", 3) - debug = kwargs.get("debug", False) - # Get unique key for the compiled code - get_conf_key = lambda conf: (sorted(conf.divisible_by_16), sorted(conf.equal_to_1)) - configs_key = [get_conf_key(conf) for conf in configs] - key = f"{fn.cache_key}-{''.join(signature.values())}-{configs_key}-{constants}-{num_warps}-{num_stages}-{debug}-{arch}" - return hashlib.md5(key.encode("utf-8")).hexdigest() - assert isinstance(fn, str) - return hashlib.md5((Path(fn).read_text() + triton.runtime.jit.version_key()).encode("utf-8")).hexdigest() - - -# - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func, -# and any following whitespace -# - (public\s+)? : optionally match the keyword public and any following whitespace -# - (@\w+) : match an @ symbol followed by one or more word characters -# (letters, digits, or underscores), and capture it as group 1 (the function name) -# - (\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\)) : match a pair of parentheses enclosing -# zero or more arguments separated by commas, and capture it as group 2 (the argument list) -mlir_prototype_pattern = r'^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$' -ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)" -prototype_pattern = { - "ttir": mlir_prototype_pattern, - "ttgir": mlir_prototype_pattern, - "ptx": ptx_prototype_pattern, -} - -mlir_arg_type_pattern = r'%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?' 
-ptx_arg_type_pattern = r"\.param\s+\.(\w+)" -arg_type_pattern = { - "ttir": mlir_arg_type_pattern, - "ttgir": mlir_arg_type_pattern, - "ptx": ptx_arg_type_pattern, -} - -ttgir_num_warps_pattern = r'"triton_gpu.num-warps"\s?=\s?(\d+)\s?:' - - -def _get_jsonable_constants(constants): - def _is_jsonable(x): - try: - json.dumps(x) - return True - except (TypeError, OverflowError): - return False - serialized_constants = {} - for constant in constants: - if _is_jsonable(constants[constant]): - serialized_constants[constant] = constants[constant] - return serialized_constants - - -def parse_mlir_module(path, context): - module = _triton.ir.parse_mlir_module(path, context) - # module takes ownership of the context - module.context = context - return module - - -instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"], defaults=[set(), set()]) - - -# TODO: architecture descriptor class -def _is_cuda(arch): - return isinstance(arch, int) - - -def get_architecture_descriptor(capability): - try: - import torch - except ImportError: - raise ImportError("Triton requires PyTorch to be installed") - if capability is None: - if torch.version.hip is None: - device = triton.runtime.jit.get_current_device() - capability = triton.runtime.jit.get_device_capability(device) - capability = capability[0] * 10 + capability[1] - else: - capability = get_amdgpu_arch_fulldetails() - return capability - - -def add_rocm_stages(arch, extern_libs, stages): - extern_libs.update(get_amdgcn_bitcode_paths(arch)) - - for key in list(extern_libs): - if extern_libs[key] == '' or extern_libs[key] is None: - extern_libs.pop(key) - - gfx_arch_full_details = arch - gfx_arch = os.environ.get('MI_GPU_ARCH', gfx_arch_full_details[1]) - if gfx_arch is None: - raise RuntimeError('gfx_arch is None (not specified)') - stages["amdgcn"] = (lambda path: Path(path).read_text(), - lambda src: llir_to_amdgcn_and_hsaco(src, gfx_arch, - gfx_arch_full_details[0], - gfx_arch_full_details[2])) - - -def add_cuda_stages(arch, extern_libs, stages): - - stages["ptx"] = (lambda path: Path(path).read_text(), - lambda src: llir_to_ptx(src, arch)) - stages["cubin"] = (lambda path: Path(path).read_bytes(), - lambda src: ptx_to_cubin(src, arch)) - - -def compile(fn, **kwargs): - arch = get_architecture_descriptor(kwargs.get("cc", None)) - is_cuda = _is_cuda(arch) - context = _triton.ir.context() - asm = dict() - constants = kwargs.get("constants", dict()) - num_warps = kwargs.get("num_warps", 4) - num_stages = kwargs.get("num_stages", 3 if is_cuda and arch >= 75 else 2) - extern_libs = kwargs.get("extern_libs", dict()) - if extern_libs is None: - extern_libs = dict() - debug = kwargs.get("debug", False) - # build compilation stages - stages = dict() - stages["ast"] = (lambda path: fn, None) - stages["ttir"] = (lambda path: parse_mlir_module(path, context), - lambda src: optimize_ttir(ast_to_ttir(src, signature, configs[0], constants, debug=debug), arch)) - stages["ttgir"] = (lambda path: parse_mlir_module(path, context), - lambda src: optimize_ttgir(ttir_to_ttgir(src, num_warps), num_stages, arch)) - stages["llir"] = (lambda path: Path(path).read_text(), - lambda src: ttgir_to_llir(src, extern_libs, arch)) - if is_cuda: - add_cuda_stages(arch, extern_libs, stages) - else: - add_rocm_stages(arch, extern_libs, stages) - - # find out the signature of the function - if isinstance(fn, triton.runtime.JITFunction): - configs = kwargs.get("configs", None) - signature = kwargs["signature"] - if configs is None: - configs = 
[instance_descriptor()] - assert len(configs) == 1 - kwargs["configs"] = configs - name = fn.__name__ - first_stage = 0 - if isinstance(signature, str): - signature = {k: v.strip() for k, v in enumerate(signature.split(","))} - kwargs["signature"] = signature - else: - assert isinstance(fn, str) - _, ir = os.path.basename(fn).split(".") - src = Path(fn).read_text() - import re - match = re.search(prototype_pattern[ir], src, re.MULTILINE) - name, signature = match.group(1), match.group(2) - types = re.findall(arg_type_pattern[ir], signature) - if ir == 'ttgir': - num_warps_matches = re.findall(ttgir_num_warps_pattern, src) - assert len(num_warps_matches) == 1, "Expected exactly one match for num_warps" - assert "num_warps" not in kwargs or int(num_warps_matches[0]) == num_warps, "num_warps in ttgir does not match num_warps in compile" - num_warps = int(num_warps_matches[0]) - param_tys = [convert_type_repr(ty) for ty in types] - signature = {k: v for k, v in enumerate(param_tys)} - first_stage = list(stages.keys()).index(ir) - - # cache manager - so_path = make_stub(name, signature, constants) - # create cache manager - fn_cache_manager = get_cache_manager(make_hash(fn, arch, **kwargs)) - # determine name and extension type of provided function - if isinstance(fn, triton.runtime.JITFunction): - name, ext = fn.__name__, "ast" - else: - name, ext = os.path.basename(fn).split(".") - - # load metadata if any - metadata = None - metadata_filename = f"{name}.json" - - # The group is addressed by the metadata - metadata_group = fn_cache_manager.get_group( - metadata_filename - ) or {} - - metadata_path = metadata_group.get(metadata_filename) - - if metadata_path is not None: - with open(metadata_path) as f: - metadata = json.load(f) - else: - metadata = {"num_warps": num_warps, - "num_stages": num_stages, - "constants": _get_jsonable_constants(constants), - "debug": debug} - if ext == "ptx": - assert "shared" in kwargs, "ptx compilation must provide shared memory size" - metadata["shared"] = kwargs["shared"] - - first_stage = list(stages.keys()).index(ext) - asm = dict() - module = fn - # run compilation pipeline and populate metadata - for ir, (parse, compile_kernel) in list(stages.items())[first_stage:]: - ir_filename = f"{name}.{ir}" - - if ir == ext: - next_module = parse(fn) - else: - path = metadata_group.get(ir_filename) - if path is None: - next_module = compile_kernel(module) - if ir == "amdgcn": - extra_file_name = f"{name}.hsaco_path" - metadata_group[ir_filename] = fn_cache_manager.put(next_module[0], ir_filename) - metadata_group[extra_file_name] = fn_cache_manager.put(next_module[1], extra_file_name) - else: - metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename) - fn_cache_manager.put(next_module, ir_filename) - else: - if ir == "amdgcn": - extra_file_name = f"{name}.hsaco_path" - hasco_path = metadata_group.get(extra_file_name) - assert hasco_path is not None, "Expected to have hsaco_path in metadata when we have the amdgcn" - next_module = (parse(path), parse(hasco_path)) - else: - next_module = parse(path) - - if ir == "cubin": - asm[ir] = next_module - elif ir == "amdgcn": - asm[ir] = str(next_module[0]) - else: - asm[ir] = str(next_module) - if ir == "llir" and "shared" not in metadata: - metadata["shared"] = _triton.get_shared_memory_size(module) - if ir == "ptx": - metadata["name"] = get_kernel_name(next_module, pattern='// .globl') - if ir == "amdgcn": - metadata["name"] = get_kernel_name(next_module[0], pattern='.globl') - asm["hsaco_path"] = 
next_module[1] - module = next_module - # write-back metadata, if it didn't come from the cache - if metadata_path is None: - metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata), metadata_filename, binary=False) - fn_cache_manager.put_group(metadata_filename, metadata_group) - - # return handle to compiled kernel - return CompiledKernel(fn, so_path, metadata, asm) - - -class CompiledKernel: - - # Hooks for external tools to monitor the execution of triton kernels - launch_enter_hook = None - launch_exit_hook = None - - def __init__(self, fn, so_path, metadata, asm): - # initialize launcher - import importlib.util - spec = importlib.util.spec_from_file_location("__triton_launcher", so_path) - mod = importlib.util.module_from_spec(spec) - self.fn = fn - spec.loader.exec_module(mod) - self.c_wrapper = getattr(mod, "launch") - # initialize metadata - self.shared = metadata["shared"] - self.num_warps = metadata["num_warps"] - self.num_stages = metadata["num_stages"] - self.constants = metadata["constants"] - # initialize asm dict - self.asm = asm - # binaries are lazily initialized - # because it involves doing runtime things - # (e.g., checking amount of shared memory on current device) - self.metadata = metadata - self.cu_module = None - self.cu_function = None - - def _init_handles(self): - if self.cu_module is not None: - return - device = triton.runtime.jit.get_current_device() - bin_path = { - driver.HIP: "hsaco_path", - driver.CUDA: "cubin" - }[driver.backend] - max_shared = driver.utils.get_device_properties(device)["max_shared_mem"] - if self.shared > max_shared: - raise OutOfResources(self.shared, max_shared, "shared memory") - mod, func, n_regs, n_spills = driver.utils.load_binary(self.metadata["name"], self.asm[bin_path], self.shared, device) - - self.n_spills = n_spills - self.n_regs = n_regs - self.cu_module = mod - self.cu_function = func - - def __getattribute__(self, name): - if name == 'c_wrapper': - self._init_handles() - return super().__getattribute__(name) - - def __getitem__(self, grid): - self._init_handles() - - def runner(*args, stream=None): - if stream is None: - stream = triton.runtime.jit.get_cuda_stream() - self.c_wrapper(grid[0], grid[1], grid[2], self.num_warps, self.shared, stream, self.cu_function, - CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, self, *args) - return runner - - def get_sass(self, fun=None): - if 'sass' in self.asm: - return self.asm['sass'] - fd, path = tempfile.mkstemp() - try: - with open(fd, 'wb') as cubin: - cubin.write(self.asm['cubin']) - self.sass = extract(path, fun) - finally: - os.remove(path) - self.asm['sass'] = self.sass - return self.sass diff --git a/python/triton/compiler/errors.py b/python/triton/compiler/errors.py deleted file mode 100644 index 2930117b5170..000000000000 --- a/python/triton/compiler/errors.py +++ /dev/null @@ -1,52 +0,0 @@ -import ast -from typing import Optional, Union - - -class CompilationError(Exception): - source_line_count_max_in_message = 12 - - def _format_message(self) -> str: - node = self.node - if self.src is None: - source_excerpt = " " - else: - source_excerpt = self.src.split('\n')[:node.lineno][-self.source_line_count_max_in_message:] - if source_excerpt: - source_excerpt.append(' ' * node.col_offset + '^') - source_excerpt = '\n'.join(source_excerpt) - else: - source_excerpt = " " - - message = "at {}:{}:{}".format(node.lineno, node.col_offset, source_excerpt) - if self.error_message: - message += '\n' + self.error_message - return message - - 
def __init__(self, src: Optional[str], node: ast.AST, error_message: Union[str, None]): - self.src = src - self.node = node - self.error_message = error_message - self.message = self._format_message() - - def set_source_code(self, src: Optional[str]): - self.src = src - self.message = self._format_message() - - def __str__(self): - return self.message - - def __repr__(self): - return "{}({!r})".format(type(self).__name__, self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return type(self), (self.src, self.node, self.error_message) - - -class CompileTimeAssertionFailure(CompilationError): - """Specific exception for failed tests in `static_assert` invocations""" - pass - - -class UnsupportedLanguageConstruct(CompilationError): - pass diff --git a/python/triton/compiler/make_launcher.py b/python/triton/compiler/make_launcher.py deleted file mode 100644 index 3da8ddccf5c5..000000000000 --- a/python/triton/compiler/make_launcher.py +++ /dev/null @@ -1,373 +0,0 @@ -import hashlib -import os -import tempfile - -from ..common import _build -from ..runtime.cache import get_cache_manager -from ..runtime.jit import version_key - - -def is_hip(): - import torch - return torch.version.hip is not None - - -# ----- stub -------- - - -def make_so_cache_key(version_hash, signature, constants): - # Get unique key for the compiled code - signature = {k: 'ptr' if v[0] == '*' else v for k, v in signature.items()} - key = f"{version_hash}-{''.join(signature.values())}{constants}" - key = hashlib.md5(key.encode("utf-8")).hexdigest() - return key - - -def make_stub(name, signature, constants): - # name of files that are cached - so_cache_key = make_so_cache_key(version_key(), signature, constants) - so_cache_manager = get_cache_manager(so_cache_key) - so_name = f"{name}.so" - # retrieve stub from cache if it exists - cache_path = so_cache_manager.get_file(so_name) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src = generate_launcher(constants, signature) - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build(name, src_path, tmpdir) - with open(so, "rb") as f: - return so_cache_manager.put(f.read(), so_name, binary=True) - else: - return cache_path - -# ----- source code generation -------- - - -def ty_to_cpp(ty): - if ty[0] == '*': - return "hipDeviceptr_t" if is_hip() else "CUdeviceptr" - return { - "i1": "int32_t", - "i8": "int8_t", - "i16": "int16_t", - "i32": "int32_t", - "i64": "int64_t", - "u32": "uint32_t", - "u64": "uint64_t", - "fp16": "float", - "bf16": "float", - "fp32": "float", - "f32": "float", - "fp64": "double", - }[ty] - - -def generate_launcher(constants, signature): - arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items()) - - def _extracted_type(ty): - if ty[0] == '*': - return "PyObject*" - return { - 'i1': 'int32_t', - 'i32': 'int32_t', - 'i64': 'int64_t', - 'u32': 'uint32_t', - 'u64': 'uint64_t', - 'fp16': 'float', - 'bf16': 'float', - 'fp32': 'float', - 'f32': 'float', - 'fp64': 'double', - }[ty] - - def format_of(ty): - return { - "PyObject*": "O", - "float": "f", - "double": "d", - "long": "l", - "uint32_t": "I", - "int32_t": "i", - "uint64_t": "K", - "int64_t": "L", - }[ty] - - format = "iiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()]) - - # generate glue code - if is_hip(): - src = f""" - #define __HIP_PLATFORM_AMD__ - #include - #include - #include - - static inline void gpuAssert(hipError_t code, 
const char *file, int line) - {{ - if (code != HIP_SUCCESS) - {{ - const char* prefix = "Triton Error [HIP]: "; - const char* str = hipGetErrorString(code); - char err[1024] = {{0}}; - snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str ); - PyErr_SetString(PyExc_RuntimeError, err); - }} - }} - - #define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} - - static void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, hipStream_t stream, hipFunction_t function, {arg_decls}) {{ - void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; - if (gridX*gridY*gridZ > 0) {{ - HIP_CHECK(hipModuleLaunchKernel(function, gridX, gridY, gridZ, 64*num_warps, 1, 1, shared_memory, stream, params, 0)); - }} - }} - - typedef struct _DevicePtrInfo {{ - hipDeviceptr_t dev_ptr; - bool valid; - }} DevicePtrInfo; - - static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ - DevicePtrInfo ptr_info; - ptr_info.dev_ptr = 0; - ptr_info.valid = true; - - if (PyLong_Check(obj)) {{ - ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj); - return ptr_info; - }} - - if (obj == Py_None) {{ - // valid nullptr - return ptr_info; - }} - - PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); - - if (ptr) {{ - PyObject *empty_tuple = PyTuple_New(0); - PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); - Py_DECREF(empty_tuple); - Py_DECREF(ptr); - - if (!PyLong_Check(ret)) {{ - PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); - ptr_info.valid = false; - return ptr_info; - }} - - ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret); - - if (!ptr_info.dev_ptr) - return ptr_info; - - uint64_t dev_ptr; - hipError_t status = hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); - if (status == hipErrorInvalidValue) {{ - PyErr_Format(PyExc_ValueError, - "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); - ptr_info.valid = false; - }} - - ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr; - return ptr_info; - }} - - PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); - return ptr_info; - }} - - static PyObject* launch(PyObject* self, PyObject* args) {{ - - int gridX, gridY, gridZ; - uint64_t _stream; - uint64_t _function; - int num_warps; - int shared_memory; - PyObject *launch_enter_hook = NULL; - PyObject *launch_exit_hook = NULL; - PyObject *compiled_kernel = NULL; - - {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} - if (!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel, {', '.join(f"&_arg{i}" for i, ty in signature.items())})) {{ - return NULL; - }} - - if (launch_enter_hook != Py_None) {{ - PyObject_CallObject(launch_enter_hook, args); - }} - - // raise exception asap - {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; - _launch(gridX, gridY, gridZ, num_warps, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}" for i, ty in signature.items())}); - if (launch_exit_hook != Py_None) {{ - PyObject_CallObject(launch_exit_hook, args); - }} - if (PyErr_Occurred()) {{ - return NULL; - }} - - // return None - Py_INCREF(Py_None); 
- return Py_None; - }} - - static PyMethodDef ModuleMethods[] = {{ - {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, - {{NULL, NULL, 0, NULL}} // sentinel - }}; - - static struct PyModuleDef ModuleDef = {{ - PyModuleDef_HEAD_INIT, - \"__triton_launcher\", - NULL, //documentation - -1, //size - ModuleMethods - }}; - - PyMODINIT_FUNC PyInit___triton_launcher(void) {{ - PyObject *m = PyModule_Create(&ModuleDef); - if(m == NULL) {{ - return NULL; - }} - PyModule_AddFunctions(m, ModuleMethods); - return m; - }} - """ - else: - src = f""" -#include \"cuda.h\" -#include <stdbool.h> -#include <Python.h> - -static inline void gpuAssert(CUresult code, const char *file, int line) -{{ - if (code != CUDA_SUCCESS) - {{ - const char* prefix = "Triton Error [CUDA]: "; - const char* str; - cuGetErrorString(code, &str); - char err[1024] = {{0}}; - strcat(err, prefix); - strcat(err, str); - PyErr_SetString(PyExc_RuntimeError, err); - }} -}} - -#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} - -static void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{ - void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; - if(gridX*gridY*gridZ > 0){{ - CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0)); - }} -}} - -typedef struct _DevicePtrInfo {{ - CUdeviceptr dev_ptr; - bool valid; -}} DevicePtrInfo; - -static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ - DevicePtrInfo ptr_info; - ptr_info.dev_ptr = 0; - ptr_info.valid = true; - if (PyLong_Check(obj)) {{ - ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj); - return ptr_info; - }} - if (obj == Py_None) {{ - // valid nullptr - return ptr_info; - }} - PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); - if(ptr){{ - PyObject *empty_tuple = PyTuple_New(0); - PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); - Py_DECREF(empty_tuple); - Py_DECREF(ptr); - if (!PyLong_Check(ret)) {{ - PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); - ptr_info.valid = false; - return ptr_info; - }} - ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret); - if(!ptr_info.dev_ptr) - return ptr_info; - uint64_t dev_ptr; - int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); - if (status == CUDA_ERROR_INVALID_VALUE) {{ - PyErr_Format(PyExc_ValueError, - "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); - ptr_info.valid = false; - }} - ptr_info.dev_ptr = dev_ptr; - Py_DECREF(ret); // Thanks ChatGPT!
- return ptr_info; - }} - PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); - return ptr_info; -}} - -static PyObject* launch(PyObject* self, PyObject* args) {{ - int gridX, gridY, gridZ; - uint64_t _stream; - uint64_t _function; - int num_warps; - int shared_memory; - PyObject *launch_enter_hook = NULL; - PyObject *launch_exit_hook = NULL; - PyObject *compiled_kernel = NULL; - {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} - if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel, {', '.join(f"&_arg{i}" for i, ty in signature.items())})) {{ - return NULL; - }} - - if (launch_enter_hook != Py_None) {{ - PyObject_CallObject(launch_enter_hook, args); - }} - - - // raise exception asap - {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; - _launch(gridX, gridY, gridZ, num_warps, shared_memory, (CUstream)_stream, (CUfunction)_function, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())}); - - if (launch_exit_hook != Py_None) {{ - PyObject_CallObject(launch_exit_hook, args); - }} - - if(PyErr_Occurred()) {{ - return NULL; - }} - // return None - Py_INCREF(Py_None); - return Py_None; -}} - -static PyMethodDef ModuleMethods[] = {{ - {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, - {{NULL, NULL, 0, NULL}} // sentinel -}}; - -static struct PyModuleDef ModuleDef = {{ - PyModuleDef_HEAD_INIT, - \"__triton_launcher\", - NULL, //documentation - -1, //size - ModuleMethods -}}; - -PyMODINIT_FUNC PyInit___triton_launcher(void) {{ - PyObject *m = PyModule_Create(&ModuleDef); - if(m == NULL) {{ - return NULL; - }} - PyModule_AddFunctions(m, ModuleMethods); - return m; -}} -""" - return src diff --git a/python/triton/debugger/__init__.py b/python/triton/debugger/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/triton/debugger/core.py b/python/triton/debugger/core.py deleted file mode 100644 index 82f3f43a25a0..000000000000 --- a/python/triton/debugger/core.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Tuple - -import dataclasses - - -@dataclasses.dataclass -class ExecutionContext: - program_id: Tuple[int] - program_size: Tuple[int] diff --git a/python/triton/debugger/debugger.py b/python/triton/debugger/debugger.py deleted file mode 100644 index 5c5b97292fac..000000000000 --- a/python/triton/debugger/debugger.py +++ /dev/null @@ -1,170 +0,0 @@ -import itertools -import random -from typing import Tuple - -import triton -import triton.language as tl -from .core import ExecutionContext -from .memory_map import MemoryMap -from .tl_lang import (TritonLangProxy, WrappedTensor, _primitive_to_tensor, - debugger_constexpr) -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch -tl_method_backup = {} - - -def get_proxy_method(proxy, name): - method = getattr(proxy, name) - - def fun(*args, **kwarg): - return method(*args, **kwarg) - - return fun - - -def attach_triton(module, proxy): - method_list = [func for func in dir(TritonLangProxy) if func[0] != "_"] - for name in method_list: - if hasattr(module, name): - attr = getattr(module, name) - tl_method_backup[name] = attr - if callable(attr): - setattr(module, name, get_proxy_method(proxy, name)) - else: - 
setattr(module, name, getattr(proxy, name)) - - -def detach_triton(module): - for name, method in tl_method_backup.items(): - setattr(module, name, method) - - -def program_ids_from_grid(grid: Tuple[int, ...]) -> Tuple[int, ...]: - # reverse the grid dimensions and generate the range for each dimension - reversed_grid = reversed(grid) - ranges_for_each_dimension = [range(dim) for dim in reversed_grid] - - # gen all combinations - index_combinations = list(itertools.product(*ranges_for_each_dimension)) - random.shuffle(index_combinations) - - for index_combination in index_combinations: - yield index_combination - - -class DebuggerFunction: - def __init__(self, func, grid=(1,)): - self.func = func - self.grid = grid - - def _is_constexpr(self, name): - return name in self.func.__annotations__ and self.func.__annotations__[name] is triton.language.core.constexpr - - def _get_constexpr(self): - result = [] - for name, annotation in self.func.__annotations__.items(): - if annotation is triton.language.core.constexpr: - result.append(name) - return result - - def _assert_constexpr(self, **kwargs): - constexp = self._get_constexpr() - missing = [i for i in constexp if i not in kwargs.keys()] - assert len(missing) == 0, f"You must specify constexpr {missing}" - - def _get_grid(self, **kwargs): - if callable(self.grid): - return self.grid(kwargs) - else: - return self.grid - - def __call__(self, *args, **kwargs): - self._assert_constexpr(**kwargs) - - memory = MemoryMap() - - def convert_arg(v): - name, arg = v - if torch.is_tensor(arg): - ptr = memory.add_tensor(arg) - return WrappedTensor(torch.tensor([ptr], dtype=torch.int64, device="cuda")) - if self._is_constexpr(name): - return debugger_constexpr(arg) - return WrappedTensor(_primitive_to_tensor(arg)) - - new_args = tuple(map(convert_arg, zip(self.func.__code__.co_varnames, args))) - new_kwargs = {k: convert_arg((k, v)) for (k, v) in kwargs.items() if k not in ["num_warps", "num_stages"]} - - grid = self._get_grid(**kwargs) - for program_id in program_ids_from_grid(grid): - proxy = TritonLangProxy(memory, ExecutionContext(program_id, grid)) - attach_triton(tl, proxy) - self.func(*new_args, **new_kwargs) - detach_triton(tl) - - -class GridSelector: - """ - Entry point of the debugger - """ - - def __init__(self, func): - version = torch.__version__ - assert version[0] == "2", f"Triton Debugger only supports torch >= 2.0, using {version}" - self.func = func - - def __getitem__(self, grid): - return DebuggerFunction(self.func, grid) - - def __call__(self, *args, **kwargs): - return DebuggerFunction(self.func)(*args, **kwargs) - - -class AutotuneGridSelector: - def __init__(self, func, autotune_params): - self.func = func - self.autotune_params = autotune_params - - def __getitem__(self, grid): - return AutotuneRunner(self.func, self.autotune_params, grid) - - def __call__(self, *args, **kwargs): - return AutotuneRunner(self.func, self.autotune_params)(*args, **kwargs) - - -class AutotuneRunner: - def __init__(self, func, autotune_params, grid=None): - self.func = func - self.autotune_params = autotune_params - self.grid = grid - - def __call__(self, *args, **kwargs): - assert len(self.autotune_params["configs"]) >= 1 - - for config in self.autotune_params["configs"][1:]: - - def convert_arg(v): - if torch.is_tensor(v): - return torch.clone(v) - return v - - new_args = tuple(map(convert_arg, args)) - new_kwargs = {k: convert_arg(v) for k, v in kwargs.items()} - if self.grid: - self.func[self.grid](*new_args, **new_kwargs, **config.kwargs) - else: 
- self.func(*new_args, **new_kwargs, **config.kwargs) - - main_config = self.autotune_params["configs"][0] - if self.grid: - self.func[self.grid](*args, **kwargs, **main_config.kwargs) - else: - self.func(*args, **kwargs, **main_config.kwargs) - - -def triton_debug_autotune(**kwars): - def wrapper(func): - return AutotuneGridSelector(func, kwars) - - return wrapper diff --git a/python/triton/debugger/memory_map.py b/python/triton/debugger/memory_map.py deleted file mode 100644 index edf4c3f77922..000000000000 --- a/python/triton/debugger/memory_map.py +++ /dev/null @@ -1,100 +0,0 @@ -import dataclasses - -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch - - -@dataclasses.dataclass -class RegisteredStorage: - storage: torch.Storage - dtype: torch.dtype - size: int - ptr: int - - @property - def end_ptr(self) -> int: - return self.ptr + self.size - - @property - def access_tensor(self) -> torch.Tensor: - return torch.tensor(self.storage, dtype=self.dtype, device=self.storage.device) - - def ensure_immutable(self): - assert self.storage.data_ptr() == self.ptr and self.storage.size() == self.size - - -class MemoryMap: - storages: [RegisteredStorage] - - def __init__(self): - self.storages = [] - - def _get_registered_storage(self, pointer: torch.Tensor): - max_pointer = torch.max(pointer).item() - min_pointer = torch.min(pointer).item() - - registered_storage = next( - filter( - lambda registered: min_pointer >= registered.ptr and max_pointer < registered.end_ptr, self.storages - ), - None, - ) - if registered_storage is None: - raise Exception("Storage not found or pointers spanning multiple tensors") - registered_storage.ensure_immutable() - return registered_storage - - def add_tensor(self, t: torch.Tensor): - storage = t.untyped_storage() - self.storages.append(RegisteredStorage(storage, t.dtype, storage.size(), storage.data_ptr())) - return t.data_ptr() - - def load( - self, - pointer: torch.Tensor, - mask: torch.Tensor = None, - other=0.0, - ): - assert pointer.is_cuda - assert 0 < pointer.dim() < 3 - assert pointer.dtype == torch.int64 - - if mask is None: - mask = torch.ones_like(pointer).bool() - assert mask.is_cuda - assert 0 < mask.dim() < 3 - assert mask.dtype == torch.bool - mask = mask.expand(pointer.size()) - - if torch.all(~mask): - # Todo: The type is wrong here, we can't determine the correct type - return torch.full_like(pointer, fill_value=other, dtype=torch.float16, device="cuda") - - registered_storage = self._get_registered_storage(pointer[mask]) - access_tensor = registered_storage.access_tensor - - index_tensor = pointer - registered_storage.ptr - - block = torch.full_like(pointer, fill_value=other, dtype=access_tensor.dtype, device="cuda") - block[mask] = access_tensor[index_tensor[mask]] - return block - - def store(self, pointer: torch.Tensor, value: torch.Tensor, mask=None): - assert 0 < pointer.dim() < 3 - assert pointer.dtype == torch.int64 - - if mask is None: - mask = torch.ones_like(pointer).bool() - assert 0 < mask.dim() < 3 - assert mask.dtype == torch.bool - mask = mask.expand(pointer.size()) - - if torch.all(~mask): - return - - registered_storage = self._get_registered_storage(pointer[mask]) - access_tensor = registered_storage.access_tensor - - index_tensor = pointer - registered_storage.ptr - access_tensor[index_tensor[mask]] = value[mask].to(access_tensor.dtype) diff --git a/python/triton/debugger/tl_lang.py b/python/triton/debugger/tl_lang.py deleted file mode 100644 index 6364b77a3803..000000000000 --- 
a/python/triton/debugger/tl_lang.py +++ /dev/null @@ -1,621 +0,0 @@ -import triton -from .core import ExecutionContext -from .memory_map import MemoryMap -from triton.debugger import torch_wrapper - -torch = torch_wrapper.torch - - -def _primitive_to_tensor(x): - """ - Converts various Python primitive data types to PyTorch tensor. - """ - tensor_args = {"device": "cuda"} - if isinstance(x, bool): - return torch.tensor([x], dtype=torch.bool, **tensor_args) - elif isinstance(x, int): - if -(2**31) <= x < 2**31: - return torch.tensor([x], dtype=torch.int32, **tensor_args) - elif -(2**63) <= x < 2**63: - return torch.tensor([x], dtype=torch.int64, **tensor_args) - else: - raise RuntimeError(f"Nonrepresentable integer {x}.") - elif isinstance(x, float): - return torch.tensor([x], dtype=torch.float32, **tensor_args) - elif torch.is_tensor(x): - return x - elif isinstance(x, WrappedTensor): - return x - elif isinstance(x, debugger_constexpr): - if x.value is None: - return None - return _primitive_to_tensor(x.value) - elif x is None: - return None - assert False, f"cannot convert {x} of type {type(x)} to tensor" - - -def _infer_tensor(func): - """ - A decorator function to harmonize function args: - - converts primitives to PyTorch tensors - - wraps PyTorch tensors with WrappedTensors - """ - def wrapper(*args): - new_args = tuple(map(lambda v: _primitive_to_tensor(v), args)) - new_args = tuple(map(lambda v: WrappedTensor(v) if torch.is_tensor(v) else v, new_args)) - - return func(*new_args) - - return wrapper - - -def _tensor_operation(func): - """ - A decorator function to unwrap WrappedTensors and debugger_constexpr before calling the function. - Can be combined with _infer_tensor decorator to harmonize args (everything to torch tensor). - """ - def wrapper(*args, **kwargs): - for arg in args: - assert not torch.is_tensor(arg), "unexpected tensor argument" - - def unwrap_tensor(v): - if isinstance(v, WrappedTensor): - return v.tensor - if isinstance(v, debugger_constexpr): - return v.value - return v - - new_args = tuple(map(unwrap_tensor, args)) - new_kwargs = {k: unwrap_tensor(v) for k, v in kwargs.items()} - - result = func(args[0], *new_args[1:], **new_kwargs) - return WrappedTensor(result) if torch.is_tensor(result) else result - - return wrapper - - -class debugger_constexpr: - def __init__(self, value): - if isinstance(value, debugger_constexpr): - self.value = value.value - else: - self.value = value - - def __str__(self) -> str: - return "debugger_constexpr(" + str(self.value) + ")" - - def __index__(self) -> int: - return self.value - - def __bool__(self): - return bool(self.value) - - def __ge__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value >= other - - def __gt__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value > other - - def __le__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value <= other - - def __lt__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value < other - - def __eq__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value == other - - def __or__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value | other - - def __ror__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value | 
other - - def __and__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value & other - - def __rand__(self, other): - other = other.value if isinstance(other, debugger_constexpr) else other - return self.value & other - - def to(self, dtype, bitcast=False, _builder=None): - if dtype in [torch.int64]: - ret_ty = int - elif dtype == torch.bool: - ret_ty = bool - elif dtype in [torch.float64]: - ret_ty = float - else: - raise ValueError("dtype not supported in debugger") - return debugger_constexpr(ret_ty(self.value)) - - -class WrappedTensor: - def __init__(self, tensor): - self.tensor = tensor - - def __index__(self) -> int: - return self.tensor.item() - - def __str__(self) -> str: - return "wrapped_" + str(self.tensor) - - def __bool__(self) -> bool: - return torch.all(self.tensor == True).item() # noqa: E712 - - @property - def dtype(self): - return self.tensor.dtype - - @_infer_tensor - @_tensor_operation - def __add__(self, other): - return torch.add(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __radd__(self, other): - return self.__add__(other) - - @_infer_tensor - @_tensor_operation - def __sub__(self, other): - return torch.sub(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rsub__(self, other): - return torch.sub(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __mul__(self, other): - return torch.mul(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rmul__(self, other): - return self.__mul__(other) - - @_infer_tensor - @_tensor_operation - def __truediv__(self, other): - return torch.div(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rtruediv__(self, other): - return torch.div(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __floordiv__(self, other): - return torch.floor_divide(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rfloordiv__(self, other): - return torch.floor_divide(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __mod__(self, other): - return torch.remainder(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rmod__(self, other): - return torch.remainder(other, self.tensor) - - @_infer_tensor - @_tensor_operation - def __neg__(self): - return -self.tensor - - @_infer_tensor - @_tensor_operation - def __invert__(self): - return ~self.tensor - - @_infer_tensor - @_tensor_operation - def __and__(self, other): - return torch.bitwise_and(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __or__(self, other): - return torch.bitwise_or(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __xor__(self, other): - return torch.bitwise_xor(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __lshift__(self, other): - return torch.bitwise_left_shift(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __rshift__(self, other): - return torch.bitwise_right_shift(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __gt__(self, other): - return self.tensor > other - - @_infer_tensor - @_tensor_operation - def __rgt__(self, other): - return other > self.tensor - - @_infer_tensor - @_tensor_operation - def __ge__(self, other): - return self.tensor >= other - - @_infer_tensor - @_tensor_operation - def __rge__(self, other): - return other >= self.tensor - - @_infer_tensor - @_tensor_operation - def __lt__(self, other): - return self.tensor < other - - @_infer_tensor - @_tensor_operation - def 
__rlt__(self, other): - return other < self.tensor - - @_infer_tensor - @_tensor_operation - def __le__(self, other): - return self.tensor <= other - - @_infer_tensor - @_tensor_operation - def __rle__(self, other): - return other <= self.tensor - - @_infer_tensor - @_tensor_operation - def __eq__(self, other): - return torch.equal(self.tensor, other) - - @_infer_tensor - @_tensor_operation - def __ne__(self, other): - return not torch.equal(self.tensor, other) - - @_tensor_operation - def __getitem__(self, slices): - return self.tensor.__getitem__(slices) - # if isinstance(slices, slice): - # slices = [slices] - # src_shape = self.shape - # dst_shape = [] - # curr = 0 - # for sl in slices: - # if isinstance(sl, constexpr) and sl.value is None: - # dst_shape.append(1) - # elif sl == slice(None, None, None): - # dst_shape.append(src_shape[curr].value) - # curr += 1 - # ret = torch.reshape(self.tensor, dst_shape, ) - # return ret - - @_tensor_operation - def to(self, dtype, bitcast=False): - return self.tensor.to(dtype) - # if isinstance(bitcast, constexpr): - # bitcast = bitcast.value - # if bitcast: - # return semantic.bitcast(self, dtype, ) - # return semantic.cast(self, dtype, ) - - -def _constexpr_to_value(v): - if isinstance(v, debugger_constexpr): - return v.value - return v - - -class TritonLangProxy: - _memory_map: MemoryMap - _context: ExecutionContext - - def __init__(self, memory_map: MemoryMap, context: ExecutionContext): - self._memory_map = memory_map - self._context = context - - # Types - # Removed void, int1, float8, uint16, uint32, uint64, pi32_t - - # constexpr = debugger_constexpr - - # Program functions - - @_tensor_operation - def load( - self, - pointer: torch.Tensor, - mask: torch.Tensor = None, - other=0.0, - cache_modifier="", - eviction_policy="", - volatile=False, - ): - return self._memory_map.load(pointer, mask, other) - - @_tensor_operation - def store(self, pointer: torch.Tensor, value: torch.Tensor, mask=None): - return self._memory_map.store(pointer, value, mask) - - @_tensor_operation - def program_id(self, axis): - assert axis < len(self._context.program_id) - return torch.tensor([self._context.program_id[axis]], dtype=torch.int32, device="cuda") - - @_tensor_operation - def num_programs(self, axis): - assert axis < len(self._context.program_size) - return torch.tensor([self._context.program_size[axis]], dtype=torch.int32, device="cuda") - - @_tensor_operation - def arange(self, start, end): - return torch.arange(start=start, end=end, dtype=torch.int32, device="cuda") - - @_tensor_operation - def zeros(self, shape, dtype): - for i, d in enumerate(shape): - if not isinstance(d, debugger_constexpr): - raise TypeError(f"Shape element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - shape = [x.value for x in shape] - if isinstance(dtype, triton.language.core.dtype): - if dtype.is_fp32(): - dtype = torch.float32 - elif dtype.is_fp16(): - dtype = torch.float16 - elif dtype.is_bf16(): - dtype = torch.bfloat16 - elif dtype.is_int32(): - dtype = torch.int32 - elif dtype.is_int16(): - dtype = torch.int16 - elif dtype.is_int8(): - dtype = torch.int8 - else: - raise TypeError(f"Unsupported dtype {dtype}") - return torch.zeros(size=shape, dtype=dtype, device="cuda") - - @_tensor_operation - def dequantize(self, input, scale, shift, nbit, dst_ty=torch.float16): - raise NotImplementedError() - - @_tensor_operation - def broadcast(self, input, 
other): - raise NotImplementedError() - - @_tensor_operation - def broadcast_to(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def cat(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def reshape(self, input, shape): - raise NotImplementedError() - - @_tensor_operation - def dot(self, input, other, trans_a=False, trans_b=False, allow_tf32=True): - assert input.dtype == other.dtype - if trans_a: - input = input.T - if trans_b: - other = other.T - return torch.matmul(input=input, other=other) - - @_tensor_operation - def atomic_cas(self, pointer, cmp, val): - stored = self._memory_map.load(pointer, None, 0.0) - if not isinstance(cmp, torch.Tensor): - cmp = torch.tensor([cmp], dtype=stored.dtype, device="cuda") - if not isinstance(val, torch.Tensor): - val = torch.tensor([val], dtype=stored.dtype, device="cuda") - if stored == cmp: - self._memory_map.store(pointer, val, None) - return stored - - @_tensor_operation - def atomic_xchg(self, pointer, val, mask=None): - if isinstance(val, int): - val = torch.tensor([val], dtype=torch.int32, device="cuda") - stored = self._memory_map.load(pointer, mask, 0.0) - self._memory_map.store(pointer, val, mask) - return stored - - @_tensor_operation - def atomic_add(self, pointer, val, mask=None): - # arbitrary other value as it will masked during storing - stored = self._memory_map.load(pointer, mask, 0.0) - result = stored + val - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_max(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0.0) - result = torch.maximum(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_min(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0.0) - result = torch.minimum(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_and(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_and(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_or(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_or(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def atomic_xor(self, pointer, val, mask=None): - stored = self._memory_map.load(pointer, mask, 0) - result = torch.bitwise_xor(stored, val) - self._memory_map.store(pointer, result, mask) - return stored - - @_tensor_operation - def where(self, condition, x, y): - condition = _primitive_to_tensor(condition) - x = _primitive_to_tensor(x) - y = _primitive_to_tensor(y) - return torch.where(condition, x, y) - - @_tensor_operation - def umulhi(self, x, y): - raise NotImplementedError() - - @_tensor_operation - def fdiv(self, x, y, ieee_rounding=False): - raise NotImplementedError() - - @_tensor_operation - def exp(self, x): - return torch.exp(x) - - @_tensor_operation - def log(self, x): - return torch.log(x) - - @_tensor_operation - def cos(self, x): - return torch.cos(x) - - @_tensor_operation - def sin(self, x): - return torch.sin(x) - - @_tensor_operation - def sqrt(self, x): - return torch.sqrt(x) - - @_tensor_operation - def globaltimer(self): - raise NotImplementedError() - - @_tensor_operation - def clock(self): - raise NotImplementedError() - - @_tensor_operation - def debug_barrier(self): - 
raise NotImplementedError() - - @_tensor_operation - def multiple_of(self, input, values): - return input - - @_tensor_operation - def max_contiguous(self, input, values): - return input - - @_tensor_operation - def abs(self, x): - return torch.abs(x) - - @_tensor_operation - def cdiv(self, x, div): - return (x + div - 1) // div - - @_tensor_operation - def minimum(self, x, y): - if isinstance(x, int): - x = torch.tensor(x, device="cuda") - if isinstance(y, int): - y = torch.tensor(y, device="cuda") - return torch.minimum(x, y) - - @_tensor_operation - def maximum(self, x, y): - return torch.maximum(x, y) - - @_tensor_operation - def sigmoid(self, x): - raise NotImplementedError() - - @_tensor_operation - def softmax(self, x, ieee_rounding=False): - raise NotImplementedError() - - @_tensor_operation - def ravel(self, x): - raise NotImplementedError() - - @_tensor_operation - def swizzle2d(self, i, j, size_i, size_j, size_g): - raise NotImplementedError() - - @_tensor_operation - def zeros_like(self, input): - raise NotImplementedError() - - @_tensor_operation - def max(self, input, axis=None): - if axis is None: - return torch.max(input) - return torch.max(input, dim=axis).values - - @_tensor_operation - def argmax(self, input, axis): - raise NotImplementedError() - - @_tensor_operation - def min(self, input, axis=None): - if axis is None: - return torch.min(input) - return torch.min(input, dim=axis).values - - @_tensor_operation - def argmin(self, input, axis): - raise NotImplementedError() - - @_tensor_operation - def sum(self, input, axis=None): - if axis is None: - return torch.sum(input) - return torch.sum(input, dim=axis) - - @_tensor_operation - def xor_sum(self, input, axis): - raise NotImplementedError() diff --git a/python/triton/debugger/torch_wrapper.py b/python/triton/debugger/torch_wrapper.py deleted file mode 100644 index 44aa17eb1355..000000000000 --- a/python/triton/debugger/torch_wrapper.py +++ /dev/null @@ -1,18 +0,0 @@ -try: - import torch as _torch -except ImportError: - _torch = None - - -class TorchWrapper: - """ - Helps in making torch an optional dependency - """ - - def __getattr__(self, name): - if _torch is None: - raise ImportError("Triton requires PyTorch to be installed") - return getattr(_torch, name) - - -torch = TorchWrapper() diff --git a/python/triton/language/__init__.py b/python/triton/language/__init__.py deleted file mode 100644 index 7485f374b9e9..000000000000 --- a/python/triton/language/__init__.py +++ /dev/null @@ -1,201 +0,0 @@ -"""isort:skip_file""" -# Import order is significant here. - -from . import math -from . 
import extra -from .standard import ( - cdiv, - sigmoid, - softmax, - ravel, - swizzle2d, - zeros, - zeros_like, -) -from .core import ( - abs, - advance, - arange, - argmin, - argmax, - atomic_add, - atomic_and, - atomic_cas, - atomic_max, - atomic_min, - atomic_or, - atomic_xchg, - atomic_xor, - bfloat16, - block_type, - broadcast, - broadcast_to, - cat, - constexpr, - cos, - debug_barrier, - device_assert, - device_print, - dot, - dtype, - exp, - expand_dims, - full, - fdiv, - float16, - float32, - float64, - float8e4, - float8e5, - function_type, - int1, - int16, - int32, - int64, - int8, - load, - log, - make_block_ptr, - max, - max_contiguous, - maximum, - min, - minimum, - multiple_of, - num_programs, - pi32_t, - pointer_type, - program_id, - reduce, - reshape, - sin, - sqrt, - static_assert, - static_print, - store, - sum, - static_range, - tensor, - trans, - triton, - uint16, - uint32, - uint64, - uint8, - umulhi, - view, - void, - where, - xor_sum, -) -from .random import ( - pair_uniform_to_normal, - philox, - philox_impl, - rand, - rand4x, - randint, - randint4x, - randn, - randn4x, - uint32_to_uniform_float, -) - - -__all__ = [ - "abs", - "advance", - "arange", - "argmin", - "argmax", - "atomic_add", - "atomic_and", - "atomic_cas", - "atomic_max", - "atomic_min", - "atomic_or", - "atomic_xchg", - "atomic_xor", - "bfloat16", - "block_type", - "broadcast", - "broadcast_to", - "builtin", - "cat", - "cdiv", - "constexpr", - "cos", - "debug_barrier", - "device_assert", - "device_print", - "dot", - "dtype", - "exp", - "expand_dims", - "extra", - "fdiv", - "float16", - "float32", - "float64", - "float8e4", - "float8e5", - "full", - "function_type", - "int1", - "int16", - "int32", - "int64", - "int8", - "ir", - "math", - "load", - "log", - "make_block_ptr", - "max", - "max_contiguous", - "maximum", - "min", - "minimum", - "multiple_of", - "num_programs", - "pair_uniform_to_normal", - "philox", - "philox_impl", - "pi32_t", - "pointer_type", - "program_id", - "rand", - "rand4x", - "randint", - "randint4x", - "randn", - "randn4x", - "ravel", - "reduce", - "reshape", - "sigmoid", - "sin", - "softmax", - "sqrt", - "static_range", - "static_assert", - "static_print", - "store", - "sum", - "swizzle2d", - "tensor", - "trans", - "triton", - "uint16", - "uint32", - "uint32_to_uniform_float", - "uint64", - "uint8", - "umulhi", - "view", - "void", - "where", - "xor_sum", - "zeros", - "zeros_like", -] diff --git a/python/triton/language/core.py b/python/triton/language/core.py deleted file mode 100644 index a3c4609961f0..000000000000 --- a/python/triton/language/core.py +++ /dev/null @@ -1,1702 +0,0 @@ -from __future__ import annotations - -from contextlib import contextmanager -from enum import Enum -from functools import wraps -from typing import Callable, List, Sequence, TypeVar - -import triton -from . import semantic -from triton._C.libtriton.triton import ir - -T = TypeVar('T') - -TRITON_MAX_TENSOR_NUMEL = 131072 - -TRITON_BUILTIN = "__triton_builtin__" - - -def builtin(fn: T) -> T: - """Mark a function as a builtin.""" - assert callable(fn) - - @wraps(fn) - def wrapper(*args, **kwargs): - if "_builder" not in kwargs or kwargs["_builder"] is None: - raise ValueError( - "Did you forget to add @triton.jit ? 
" - "(`_builder` argument must be provided outside of JIT functions.)" - ) - return fn(*args, **kwargs) - - setattr(wrapper, TRITON_BUILTIN, True) - - return wrapper - - -def is_builtin(fn) -> bool: - """Is this a registered triton builtin function?""" - return getattr(fn, TRITON_BUILTIN, False) - - -def _to_tensor(x, builder): - if isinstance(x, bool): - return tensor(builder.get_int1(x), int1) - # Note: compile-time const integers are represented by unsigned values - elif isinstance(x, int): - if -2**31 <= x < 2**31: - return tensor(builder.get_int32(x), int32) - elif 2**31 <= x < 2**32: - return tensor(builder.get_int32(x), uint32) - elif -2**63 <= x < 2**63: - return tensor(builder.get_int64(x), int64) - elif 2**63 <= x < 2**64: - return tensor(builder.get_int64(x), uint64) - else: - raise RuntimeError(f'Nonrepresentable integer {x}.') - elif isinstance(x, float): - min_float32 = 2 ** -126 - max_float32 = (2 - 2**-23) * 2**127 - abs_x = __builtins__['abs'](x) - if abs_x == float("inf") or\ - abs_x == 0.0 or \ - x != x or \ - min_float32 <= abs_x <= max_float32: - return tensor(builder.get_fp32(x), float32) - else: - return tensor(builder.get_fp64(x), float64) - - elif isinstance(x, constexpr): - return _to_tensor(x.value, builder) - elif isinstance(x, tensor): - return x - assert False, f"cannot convert {x} of type {type(x)} to tensor" - - -class dtype: - SINT_TYPES = ['int8', 'int16', 'int32', 'int64'] - UINT_TYPES = ['int1', 'uint8', 'uint16', 'uint32', 'uint64'] - FP_TYPES = ['fp8e4', 'fp8e5', 'fp16', 'bf16', 'fp32', 'fp64'] - STANDARD_FP_TYPES = ['fp16', 'bf16', 'fp32', 'fp64'] - OTHER_TYPES = ['void'] - - class SIGNEDNESS(Enum): - SIGNED = 0 - UNSIGNED = 1 - - def __init__(self, name): - self.name = name - assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name - if name in dtype.SINT_TYPES: - self.int_signedness = dtype.SIGNEDNESS.SIGNED - self.int_bitwidth = int(name.split('int')[-1]) - self.primitive_bitwidth = self.int_bitwidth - elif name in dtype.UINT_TYPES: - self.int_signedness = dtype.SIGNEDNESS.UNSIGNED - self.int_bitwidth = int(name.split('int')[-1]) - self.primitive_bitwidth = self.int_bitwidth - elif name in dtype.FP_TYPES: - if name == 'fp8e4': - self.fp_mantissa_width = 3 - self.primitive_bitwidth = 8 - elif name == 'fp8e5': - self.fp_mantissa_width = 2 - self.primitive_bitwidth = 8 - elif name == 'fp16': - self.fp_mantissa_width = 10 - self.primitive_bitwidth = 16 - elif name == 'bf16': - self.fp_mantissa_width = 7 - self.primitive_bitwidth = 16 - elif name == 'fp32': - self.fp_mantissa_width = 23 - self.primitive_bitwidth = 32 - elif name == 'fp64': - self.fp_mantissa_width = 53 - self.primitive_bitwidth = 64 - else: - raise RuntimeError(f'Unsupported floating-point type {name}') - elif name == 'void': - self.primitive_bitwidth = 0 - - def is_fp8(self): - return 'fp8' in self.name - - def is_fp16(self): - return self.name == 'fp16' - - def is_bf16(self): - return self.name == 'bf16' - - def is_fp32(self): - return self.name == 'fp32' - - def is_fp64(self): - return self.name == 'fp64' - - def is_int1(self): - return self.name == 'int1' - - def is_int8(self): - return self.name == 'int8' - - def is_int16(self): - return self.name == 'int16' - - def is_int32(self): - return self.name == 'int32' - - def is_int64(self): - return self.name == 'int64' - - def is_uint8(self): - return self.name == 'uint8' - - def is_uint16(self): - return self.name == 'uint16' - - def is_uint32(self): - return self.name == 'uint32' - - def 
is_uint64(self): - return self.name == 'uint64' - - def is_floating(self): - return self.name in dtype.FP_TYPES - - def is_standard_floating(self): - return self.name in dtype.STANDARD_FP_TYPES - - def is_int_signed(self): - return self.name in dtype.SINT_TYPES - - def is_int_unsigned(self): - return self.name in dtype.UINT_TYPES - - def is_int(self): - return self.name in dtype.SINT_TYPES + dtype.UINT_TYPES - - def is_bool(self): - return self.is_int1() - - @staticmethod - def is_void(): - raise RuntimeError("Not implemented") - - @staticmethod - def is_block(): - return False - - @staticmethod - def is_ptr(): - return False - - def __eq__(self, other: dtype): - if not isinstance(other, dtype): - return False - return self.name == other.name - - def __ne__(self, other: dtype): - return not self.__eq__(other) - - def __hash__(self): - return hash((self.name,)) - - @property - def scalar(self): - return self - - def to_ir(self, builder: ir.builder) -> ir.type: - if self.name == 'void': - return builder.get_void_ty() - elif self.name == 'int1': - return builder.get_int1_ty() - elif self.name in ('int8', 'uint8'): - return builder.get_int8_ty() - elif self.name in ('int16', 'uint16'): - return builder.get_int16_ty() - elif self.name in ('int32', 'uint32'): - return builder.get_int32_ty() - elif self.name in ('int64', 'uint64'): - return builder.get_int64_ty() - elif self.name == 'fp8e5': - return builder.get_fp8e5_ty() - elif self.name == 'fp8e4': - return builder.get_fp8e4_ty() - elif self.name == 'fp16': - return builder.get_half_ty() - elif self.name == 'bf16': - return builder.get_bf16_ty() - elif self.name == 'fp32': - return builder.get_float_ty() - elif self.name == 'fp64': - return builder.get_double_ty() - raise ValueError(f'fail to convert {self} to ir type') - - def __str__(self): - return self.name - - @property - def cache_key_part(self) -> str: - """See cache_key_part() in triton.cc.""" - return self.name - - def __repr__(self): - return f'triton.language.{self.name}' - - -class pointer_type(dtype): - def __init__(self, element_ty: dtype, address_space: int = 1): - if not isinstance(element_ty, dtype): - raise TypeError('element_ty is a {type(element_ty).__name__}.') - self.element_ty = element_ty - self.address_space = address_space - - self.name = self.__str__() - - def to_ir(self, builder: ir.builder) -> ir.pointer_type: - return builder.get_ptr_ty(self.element_ty.to_ir(builder), 1) - - def __str__(self): - return f'pointer<{self.element_ty}>' - - def __repr__(self): - return self.__str__() - - def is_ptr(self): - return True - - def __eq__(self, other: pointer_type) -> bool: - if not isinstance(other, pointer_type): - return False - return self.element_ty == other.element_ty and self.address_space == other.address_space - - def __ne__(self, other: pointer_type) -> bool: - return not self.__eq__(other) - - @property - def scalar(self): - return self - - -class block_type(dtype): - def __init__(self, element_ty: dtype, shape: List): - self.element_ty = element_ty - - # Note that block_type's shape is a list of int - # while tensor's shape is a list of constexpr. - - # shape can be empty ([]) when an input is a 0D tensor. 
- if not shape: - raise TypeError('0d block_type is forbidden') - if isinstance(shape[0], constexpr): - shape = [s.value for s in shape] - - self.shape = shape - self.numel = 1 - for s in self.shape: - self.numel *= s - if self.numel > TRITON_MAX_TENSOR_NUMEL: - raise ValueError(f"numel ({self.numel}) exceeds triton maximum tensor numel ({TRITON_MAX_TENSOR_NUMEL})") - - self.name = self.__str__() - - def to_ir(self, builder: ir.builder) -> ir.block_type: - return builder.get_block_ty(self.element_ty.to_ir(builder), self.shape) - - def __str__(self): - return f'<{self.shape}, {self.element_ty}>' - - def __repr__(self): - return self.__str__() - - def is_block(self): - return True - - def get_block_shapes(self) -> List[int]: - return self.shape - - def __eq__(self, other: block_type) -> bool: - if not isinstance(other, block_type): - return False - return self.element_ty == other.element_ty and self.shape == other.shape - - def __ne__(self, other: block_type) -> bool: - return not self.__eq__(other) - - @property - def scalar(self): - return self.element_ty - - -class function_type(dtype): - def __init__(self, ret_types: List[dtype], param_types: List[dtype]) -> None: - self.ret_types = ret_types - self.param_types = param_types - - def __str__(self): - return f'fn ({self.param_types}) -> {self.ret_types}' - - def to_ir(self, builder: ir.builder): - ir_param_types = [ty.to_ir(builder) for ty in self.param_types] - ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types] - return builder.get_function_ty(ir_param_types, ret_types) - - -# scalar types -void = dtype('void') -int1 = dtype('int1') -int8 = dtype('int8') -int16 = dtype('int16') -int32 = dtype('int32') -int64 = dtype('int64') -uint8 = dtype('uint8') -uint16 = dtype('uint16') -uint32 = dtype('uint32') -uint64 = dtype('uint64') -float8e5 = dtype('fp8e5') -float8e4 = dtype('fp8e4') -float16 = dtype('fp16') -bfloat16 = dtype('bf16') -float32 = dtype('fp32') -float64 = dtype('fp64') -# pointer types -pi32_t = pointer_type(int32) - -# ----------------------- -# constexpr -# ----------------------- - - -class constexpr: - """ - This class is used to store a value that is known at compile-time. 
- """ - - def __init__(self, value): - if isinstance(value, constexpr): - self.value = value.value - else: - self.value = value - - def __repr__(self) -> str: - return f"constexpr[{self.value}]" - - def __add__(self, other): - return constexpr(self.value + other.value) - - def __radd__(self, other): - return constexpr(other.value + self.value) - - def __sub__(self, other): - return constexpr(self.value - other.value) - - def __rsub__(self, other): - return constexpr(other.value - self.value) - - def __mul__(self, other): - return constexpr(self.value * other.value) - - def __mod__(self, other): - return constexpr(self.value % other.value) - - def __rmul__(self, other): - return constexpr(other.value * self.value) - - def __truediv__(self, other): - return constexpr(self.value / other.value) - - def __rtruediv__(self, other): - return constexpr(other.value / self.value) - - def __floordiv__(self, other): - return constexpr(self.value // other.value) - - def __rfloordiv__(self, other): - return constexpr(other.value // self.value) - - def __gt__(self, other): - return constexpr(self.value > other.value) - - def __rgt__(self, other): - return constexpr(other.value > self.value) - - def __ge__(self, other): - return constexpr(self.value >= other.value) - - def __rge__(self, other): - return constexpr(other.value >= self.value) - - def __lt__(self, other): - return constexpr(self.value < other.value) - - def __rlt__(self, other): - return constexpr(other.value < self.value) - - def __le__(self, other): - return constexpr(self.value <= other.value) - - def __rle__(self, other): - return constexpr(other.value <= self.value) - - def __eq__(self, other): - return constexpr(self.value == other.value) - - def __ne__(self, other): - return constexpr(self.value != other.value) - - def __bool__(self): - return bool(self.value) - - def __neg__(self): - return constexpr(-self.value) - - def __and__(self, other): - return constexpr(self.value & other.value) - - def logical_and(self, other): - return constexpr(self.value and other.value) - - def __or__(self, other): - return constexpr(self.value | other.value) - - def __xor__(self, other): - return constexpr(self.value ^ other.value) - - def logical_or(self, other): - return constexpr(self.value or other.value) - - def __pos__(self): - return constexpr(+self.value) - - def __invert__(self): - return constexpr(~self.value) - - def __pow__(self, other): - return constexpr(self.value ** other.value) - - def __rshift__(self, other): - return constexpr(self.value >> other.value) - - def __lshift__(self, other): - return constexpr(self.value << other.value) - - def __not__(self): - return constexpr(not self.value) - - def __call__(self, *args, **kwds): - return self.value(*args, **kwds) - - -class tensor: - def __init__(self, handle, type: dtype): - # IR handle - self.handle = handle - # Block shape - self.shape = (1, ) - if type.is_block(): - self.shape = type.shape - self.numel = 1 - for s in self.shape: - self.numel *= s - self.numel = constexpr(self.numel) - self.type = type # Tensor type (can be block_type) - # Following the practice in pytorch, dtype is scalar type - self.dtype = type.scalar - self.shape = [constexpr(s) for s in self.shape] - - def __str__(self) -> str: - # ex. 
"float32[3,4]" - return str(self.dtype) + '[' + ','.join(str(s) for s in self.shape) + ']' - - @builtin - def __add__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.add(self, other, _builder) - - def __radd__(self, other, _builder=None): - return self.__add__(other, _builder=_builder) - - @builtin - def __sub__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.sub(self, other, _builder) - - def __rsub__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.sub(other, self, _builder) - - @builtin - def __mul__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mul(self, other, _builder) - - def __rmul__(self, other, _builder=None): - return self.__mul__(other, _builder=_builder) - - @builtin - def __truediv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.truediv(self, other, _builder) - - def __rtruediv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.truediv(other, self, _builder) - - @builtin - def __floordiv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.floordiv(self, other, _builder) - - @builtin - def __rfloordiv__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.floordiv(other, self, _builder) - - @builtin - def __mod__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mod(self, other, _builder) - - @builtin - def __rmod__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.mod(other, self, _builder) - - # unary operators - @builtin - def __neg__(self, _builder=None): - return semantic.minus(self, _builder) - - @builtin - def __invert__(self, _builder=None): - return semantic.invert(self, _builder) - - # bitwise operators - - @builtin - def __and__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.and_(self, other, _builder) - - @builtin - def __rand__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.and_(other, self, _builder) - - @builtin - def __or__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.or_(self, other, _builder) - - @builtin - def __ror__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.or_(other, self, _builder) - - @builtin - def __xor__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.xor_(self, other, _builder) - - @builtin - def __rxor__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.xor_(other, self, _builder) - - @builtin - def __lshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.shl(self, other, _builder) - - @builtin - def __rlshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.shl(other, self, _builder) - - @builtin - def __rshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - if self.dtype.is_int_signed(): - return semantic.ashr(self, other, _builder) - else: - return semantic.lshr(self, other, _builder) - - @builtin - def __rrshift__(self, other, _builder=None): - other = _to_tensor(other, _builder) - if self.dtype.is_int_signed(): - return semantic.ashr(other, self, _builder) - else: - return semantic.lshr(other, self, _builder) - - # comparison operators - - # > - @builtin - def 
__gt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_than(self, other, _builder) - - @builtin - def __rgt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_than(other, self, _builder) - - # >= - @builtin - def __ge__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_equal(self, other, _builder) - - @builtin - def __rge__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.greater_equal(other, self, _builder) - - # < - @builtin - def __lt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_than(self, other, _builder) - - @builtin - def __rlt__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_than(other, self, _builder) - - # <= - @builtin - def __le__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_equal(self, other, _builder) - - @builtin - def __rle__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.less_equal(other, self, _builder) - - # == - @builtin - def __eq__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.equal(self, other, _builder) - - @builtin - def __ne__(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.not_equal(self, other, _builder) - - @builtin - def logical_and(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.logical_and(self, other, _builder) - - @builtin - def logical_or(self, other, _builder=None): - other = _to_tensor(other, _builder) - return semantic.logical_or(self, other, _builder) - - # note: __not__ isn't actually a magic method in python - # but it's ok because our ASTVisitor handles it - @builtin - def __not__(self, _builder=None): - return semantic.not_(self, _builder) - - @builtin - def __getitem__(self, slices, _builder=None): - if isinstance(slices, slice): - slices = [slices] - ret = self - for dim, sl in enumerate(slices): - if isinstance(sl, constexpr) and sl.value is None: - ret = semantic.expand_dims(ret, dim, _builder) - elif sl == slice(None, None, None): - pass - else: - assert False, f"unsupported tensor index: {sl}" - return ret - - @property - def T(self): - assert False, "Transposition must be created by the AST Visitor" - - @builtin - def to(self, dtype, bitcast=False, _builder=None): - if isinstance(bitcast, constexpr): - bitcast = bitcast.value - if bitcast: - return semantic.bitcast(self, dtype, _builder) - return semantic.cast(self, dtype, _builder) - - -# ----------------------- -# SPMD Programming Model -# ----------------------- -def _constexpr_to_value(v): - if isinstance(v, constexpr): - return v.value - return v - - -@builtin -def program_id(axis, _builder=None): - """ - Returns the id of the current program instance along the given :code:`axis`. - - :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. - :type axis: int - """ - # if axis == -1: - # pid0 = program_id(0, _builder) - # pid1 = program_id(1, _builder) - # pid2 = program_id(2, _builder) - # npg0 = num_programs(0, _builder) - # npg1 = num_programs(0, _builder) - # return pid0 + pid1*npg0 + pid2*npg0*npg1 - axis = _constexpr_to_value(axis) - return semantic.program_id(axis, _builder) - - -@builtin -def num_programs(axis, _builder=None): - """ - Returns the number of program instances launched along the given :code:`axis`. 
- - :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. - :type axis: int - """ - axis = _constexpr_to_value(axis) - return semantic.num_programs(axis, _builder) - - -# ----------------------- -# Block Initialization -# ----------------------- - - -@builtin -def arange(start, end, _builder=None): - """ - Returns contiguous values within the left-closed and right-open interval [:code:`start`, :code:`end`). \ - End - Start must be less than or equal to TRITON_MAX_TENSOR_NUMEL = 131072 - - :param start: Start of the interval. Must be a power of two. - :type start: int32 - :param end: End of the interval. Must be a power of two > start. - :type end: int32 - """ - start = _constexpr_to_value(start) - end = _constexpr_to_value(end) - return semantic.arange(start, end, _builder) - - -def _shape_check_impl(shape): - shape = _constexpr_to_value(shape) - for i, d in enumerate(shape): - if not isinstance(d, constexpr): - raise TypeError(f"Shape element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - return [_constexpr_to_value(x) for x in shape] - - -@builtin -def full(shape, value, dtype, _builder=None): - """ - Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`. - - :param shape: Shape of the new array, e.g., (8, 16) or (8, ) - :value value: A scalar value to fill the array with - :type shape: tuple of ints - :param dtype: Data-type of the new array, e.g., :code:`tl.float16` - :type dtype: DType - """ - shape = _shape_check_impl(shape) - value = _constexpr_to_value(value) - dtype = _constexpr_to_value(dtype) - return semantic.full(shape, value, dtype, _builder) - - -# ----------------------- -# Shape Manipulation -# ----------------------- - - -@builtin -def broadcast(input, other, _builder=None): - """ - Tries to broadcast the two given blocks to a common compatible shape. - - :param input: The first input tensor. - :type input: Block - :param other: The second input tensor. - :type other: Block - """ - return semantic.broadcast_impl_value(input, other, _builder) - - -@builtin -def broadcast_to(input, shape, _builder=None): - """ - Tries to broadcast the given tensor to a new :code:`shape`. - - :param input: The input tensor. - :type input: Block - :param shape: The desired shape. - :type shape: Tuple[int] - """ - shape = _shape_check_impl(shape) - return semantic.broadcast_impl_shape(input, shape, _builder) - - -@builtin -def trans(input, _builder=None): - return semantic.trans(input, _builder) - - -@builtin -def cat(input, other, can_reorder=False, _builder=None): - """ - Concatenate the given blocks - - :param input: The first input tensor. - :type input: - :param other: The second input tensor. - :type other: - :param reorder: Compiler hint. If true, the compiler is - allowed to reorder elements while concatenating inputs. - Only use if the order does not matter (e.g., result is - only used in reduction ops) - """ - return semantic.cat(input, other, can_reorder, _builder) - - -@builtin -def view(input, shape, _builder=None): - """ - Returns a tensor with the same elements as `input` but a different shape. - The order of the elements may not be preserved. - - :param input: The input tensor. - :type input: - :param shape: The desired shape. 
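# A minimal sketch of how the block-initialization and broadcasting helpers above
# (`arange`, length-1 axes via `[None, :]` / `[:, None]` indexing, implicit
# broadcasting) are typically combined to address a 2D tile. Kernel, tensor and
# stride names and the sizes are illustrative only, not part of this file; assumes
# the packaged `triton`/`torch` APIs and a CUDA device. (`tl.load`/`tl.store` are
# documented further below.)
import torch
import triton
import triton.language as tl


@triton.jit
def copy2d_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)      # [BLOCK_M] row indices
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)      # [BLOCK_N] col indices
    # `[:, None]` / `[None, :]` insert length-1 axes; broadcasting then yields
    # [BLOCK_M, BLOCK_N] blocks of offsets, masks and pointers.
    offs = offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tile = tl.load(src_ptr + offs, mask=mask, other=0.0)
    tl.store(dst_ptr + offs, tile, mask=mask)


x = torch.randn(100, 70, device="cuda")
y = torch.empty_like(x)
grid = (triton.cdiv(x.shape[0], 32), triton.cdiv(x.shape[1], 32))
copy2d_kernel[grid](x, y, x.shape[0], x.shape[1], x.stride(0), x.stride(1),
                    BLOCK_M=32, BLOCK_N=32)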
- :type shape: Tuple[int] - - """ - shape = _shape_check_impl(shape) - return semantic.view(input, shape, _builder) - - -@builtin -def reshape(input, shape, _builder=None): - shape = _shape_check_impl(shape) - return semantic.reshape(input, shape, _builder) - - -def _wrap_axis(axis, ndim): - if not (-ndim <= axis < ndim): - raise ValueError(f"invalid axis {axis}. Expected {-ndim} <= axis < {ndim}") - - return axis if axis >= 0 else axis + ndim - - -@builtin -def expand_dims(input, axis, _builder=None): - """ - Expand the shape of a tensor, by inserting new length-1 dimensions. - - Axis indices are with respect to the resulting tensor, so - ``result.shape[axis]`` will be 1 for each axis. - - :param input: The input tensor. - :type input: tl.tensor - :param axis: The indices to add new axes - :type axis: int | Sequence[int] - - """ - axis = _constexpr_to_value(axis) - axes = list(axis) if isinstance(axis, Sequence) else [axis] - new_ndim = len(input.shape) + len(axes) - axes = [_wrap_axis(_constexpr_to_value(d), new_ndim) for d in axes] - - if len(set(axes)) != len(axes): - raise ValueError(f"expand_dims recieved duplicate axes, normalized axes = {axes}") - - ret = input - for a in sorted(axes): - ret = semantic.expand_dims(ret, a, _builder) - return ret - -# ----------------------- -# Linear Algebra -# ----------------------- - - -@builtin -def dot(input, other, allow_tf32=True, out_dtype=float32, _builder=None): - """ - Returns the matrix product of two blocks. - - The two blocks must be two-dimensional and have compatible inner dimensions. - - :param input: The first tensor to be multiplied. - :type input: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} - :param other: The second tensor to be multiplied. - :type other: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} - """ - allow_tf32 = _constexpr_to_value(allow_tf32) - out_dtype = _constexpr_to_value(out_dtype) - return semantic.dot(input, other, allow_tf32, out_dtype, _builder) - - -# ----------------------- -# Non-Atomic Memory Operations -# ----------------------- - - -@builtin -def load(pointer, mask=None, other=None, boundary_check=tuple(), padding_option="", cache_modifier="", - eviction_policy="", volatile=False, _builder=None): - """ - Return a tensor of data whose values are loaded from memory at location defined by `pointer`: - (1) `pointer` could be a single element pointer, then a scalar will be loaded - - `mask` and `other` must be scalar too - - `other` is implicitly typecast to `pointer.dtype.element_ty` - - `boundary_check` and `padding_option` must be empty - (2) `pointer` could be element-wise tensor of pointers, in which case: - - `mask` and `other` are implicitly broadcast to `pointer.shape` - - `other` is implicitly typecast to `pointer.dtype.element_ty` - - `boundary_check` and `padding_option` must be empty - (3) `pointer` could be a block pointer defined by `make_block_ptr`, in which case: - - `mask` and `other` must be None - - `boundary_check` and `padding_option` can be specified to control the behavior of out-of-bound access - - :param pointer: Pointer to the data to be loaded - :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType` - :param mask: if `mask[idx]` is false, do not load the data at address `pointer[idx]` - (must be `None` with block pointers) - :type mask: Block of `triton.int1`, optional - :param other: if `mask[idx]` is false, return `other[idx]` - :type other: Block, optional - :param boundary_check: tuple 
of integers, indicating the dimensions which should do the boundary check - :type boundary_check: tuple of ints, optional - :param padding_option: should be one of {"", "zero", "nan"}, do padding while out of bound - :param cache_modifier: changes cache option in NVIDIA PTX - :type cache_modifier: str, optional - :param eviction_policy: changes eviction policy in NVIDIA PTX - :type eviction_policy: str, optional - :param volatile: changes volatile option in NVIDIA PTX - :type volatile: bool, optional - """ - # `mask` and `other` can be constexpr - if _constexpr_to_value(mask) is not None: - mask = _to_tensor(mask, _builder) - if _constexpr_to_value(other) is not None: - other = _to_tensor(other, _builder) - padding_option = _constexpr_to_value(padding_option) - cache_modifier = _constexpr_to_value(cache_modifier) - eviction_policy = _constexpr_to_value(eviction_policy) - volatile = _constexpr_to_value(volatile) - return semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy, - volatile, _builder) - - -@builtin -def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="", _builder=None): - """ - Store a tensor of data into memory locations defined by `pointer`: - (1) `pointer` could be a single element pointer, then a scalar will be stored - - `mask` must be scalar too - - `boundary_check` and `padding_option` must be empty - (2) `pointer` could be element-wise tensor of pointers, in which case: - - `mask` is implicitly broadcast to `pointer.shape` - - `boundary_check` must be empty - (3) or `pointer` could be a block pointer defined by `make_block_ptr`, in which case: - - `mask` must be None - - `boundary_check` can be specified to control the behavior of out-of-bound access - `value` is implicitly broadcast to `pointer.shape` and typecast to `pointer.dtype.element_ty`. 
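# A minimal sketch of the `load`/`store` contract documented above (case (2): an
# element-wise tensor of pointers with a mask and an `other` fallback). The kernel
# and tensor names are illustrative only; assumes the packaged `triton`/`torch`
# APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)                   # this instance's slot in the 1D grid
    offsets = pid * BLOCK + tl.arange(0, BLOCK)   # element indices handled here
    mask = offsets < n_elements                   # guard the ragged last block
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)   # masked-off lanes read 0.0
    y = tl.load(y_ptr + offsets, mask=mask, other=0.0)
    tl.store(out_ptr + offsets, x + y, mask=mask)        # masked-off lanes store nothing


x = torch.randn(12_345, device="cuda")
y = torch.randn_like(x)
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)
add_kernel[grid](x, y, out, x.numel(), BLOCK=1024)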
- - :param pointer: The memory location where the elements of `value` are stored - :type pointer: `triton.PointerType`, or block of `dtype=triton.PointerType` - :param value: The tensor of elements to be stored - :type value: Block - :param mask: If `mask[idx]` is false, do not store `value[idx]` at `pointer[idx]` - :type mask: Block of triton.int1, optional - :param boundary_check: tuple of integers, indicating the dimensions which should do the boundary check - :type boundary_check: tuple of ints, optional - :param cache_modifier: changes cache option in NVIDIA PTX - :type cache_modifier: str, optional - :param eviction_policy: changes eviction policy in NVIDIA PTX - :type eviction_policy: str, optional - """ - # `value` can be constexpr - value = _to_tensor(value, _builder) - if _constexpr_to_value(mask) is not None: - mask = _to_tensor(mask, _builder) - cache_modifier = _constexpr_to_value(cache_modifier) - eviction_policy = _constexpr_to_value(eviction_policy) - return semantic.store(pointer, value, mask, boundary_check, cache_modifier, eviction_policy, _builder) - - -@builtin -def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _builder=None): - """ - Returns a pointer to a block in a parent tensor - - :param base: The base pointer to the parent tensor - :param shape: The shape of the parent tensor - :param strides: The strides of the parent tensor - :param offsets: The offsets to the block - :param block_shape: The shape of the block - :param order: The order of the original data format - """ - return semantic.make_block_ptr(base, shape, strides, offsets, block_shape, order, _builder) - - -@builtin -def advance(base: tensor, offsets, _builder=None): - """ - Advance a block pointer - - :param base: the block pointer to advance - :param offsets: the offsets to advance, a tuple by dimension - """ - return semantic.advance(base, offsets, _builder) - -# ----------------------- -# Atomic Memory Operations -# ----------------------- - - -def _add_atomic_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Performs an atomic {name} at the memory location specified by :code:`pointer`. - - Return the data stored at :code:`pointer` before the atomic operation. - - :param pointer: The memory locations to compare-and-swap. - :type pointer: Block of dtype=triton.PointerDType - :param cmp: The values expected to be found in the atomic object - :type cmp: Block of dtype=`pointer.dtype.element_ty` - :param val: The values to copy in case the expected value matches the contained value. 
- :type val: Block of dtype=`pointer.dtype.element_ty` - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@builtin -@_add_atomic_docstr("compare-and-swap") -def atomic_cas(pointer, cmp, val, _builder=None): - cmp = _to_tensor(cmp, _builder) - val = _to_tensor(val, _builder) - return semantic.atomic_cas(pointer, cmp, val, _builder) - - -@builtin -@_add_atomic_docstr("exchange") -def atomic_xchg(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_xchg(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("add") -def atomic_add(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_add(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("max") -def atomic_max(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_max(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("min") -def atomic_min(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_min(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical and") -def atomic_and(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_and(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical or") -def atomic_or(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_or(pointer, val, mask, _builder) - - -@builtin -@_add_atomic_docstr("logical xor") -def atomic_xor(pointer, val, mask=None, _builder=None): - val = _to_tensor(val, _builder) - return semantic.atomic_xor(pointer, val, mask, _builder) - - -# ----------------------- -# Conditioning -# ----------------------- - -@builtin -def where(condition, x, y, _builder=None): - """ - Returns a tensor of elements from either :code:`x` or :code:`y`, depending on :code:`condition`. - - Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`. - - If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead. - - The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`. - :code:`x` and :code:`y` must have the same data type. - - :param condition: When True (nonzero), yield x, otherwise yield y. - :type condition: Block of triton.bool - :param x: values selected at indices where condition is True. - :param y: values selected at indices where condition is False. - """ - condition = _to_tensor(condition, _builder) - x = _to_tensor(x, _builder) - y = _to_tensor(y, _builder) - return semantic.where(condition, x, y, _builder) - - -# ----------------------- -# Math -# ----------------------- - -@builtin -def umulhi(x, y, _builder=None): - x = _to_tensor(x, _builder) - y = _to_tensor(y, _builder) - return semantic.umulhi(x, y, _builder) - - -@builtin -def fdiv(x, y, ieee_rounding=False, _builder=None): - ieee_rounding = _constexpr_to_value(ieee_rounding) - return semantic.fdiv(x, y, ieee_rounding, _builder) - - -def _add_math_1arg_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Computes the element-wise {name} of :code:`x`. 
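# A minimal sketch of the atomic builtins above: they read-modify-write global
# memory and return the previous value, which makes them the usual way to combine
# results across program instances. Names and sizes are illustrative only; assumes
# the packaged `triton`/`torch` APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def total_sum_kernel(x_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    partial = tl.sum(x, axis=0)       # per-instance partial sum (scalar)
    tl.atomic_add(out_ptr, partial)   # combine instances; the returned old value is ignored


x = torch.randn(10_000, device="cuda")
out = torch.zeros(1, device="cuda")   # fp32 accumulator
grid = (triton.cdiv(x.numel(), 1024),)
total_sum_kernel[grid](x, out, x.numel(), BLOCK=1024)
# out[0] now approximates x.sum() (floating-point summation order differs).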
- - :param x: the input values - :type x: Block - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@builtin -@_add_math_1arg_docstr("exponential") -def exp(x, _builder=None): - return semantic.exp(x, _builder) - - -@builtin -@_add_math_1arg_docstr("natural logarithm") -def log(x, _builder=None): - return semantic.log(x, _builder) - - -@builtin -@_add_math_1arg_docstr("cosine") -def cos(x, _builder=None): - return semantic.cos(x, _builder) - - -@builtin -@_add_math_1arg_docstr("sine") -def sin(x, _builder=None): - return semantic.sin(x, _builder) - - -@builtin -@_add_math_1arg_docstr("square root") -def sqrt(x, _builder=None): - return semantic.sqrt(x, _builder) - - -@builtin -@_add_math_1arg_docstr("absolute value") -def abs(x, _builder=None): - return semantic.abs(x, _builder) - - -# ----------------------- -# Reductions -# ----------------------- - -def _add_reduction_docstr(name: str) -> Callable[[T], T]: - - def _decorator(func: T) -> T: - docstr = """ - Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis` - - :param input: the input values - :param axis: the dimension along which the reduction should be done - """ - func.__doc__ = docstr.format(name=name) - return func - - return _decorator - - -@contextmanager -def _insertion_guard(builder): - ip = builder.get_insertion_point() - yield - builder.restore_insertion_point(ip) - - -@builtin -def reduce(input, axis, combine_fn, _builder=None, _generator=None): - """Applies the combine_fn to all elements in :code:`input` tensors along the provided :code:`axis` - - :param input: the input tensor, or tuple of tensors - :param axis: the dimension along which the reduction should be done - :param combine_fn: a function to combine two groups of scalar tensors (must be marked with @triton.jit) - - """ - if isinstance(input, tensor): - return reduce((input,), axis, combine_fn, - _builder=_builder, _generator=_generator)[0] - - def make_combine_region(reduce_op): - in_scalar_tys = [t.type.scalar for t in input] - prototype = function_type(in_scalar_tys, in_scalar_tys * 2) - - region = reduce_op.get_region(0) - with _insertion_guard(_builder): - param_types = [ty.to_ir(_builder) for ty in prototype.param_types] - block = _builder.create_block_with_parent(region, param_types) - args = [tensor(block.arg(i), ty) - for i, ty in enumerate(prototype.param_types)] - results = _generator.call_JitFunction(combine_fn, args, kwargs={}) - if isinstance(results, tensor): - handles = [results.handle] - else: - handles = [r.handle for r in results] - _builder.create_reduce_ret(*handles) - - axis = _constexpr_to_value(axis) - return semantic.reduction(input, axis, make_combine_region, _builder) - - -@builtin -def _promote_reduction_input(t, _builder=None): - scalar_ty = t.type.scalar - # input is extended to 32-bits if necessary - # this increases numerical accuracy and can be done pretty much for free - # on GPUs - if scalar_ty.is_int() and scalar_ty.int_bitwidth < 32: - return t.to(int32, _builder=_builder) - - # hardware doesn't support FMAX, FMIN, CMP for bfloat16 - if scalar_ty is bfloat16: - return t.to(float32, _builder=_builder) - - return t - - -@builtin -def _argreduce(input, axis, combine_fn, _builder=None, _generator=None): - axis = _constexpr_to_value(axis) - n = input.shape[axis] - index = arange(0, n, _builder=_builder) - - if len(input.shape) > 1: - # Broadcast index across the non-reduced axes - axes_to_expand = [constexpr(d) for d in range(len(input.shape))] - del 
axes_to_expand[axis] - index = expand_dims(index, axes_to_expand, _builder=_builder) - index = broadcast_to(index, input.shape, _builder=_builder) - - rvalue, rindices = reduce((input, index), axis, combine_fn, - _builder=_builder, _generator=_generator) - return rindices - - -@triton.jit -def minimum(x, y): - """ - Computes the element-wise minimum of :code:`x` and :code:`y`. - - :param input: the first input tensor - :type input: Block - :param other: the second input tensor - :type other: Block - """ - return where(x < y, x, y) - - -@triton.jit -def maximum(x, y): - """ - Computes the element-wise maximum of :code:`x` and :code:`y`. - - :param input: the first input tensor - :type input: Block - :param other: the second input tensor - :type other: Block - """ - return where(x > y, x, y) - - -@triton.jit -def _max_combine(a, b): - return maximum(a, b) - - -@triton.jit -@_add_reduction_docstr("maximum") -def max(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _max_combine) - - -@triton.jit -def _argmax_combine(value1, index1, value2, index2): - gt = value1 > value2 - lt = value1 < value2 - index_min = minimum(index1, index2) - index_ret = where(gt, index1, where(lt, index2, index_min)) - value_ret = maximum(value1, value2) - return value_ret, index_ret - - -@triton.jit -@_add_reduction_docstr("maximum index") -def argmax(input, axis): - input = _promote_reduction_input(input) - return _argreduce(input, axis, _argmax_combine) - - -@triton.jit -def _min_combine(a, b): - # TODO: minimum/maximum doesn't get lowered to fmin/fmax... - return minimum(a, b) - - -@triton.jit -@_add_reduction_docstr("minimum") -def min(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _min_combine) - - -@triton.jit -def _argmin_combine(value1, index1, value2, index2): - lt = value1 < value2 - gt = value1 > value2 - index_min = minimum(index1, index2) - index_ret = where(lt, index1, where(gt, index2, index_min)) - value_ret = minimum(value1, value2) - return value_ret, index_ret - - -@triton.jit -@_add_reduction_docstr("minimum index") -def argmin(input, axis): - input = _promote_reduction_input(input) - return _argreduce(input, axis, _argmin_combine) - - -@triton.jit -def _sum_combine(a, b): - return a + b - - -@triton.jit -@_add_reduction_docstr("sum") -def sum(input, axis): - input = _promote_reduction_input(input) - return reduce(input, axis, _sum_combine) - - -@triton.jit -def _xor_combine(a, b): - return a ^ b - - -@builtin -@_add_reduction_docstr("xor sum") -def xor_sum(input, axis, _builder=None, _generator=None): - scalar_ty = input.type.scalar - if not scalar_ty.is_int(): - raise ValueError("xor_sum only supported for integers") - - input = _promote_reduction_input(input, _builder=_builder) - return reduce(input, axis, _xor_combine, - _builder=_builder, _generator=_generator) - - -# ----------------------- -# Compiler Hint Ops -# ----------------------- - - -@builtin -def debug_barrier(_builder=None): - ''' - Insert a barrier to synchronize all threads in a block. - ''' - return semantic.debug_barrier(_builder) - - -@builtin -def multiple_of(input, values, _builder=None): - """ - Let the compiler knows that the values in :code:`input` are all multiples of :code:`value`. 
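# A minimal sketch of the reduction helpers above (`max`, `min`, `sum`, `argmax`,
# ... are thin @triton.jit wrappers over `reduce` with a combine function): each
# program instance reduces one row of a 2D tensor. Names and sizes are illustrative
# only; assumes the packaged `triton`/`torch` APIs and a CUDA device.
import torch
import triton
import triton.language as tl


@triton.jit
def row_max_kernel(x_ptr, out_ptr, n_cols, stride_row, BLOCK_N: tl.constexpr):
    row = tl.program_id(axis=0)
    cols = tl.arange(0, BLOCK_N)                 # BLOCK_N: power of two >= n_cols
    mask = cols < n_cols
    x = tl.load(x_ptr + row * stride_row + cols, mask=mask, other=-float("inf"))
    tl.store(out_ptr + row, tl.max(x, axis=0))   # reduce the row to a scalar


x = torch.randn(64, 300, device="cuda")
out = torch.empty(64, device="cuda")
row_max_kernel[(x.shape[0],)](x, out, x.shape[1], x.stride(0), BLOCK_N=512)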
- """ - if isinstance(values, constexpr): - values = [values] - for i, d in enumerate(values): - if not isinstance(d, constexpr): - raise TypeError(f"values element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - values = [x.value for x in values] - return semantic.multiple_of(input, values) - - -@builtin -def max_contiguous(input, values, _builder=None): - """ - Let the compiler knows that the `value` first values in :code:`input` are contiguous. - """ - if isinstance(values, constexpr): - values = [values] - for i, d in enumerate(values): - if not isinstance(d, constexpr): - raise TypeError(f"values element {i} must have type `constexpr`") - if not isinstance(d.value, int): - raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") - values = [x.value for x in values] - return semantic.max_contiguous(input, values) - -# ----------------------- -# Debugging functions -# ----------------------- - - -@builtin -def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=False, _builder=None): - ''' - Print the values at compile time. The parameters are the same as the builtin :code:`print`. - ''' - pass - - -@builtin -def static_assert(cond, msg="", _builder=None): - ''' - Assert the condition at compile time. The parameters are the same as the builtin :code:`assert`. - ''' - pass - - -@builtin -def device_print(prefix, *args, _builder=None): - ''' - Print the values at runtime from the device. - - :param prefix: a prefix to print before the values. This is required to be a string literal. - :param args: the values to print. They can be any tensor or scalar. - ''' - import string - prefix = _constexpr_to_value(prefix) - assert isinstance(prefix, str), f"{prefix} is not string" - b_ascii = True - for ch in prefix: - if ch not in string.printable: - b_ascii = False - break - assert b_ascii, f"{prefix} is not an ascii string" - new_args = [] - for arg in args: - new_args.append(_to_tensor(arg, _builder)) - return semantic.device_print(prefix, new_args, _builder) - - -@builtin -def device_assert(cond, msg="", _builder=None): - ''' - Assert the condition at runtime from the device. - - :param cond: the condition to assert. This is required to be a boolean tensor. - :param msg: the message to print if the assertion fails. This is required to be a string literal. - ''' - msg = _constexpr_to_value(msg) - import inspect - frame = inspect.currentframe() - module = inspect.getmodule(frame) - # The triton function module doesn't have the name attribute. - # We use this trick to find the caller. - while hasattr(module, "__name__"): - frame = frame.f_back - module = inspect.getmodule(frame) - func_name = frame.f_code.co_name - file_name = frame.f_back.f_code.co_filename - # TODO: The line number currently indicates the line - # where the triton function is called but not where the - # device_assert is called. Need to enhance this. - lineno = frame.f_back.f_lineno - return semantic.device_assert(_to_tensor(cond, _builder), msg, file_name, func_name, lineno, _builder) - - -# ----------------------- -# Iterators -# ----------------------- - - -class static_range: - - """ - Iterator that counts upward forever. - - .. highlight:: python - .. code-block:: python - - @triton.jit - def kernel(...): - for i in tl.static_range(10): - ... 
- :note: This is a special iterator used to implement similar semantics to Python's :code:`range` in the context of - :code:`triton.jit` functions. In addition, it also guides the compiler to unroll the loop aggressively. - :param arg1: the start value. - :param arg2: the end value. - :param step: the step value. - """ - - def __init__(self, arg1, arg2=None, step=None): - assert isinstance(arg1, constexpr) - if step is None: - self.step = constexpr(1) - else: - assert isinstance(step, constexpr) - self.step = step - if arg2 is None: - self.start = constexpr(0) - self.end = arg1 - else: - assert isinstance(arg2, constexpr) - self.start = arg1 - self.end = arg2 - - def __iter__(self): - raise RuntimeError("static_range can only be used in @triton.jit'd functions") - - def __next__(self): - raise RuntimeError("static_range can only be used in @triton.jit'd functions") - - -# ----------------------- -# Extern functions -# ----------------------- - -def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_shape: tuple, is_pure: bool, _builder=None): - ''' - Dispatch a function to a library - :param func: the function to dispatch - :param lib_name: the name of the library - :param lib_path: the path of the library - :param args: the arguments of the function - :param arg_type_symbol_dict: the type of the arguments - :param ret_shape: the shape of the return value - :param _builder: the builder - :return: the return value of the function - ''' - if len(arg_type_symbol_dict) == 0: - raise ValueError("arg_type_symbol_dict is empty") - - num_args = len(list(arg_type_symbol_dict.keys())[0]) - if len(args) != num_args: - raise ValueError(f"length of input args does not match." - f"Expect {len(args)}, got {num_args}") - - arg_types = [] - arg_list = [] - for arg in args: - if isinstance(arg, tensor): - arg_types.append(arg.dtype) - arg_list.append(arg.handle) - else: - arg_types.append(type(arg)) - arg_list.append(arg) - arg_types = tuple(arg_types) - - if arg_types not in arg_type_symbol_dict: - raise ValueError(f"input arg type does not match." 
- f"Expect one of {arg_type_symbol_dict.keys()}, got {arg_types}") - else: - symbol = arg_type_symbol_dict[arg_types][0] - ret_type = arg_type_symbol_dict[arg_types][1] - if ret_shape: - ret_type = block_type(ret_type, ret_shape) - return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(_builder), is_pure), ret_type) - - -def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, is_pure: bool, _builder=None): - ''' - Dispatch an elementwise function to a library - :param lib_name: the name of the library - :param lib_path: the path of the library - :param args: the arguments of the function - :param arg_type_symbol_dict: the type of the arguments - :param is_pure: whether the function is pure - :param _builder: the builder - :return: the return value of the function - ''' - dispatch_args = args.copy() - all_scalar = True - ret_shape = None - arg_types = [] - for i in range(len(dispatch_args)): - dispatch_args[i] = _to_tensor(dispatch_args[i], _builder) - arg_types.append(dispatch_args[i].dtype) - if dispatch_args[i].type.is_block(): - all_scalar = False - if len(arg_types) > 0: - arg_types = tuple(arg_types) - arithmetic_check = True - # If there's a type tuple that is not supported by the library, we will do arithmetic check - if arg_types in arg_type_symbol_dict: - arithmetic_check = False - broadcast_arg = dispatch_args[0] - # Get the broadcast shape over all the arguments - for i, item in enumerate(dispatch_args): - _, broadcast_arg = semantic.binary_op_type_checking_impl( - item, broadcast_arg, _builder, arithmetic_check=arithmetic_check) - # Change the shape of each argument based on the broadcast shape - for i in range(len(dispatch_args)): - dispatch_args[i], _ = semantic.binary_op_type_checking_impl( - dispatch_args[i], broadcast_arg, _builder, arithmetic_check=arithmetic_check) - if not all_scalar: - ret_shape = broadcast_arg.shape - func = getattr(_builder, "create_extern_elementwise") - return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, is_pure, _builder) - - -def extern(fn): - """A decorator for external functions.""" - return builtin(fn) diff --git a/python/triton/language/extra/__init__.py b/python/triton/language/extra/__init__.py deleted file mode 100644 index 2fd0ff3eeee3..000000000000 --- a/python/triton/language/extra/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import cuda - -__all__ = ['cuda'] diff --git a/python/triton/language/extra/cuda.bc b/python/triton/language/extra/cuda.bc deleted file mode 100644 index 4538ac35446a..000000000000 Binary files a/python/triton/language/extra/cuda.bc and /dev/null differ diff --git a/python/triton/language/extra/cuda.py b/python/triton/language/extra/cuda.py deleted file mode 100644 index 92df37a67c77..000000000000 --- a/python/triton/language/extra/cuda.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -from .. 
import core - -__path__ = os.path.dirname(os.path.abspath(__file__)) - - -@core.extern -def globaltimer(_builder=None): - return core.extern_elementwise("cuda", os.path.join(__path__, "cuda.bc"), [], - {tuple(): ("globaltimer", core.dtype("int64")), - }, is_pure=False, _builder=_builder) - - -@core.extern -def smid(_builder=None): - return core.extern_elementwise("cuda", os.path.join(__path__, "cuda.bc"), [], - {tuple(): ("smid", core.dtype("int32")), - }, is_pure=True, _builder=_builder) diff --git a/python/triton/language/math.py b/python/triton/language/math.py deleted file mode 100644 index 56e1ac5a11a0..000000000000 --- a/python/triton/language/math.py +++ /dev/null @@ -1,1534 +0,0 @@ -import functools -import os - -from . import core - - -@functools.lru_cache() -def libdevice_path(): - import torch - third_party_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "third_party") - if torch.version.hip is None: - default = os.path.join(third_party_dir, "cuda", "lib", "libdevice.10.bc") - else: - default = '' - return os.getenv("TRITON_LIBDEVICE_PATH", default) - - -@core.extern -def clz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_clz", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_clzll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def popc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_popc", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_popcll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def byte_perm(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("int32"), core.dtype("int32"), core.dtype("int32"),): ("__nv_byte_perm", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def min(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_min", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umin", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_llmin", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmin", core.dtype("uint64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fminf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def max(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_max", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umax", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_llmax", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmax", core.dtype("uint64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaxf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmax", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mulhi(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mulhi", core.dtype("int32")), - 
(core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umulhi", core.dtype("uint32")), - (core.dtype("int64"), core.dtype("int64"),): ("__nv_mul64hi", core.dtype("int64")), - (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_umul64hi", core.dtype("uint64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul24(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mul24", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umul24", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def brev(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_brev", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_brevll", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sad(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("int32"), core.dtype("int32"), core.dtype("uint32"),): ("__nv_sad", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"), core.dtype("uint32"),): ("__nv_usad", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def abs(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_abs", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_llabs", core.dtype("int64")), - (core.dtype("fp32"),): ("__nv_fabsf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_fabs", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def floor(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_floorf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_floor", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp64h(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_rcp64h", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rsqrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rsqrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rsqrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ceil(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_ceil", core.dtype("fp64")), - (core.dtype("fp32"),): ("__nv_ceilf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def trunc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_trunc", core.dtype("fp64")), - (core.dtype("fp32"),): ("__nv_truncf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp2(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_exp2f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_exp2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def saturatef(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_saturatef", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rn(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rz(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_rd(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma_ru(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_dividef(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_fdividef", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def div_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rn", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rn", core.dtype("fp64")), - 
}, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rz", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_rd", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcp_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frcp_ru", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_drcp_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rn", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rz", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_rd", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fsqrt_ru", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_dsqrt_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sqrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sqrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sqrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rn", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rz", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rd", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def add_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - 
{(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_ru", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rn", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rz", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rd", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def mul_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_ru", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2int_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2int_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def 
double2uint_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2uint_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2uint_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2int_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2int_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rn", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rz", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_rd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2uint_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2uint_ru", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rn", 
core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hiloint2double(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hiloint2double", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2loint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2loint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2hiint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2hiint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ll_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ll_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rn(arg0, _builder=None): - return 
core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float2ull_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float2ull_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ll_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ll_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rn", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rz", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_rd", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double2ull_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double2ull_ru", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern 
-def ll2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rz", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_rd", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2float_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2float_ru", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ll2double_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_ll2double_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rz(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_rd(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ull2double_ru(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint64"),): ("__nv_ull2double_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def int_as_float(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_int_as_float", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float_as_int(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float_as_int", 
core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def uint_as_float(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("uint32"),): ("__nv_uint_as_float", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def float_as_uint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_float_as_uint", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def longlong_as_double(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int64"),): ("__nv_longlong_as_double", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def double_as_longlong(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_double_as_longlong", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_sinf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_sinf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_cosf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_cosf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_log2f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_log2f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_logf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_logf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_expf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_expf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_tanf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_tanf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_exp10f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_exp10f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_log10f(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_fast_log10f", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fast_powf(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_powf", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hadd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hadd", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_uhadd", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern 
-def rhadd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("int32"),): ("__nv_rhadd", core.dtype("int32")), - (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_urhadd", core.dtype("uint32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rn", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rz(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rz", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rz", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_rd(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rd", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rd", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sub_ru(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_ru", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_ru", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rsqrt_rn(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_frsqrt_rn", core.dtype("fp32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ffs(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("int32"),): ("__nv_ffs", core.dtype("int32")), - (core.dtype("int64"),): ("__nv_ffsll", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rintf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rint", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def llrint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_llrintf", core.dtype("int64")), - (core.dtype("fp64"),): ("__nv_llrint", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def nearbyint(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_nearbyintf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_nearbyint", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isnan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_isnanf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_isnand", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def signbit(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_signbitf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_signbitd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def copysign(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_copysignf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_copysign", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def finitef(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_finitef", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isinf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_isinff", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_isinfd", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def nextafter(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_nextafterf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_nextafter", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sin(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cos(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cosf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cos", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sinpi(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinpif", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sinpi", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cospi(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cospif", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cospi", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_tanf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tan", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log2(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log2f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_expf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_exp", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def exp10(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_exp10f", core.dtype("fp32")), - 
(core.dtype("fp64"),): ("__nv_exp10", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cosh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_coshf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cosh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def sinh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_sinhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_sinh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tanh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_tanhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tanh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def atan2(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_atan2f", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_atan2", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def atan(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_atanf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_atan", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def asin(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_asinf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_asin", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def acos(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_acosf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_acos", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_logf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log10(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log10f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log10", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def log1p(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_log1pf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_log1p", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def acosh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_acoshf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_acosh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def asinh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_asinhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_asinh", core.dtype("fp64")), - }, is_pure=True, 
_builder=_builder) - - -@core.extern -def atanh(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_atanhf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_atanh", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def expm1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_expm1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_expm1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def hypot(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_hypotf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_hypot", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rhypot(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rhypotf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rhypot", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def norm3d(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm3df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm3d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rnorm3d(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm3df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rnorm3d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def norm4d(arg0, arg1, arg2, arg3, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, arg3, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm4df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm4d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rnorm4d(arg0, arg1, arg2, arg3, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, arg3, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm4df", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rnorm4d", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cbrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cbrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cbrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def rcbrt(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_rcbrtf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_rcbrt", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def 
j0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_j0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_j0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def j1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_j1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_j1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def y0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_y0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_y0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def y1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_y1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_y1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def yn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_ynf", core.dtype("fp32")), - (core.dtype("int32"), core.dtype("fp64"),): ("__nv_yn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def jn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_jnf", core.dtype("fp32")), - (core.dtype("int32"), core.dtype("fp64"),): ("__nv_jn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cyl_bessel_i0(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cyl_bessel_i0f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cyl_bessel_i0", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def cyl_bessel_i1(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_cyl_bessel_i1f", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_cyl_bessel_i1", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erff", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erf", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfinv(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfc(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfc", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfcx(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcxf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfcx", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def erfcinv(arg0, 
_builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_erfcinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_erfcinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def normcdfinv(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_normcdfinvf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_normcdfinv", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def normcdf(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_normcdff", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_normcdf", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def lgamma(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_lgammaf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_lgamma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ldexp(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_ldexpf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_ldexp", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def scalbn(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_scalbnf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_scalbn", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fmod(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmodf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmod", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def remainder(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_remainderf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_remainder", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fma(arg0, arg1, arg2, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, arg2, ], - {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def pow(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_powif", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("int32"),): ("__nv_powi", core.dtype("fp64")), - (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_powf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_pow", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def tgamma(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): 
("__nv_tgammaf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_tgamma", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def round(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_roundf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_round", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def llround(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_llroundf", core.dtype("int64")), - (core.dtype("fp64"),): ("__nv_llround", core.dtype("int64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def fdim(arg0, arg1, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, arg1, ], - {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdimf", core.dtype("fp32")), - (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fdim", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def ilogb(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_ilogbf", core.dtype("int32")), - (core.dtype("fp64"),): ("__nv_ilogb", core.dtype("int32")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def logb(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp32"),): ("__nv_logbf", core.dtype("fp32")), - (core.dtype("fp64"),): ("__nv_logb", core.dtype("fp64")), - }, is_pure=True, _builder=_builder) - - -@core.extern -def isfinited(arg0, _builder=None): - return core.extern_elementwise("libdevice", libdevice_path(), [arg0, ], - {(core.dtype("fp64"),): ("__nv_isfinited", core.dtype("int32")), - }, is_pure=True, _builder=_builder) diff --git a/python/triton/language/random.py b/python/triton/language/random.py deleted file mode 100644 index a9ddbd829f12..000000000000 --- a/python/triton/language/random.py +++ /dev/null @@ -1,178 +0,0 @@ -import triton -from . import core as tl - -PHILOX_KEY_A: tl.constexpr = 0x9E3779B9 -PHILOX_KEY_B: tl.constexpr = 0xBB67AE85 -PHILOX_ROUND_A: tl.constexpr = 0xD2511F53 -PHILOX_ROUND_B: tl.constexpr = 0xCD9E8D57 -N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox - -# ------------------- -# randint -# ------------------- - - -@triton.jit -def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Run `n_rounds` rounds of Philox for state (c0, c1, c2, c3) and key (k0, k1). 
- """ - for _ in tl.static_range(n_rounds): - # for _ in range(n_rounds): - # update random state - A = PHILOX_ROUND_A - B = PHILOX_ROUND_B - _c0, _c2 = c0, c2 - c0 = tl.umulhi(B, _c2) ^ c1 ^ k0 - c2 = tl.umulhi(A, _c0) ^ c3 ^ k1 - c1 = B * _c2 - c3 = A * _c0 - # raise key - k0 = k0 + PHILOX_KEY_A - k1 = k1 + PHILOX_KEY_B - return c0, c1, c2, c3 - - -@triton.jit -def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - seed = seed.to(tl.uint64) - seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32) - seed_lo = (seed & 0xffffffff).to(tl.uint32) - c0 = c0.to(tl.uint32, bitcast=True) - c1 = c1.to(tl.uint32, bitcast=True) - c2 = c2.to(tl.uint32, bitcast=True) - c3 = c3.to(tl.uint32, bitcast=True) - return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds) - - -@triton.jit -def randint(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, returns a single - block of random :code:`int32`. - - If you need multiple streams of random numbers, - using `randint4x` is likely to be faster than calling `randint` 4 times. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - ret, _, _, _ = randint4x(seed, offset, n_rounds) - return ret - - -@triton.jit -def randint4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, returns four - blocks of random :code:`int32`. - - This is the maximally efficient entry point - to Triton's Philox pseudo-random number generator. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - # _0 = tl.zeros(offset.shape, offset.dtype) - _0 = offset * 0 - return philox(seed, offset, _0, _0, _0, n_rounds) - - -# ------------------- -# rand -# ------------------- - -# @triton.jit -# def uint32_to_uniform_float(x): -# """ -# Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). -# """ -# two_to_the_minus_32: tl.constexpr = 2.328306e-10 -# return x * two_to_the_minus_32 - -@triton.jit -def uint32_to_uniform_float(x): - """ - Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). - """ - x = x.to(tl.int32, bitcast=True) - # maximum value such that `MAX_INT * scale < 1.0` (with float rounding) - scale = 4.6566127342e-10 - x = tl.where(x < 0, -x - 1, x) - return x * scale - - -@triton.jit -def rand(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a block of random :code:`float32` in :math:`U(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - offset = offset.to(tl.uint32, bitcast=True) - source = randint(seed, offset, n_rounds) - return uint32_to_uniform_float(source) - - -@triton.jit -def rand4x(seed, offsets, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offsets` block, - returns a 4 blocks of random :code:`float32` in :math:`U(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. 
- """ - offsets = offsets.to(tl.uint32, bitcast=True) - i1, i2, i3, i4 = randint4x(seed, offsets, n_rounds) - u1 = uint32_to_uniform_float(i1) - u2 = uint32_to_uniform_float(i2) - u3 = uint32_to_uniform_float(i3) - u4 = uint32_to_uniform_float(i4) - return u1, u2, u3, u4 - -# ------------------- -# randn -# ------------------- - - -@triton.jit -def pair_uniform_to_normal(u1, u2): - """Box-Muller transform""" - u1 = tl.maximum(1.0e-7, u1) - th = 6.283185307179586 * u2 - r = tl.sqrt(-2.0 * tl.log(u1)) - return r * tl.cos(th), r * tl.sin(th) - - -@triton.jit -def randn(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a block of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - i1, i2, _, _ = randint4x(seed, offset, n_rounds) - u1 = uint32_to_uniform_float(i1) - u2 = uint32_to_uniform_float(i2) - n1, _ = pair_uniform_to_normal(u1, u2) - return n1 - - -@triton.jit -def randn4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): - """ - Given a :code:`seed` scalar and an :code:`offset` block, - returns a 4 blocks of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`. - - :param seed: The seed for generating random numbers. - :param offsets: The offsets to generate random numbers for. - """ - u1, u2, u3, u4 = rand4x(seed, offset, n_rounds) - n1, n2 = pair_uniform_to_normal(u1, u2) - n3, n4 = pair_uniform_to_normal(u3, u4) - return n1, n2, n3, n4 diff --git a/python/triton/language/semantic.py b/python/triton/language/semantic.py deleted file mode 100644 index 968e57f59385..000000000000 --- a/python/triton/language/semantic.py +++ /dev/null @@ -1,1440 +0,0 @@ -from __future__ import annotations # remove after python 3.11 - -from functools import wraps -from typing import List, Optional, Sequence, Tuple, TypeVar - -from . import core as tl -from triton._C.libtriton.triton import ir - -T = TypeVar('T') - -# Create custom exception that prints message "hello" - - -class IncompatibleTypeErrorImpl(Exception): - def __init__(self, type_a, type_b): - self.type_a = type_a - self.type_b = type_b - self.message = "invalid operands of type " + self.type_a.__repr__() + " and " + self.type_b.__repr__() - super(IncompatibleTypeErrorImpl, self).__init__(self.message) - - -# ===----------------------------------------------------------------------===## -# Programming Model -# ===----------------------------------------------------------------------===## - -def program_id(axis: int, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_get_program_id(axis), tl.int32) - - -def num_programs(axis: int, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_get_num_programs(axis), tl.int32) - -# ===----------------------------------------------------------------------===// -# Implicit Casting Utilities -# ===----------------------------------------------------------------------===// - - -def integer_promote_impl(a_ty: tl.dtype, b_ty: tl.dtype) -> tl.dtype: - a_rank = a_ty.int_bitwidth - b_rank = b_ty.int_bitwidth - a_sn = a_ty.int_signedness - b_sn = b_ty.int_signedness - # Rules for signedness taken from "Usual arithmetic conversions" on - # https://en.cppreference.com/w/c/language/conversion. 
- if a_sn == b_sn: - return a_ty if a_rank > b_rank else b_ty - elif a_sn == tl.dtype.SIGNEDNESS.UNSIGNED: - return a_ty if a_rank >= b_rank else b_ty - elif b_sn == tl.dtype.SIGNEDNESS.UNSIGNED: - return b_ty if b_rank >= a_rank else a_ty - assert False - - -def computation_type_impl(a_ty: tl.dtype, b_ty: tl.dtype, div_or_mod: bool) -> tl.dtype: - # 1) if one operand is double, the other is implicitly - # converted to double - if a_ty.is_fp64() or b_ty.is_fp64(): - return tl.float64 - # 2) if one operand is float, the other is implicitly - # converted to float - if a_ty.is_fp32() or b_ty.is_fp32(): - return tl.float32 - # 3 ) if one operand is half, the other is implicitly converted to half - # unless we're doing / or %, which do not exist natively in PTX for fp16. - # Supported PTX op: add, sub, mul, fma, neg, abs, min, max, tanh, ex2, setp - if a_ty.is_fp16() or b_ty.is_fp16(): - if div_or_mod: - return tl.float32 - else: - return tl.float16 - # 4) return bf16 only if both operands are of bf16 - if a_ty.is_bf16() or b_ty.is_bf16(): - if div_or_mod: - return tl.float32 - if a_ty.is_bf16() and b_ty.is_bf16(): - return tl.bfloat16 - return tl.float32 - if not a_ty.is_int() or not b_ty.is_int(): - assert False - # 5 ) both operands are integer and undergo - # integer promotion - if div_or_mod and a_ty.int_signedness != b_ty.int_signedness: - raise ValueError("Cannot use /, #, or % with " + a_ty.__repr__() + " and " + b_ty.__repr__() + " because they have different signedness;" - "this is unlikely to result in a useful answer. Cast them to the same signedness.") - return integer_promote_impl(a_ty, b_ty) - -# ===----------------------------------------------------------------------===// -# Binary Operators -# ===----------------------------------------------------------------------===// - - -def check_ptr_type_impl(type_a: tl.dtype, type_b: tl.dtype, allow_ptr_a: bool) -> None: - if type_a.is_ptr(): - if not allow_ptr_a: - raise IncompatibleTypeErrorImpl(type_a, type_b) - # T* + U* with T != U - if type_b.is_ptr() and (type_a != type_b): - raise IncompatibleTypeErrorImpl(type_a, type_b) - # T* + float - if type_b.is_floating(): - raise IncompatibleTypeErrorImpl(type_a, type_b) - - -def binary_op_type_checking_impl(lhs: tl.tensor, - rhs: tl.tensor, - builder: ir.builder, - allow_lhs_ptr=False, allow_rhs_ptr=False, - arithmetic_check=True, div_or_mod=False - ) -> Tuple[tl.tensor, tl.tensor]: - # implicit broadcasting - lhs, rhs = broadcast_impl_value(lhs, rhs, builder) - # implicit typecasting - lhs_sca_ty = lhs.type.scalar - rhs_sca_ty = rhs.type.scalar - check_ptr_type_impl(lhs_sca_ty, rhs_sca_ty, allow_lhs_ptr) - check_ptr_type_impl(rhs_sca_ty, lhs_sca_ty, allow_rhs_ptr) - if arithmetic_check and not lhs_sca_ty.is_ptr() and not rhs_sca_ty.is_ptr(): - ret_sca_ty = computation_type_impl(lhs_sca_ty, rhs_sca_ty, div_or_mod) - lhs = cast(lhs, ret_sca_ty, builder) - rhs = cast(rhs, ret_sca_ty, builder) - return lhs, rhs - - -def add(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - - # offset + ptr - # ptr + offset - if other_scalar_ty.is_ptr() and not input_scalar_ty.is_ptr(): - input, other = other, input - if input_scalar_ty.is_ptr(): - return tl.tensor(builder.create_addptr(input.handle, other.handle), input.type) - # float + float - elif input_scalar_ty.is_floating(): - return 
tl.tensor(builder.create_fadd(input.handle, other.handle), input.type) - # int + int - elif input_scalar_ty.is_int(): - return tl.tensor(builder.create_add(input.handle, other.handle), input.type) - assert False - - -def sub(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, True, False) - scalar_ty = input.type.scalar - # ptr - offset - if scalar_ty.is_ptr(): - return tl.tensor(builder.create_addptr(input.handle, minus(other, builder).handle), - input.type) - # float - float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fsub(input.handle, other.handle), input.type) - # int - int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_sub(input.handle, other.handle), input.type) - assert False - - -def mul(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float * float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fmul(input.handle, other.handle), input.type) - # * int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_mul(input.handle, other.handle), input.type) - assert False - - -def truediv(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - # float / int - if input_scalar_ty.is_floating() and other_scalar_ty.is_int(): - other = cast(other, input_scalar_ty, builder) - # int / float - elif input_scalar_ty.is_int() and other_scalar_ty.is_floating(): - input = cast(input, other_scalar_ty, builder) - # int / int (cast to tl.float32) - elif input_scalar_ty.is_int() and other_scalar_ty.is_int(): - input = cast(input, tl.float32, builder) - other = cast(other, tl.float32, builder) - # float / float (cast to highest exponent type) - elif input_scalar_ty.is_floating() and other_scalar_ty.is_floating(): - if input_scalar_ty.fp_mantissa_width > other_scalar_ty.fp_mantissa_width: - other = cast(other, input_scalar_ty, builder) - else: - input = cast(input, other_scalar_ty, builder) - # unreachable - else: - assert False - return tl.tensor(builder.create_fdiv(input.handle, other.handle), input.type) - - -def floordiv(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - if input_scalar_ty.is_int() and other_scalar_ty.is_int(): - ret_ty = integer_promote_impl(input_scalar_ty, other_scalar_ty) - input = cast(input, ret_ty, builder) - other = cast(other, ret_ty, builder) - if ret_ty.is_int_signed(): - return tl.tensor(builder.create_sdiv(input.handle, other.handle), input.type) - else: - return tl.tensor(builder.create_udiv(input.handle, other.handle), input.type) - assert False - - -def fdiv(input: tl.tensor, - other: tl.tensor, - ieee_rounding: bool, - builder: ir.builder) -> tl.tensor: - input_scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - if not input_scalar_ty.is_floating() or not other_scalar_ty.is_floating(): - raise ValueError("both operands of fdiv must have floating scalar type") - input, other = binary_op_type_checking_impl(input, other, builder, False, False, False, True) - ret = builder.create_fdiv(input.handle, 
other.handle) - return tl.tensor(ret, input.type) - - -def mod(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) - scalar_ty = input.type.scalar - other_scalar_ty = other.type.scalar - # float % float - if scalar_ty.is_floating(): - # input - input.div(other, rounding_mode="floor") * other - ret = sub(input, mul(floor(fdiv(input, other, False, builder), builder), - other, builder), - builder) - return ret - # % int - elif scalar_ty.is_int(): - if scalar_ty.int_signedness != other_scalar_ty.int_signedness: - raise ValueError("Cannot mod " + scalar_ty.__repr__() + " by " + other_scalar_ty.__repr__() + " " - "because they have different signedness;" - "this is unlikely to result in a useful answer. Cast them to the same signedness.") - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_srem(input.handle, other.handle), input.type) - else: - return tl.tensor(builder.create_urem(input.handle, other.handle), input.type) - assert False - -############## -# bitwise ops -############## - - -def bitwise_op_type_checking_impl(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> Tuple[tl.tensor, tl.tensor]: - input, other = binary_op_type_checking_impl(input, other, builder, False, False, False) - input_sca_ty = input.type.scalar - other_sca_ty = other.type.scalar - if not input_sca_ty.is_int() or not other_sca_ty.is_int(): - raise IncompatibleTypeErrorImpl(input_sca_ty, other_sca_ty) - ret_sca_ty = integer_promote_impl(input_sca_ty, other_sca_ty) - if ret_sca_ty != input_sca_ty: - input = cast(input, ret_sca_ty, builder) - if ret_sca_ty != other_sca_ty: - other = cast(other, ret_sca_ty, builder) - return input, other - - -def and_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_and(input.handle, other.handle), input.type) - - -def or_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_or(input.handle, other.handle), input.type) - - -def xor_(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_xor(input.handle, other.handle), input.type) - - -def logical_and(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - if not other.type.is_int1(): - other = bitcast(other, tl.dtype("int1"), builder) - return and_(input, other, builder) - - -def logical_or(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - if not other.type.is_int1(): - other = bitcast(other, tl.dtype("int1"), builder) - return or_(input, other, builder) - - -def not_(input: tl.tensor, builder: ir.builder): - if not input.type.is_int1(): - input = bitcast(input, tl.dtype("int1"), builder) - return invert(input, builder) - - -def lshr(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_lshr(input.handle, other.handle), input.type) - - -def ashr(input: tl.tensor, - other: tl.tensor, - builder: 
ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_ashr(input.handle, other.handle), input.type) - - -def shl(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = bitwise_op_type_checking_impl(input, other, builder) - return tl.tensor(builder.create_shl(input.handle, other.handle), input.type) - -# ===----------------------------------------------------------------------===// -# Unary Operators -# ===----------------------------------------------------------------------===// - - -def plus(input: tl.tensor) -> tl.tensor: - return input - - -def minus(input: tl.tensor, - builder: ir.builder) -> tl.tensor: - input_sca_ty = input.type.scalar - if input_sca_ty.is_ptr(): - raise ValueError("wrong type argument to unary minus (" + input_sca_ty.__repr__() + ")") - _0 = tl.tensor(builder.get_null_value(input_sca_ty.to_ir(builder)), input_sca_ty) - return sub(_0, input, builder) - - -def invert(input: tl.tensor, - builder: tl.tensor) -> tl.tensor: - input_sca_ty = input.type.scalar - if input_sca_ty.is_ptr() or input_sca_ty.is_floating(): - raise ValueError("wrong type argument to unary invert (" + input_sca_ty.__repr__() + ")") - _1 = tl.tensor(builder.get_all_ones_value(input_sca_ty.to_ir(builder)), input_sca_ty) - return xor_(input, _1, builder) - - -# ===----------------------------------------------------------------------===// -# Comparison Operators -# ===----------------------------------------------------------------------===// -def _bool_like(v: tl.tensor) -> tl.block_type: - if not v.type.is_block(): - return tl.int1 - shape = v.type.shape - return tl.block_type(tl.int1, shape) - - -def greater_than(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float > float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOGT(input.handle, other.handle), _bool_like(input)) - # > int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSGT(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpUGT(input.handle, other.handle), _bool_like(input)) - assert False - - -def greater_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float >= float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOGE(input.handle, other.handle), _bool_like(input)) - # >= int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSGE(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpUGE(input.handle, other.handle), _bool_like(input)) - assert False - - -def less_than(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float < float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOLT(input.handle, other.handle), _bool_like(input)) - # < int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSLT(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpULT(input.handle, other.handle), _bool_like(input)) - assert False - - 
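# Note on the comparison helpers in this block: floating-point comparisons
# lower to *ordered* predicates (fcmpOGT/OGE/OLT/OLE/OEQ), so any comparison
# involving NaN evaluates to False, whereas `not_equal` uses the unordered UNE
# predicate, so NaN != x evaluates to True. Integer comparisons pick the signed
# (S*) or unsigned (U*) predicate from the operands' signedness after promotion.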
-def less_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float < float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOLE(input.handle, other.handle), _bool_like(input)) - # < int - elif scalar_ty.is_int(): - if scalar_ty.is_int_signed(): - return tl.tensor(builder.create_icmpSLE(input.handle, other.handle), _bool_like(input)) - else: - return tl.tensor(builder.create_icmpULE(input.handle, other.handle), _bool_like(input)) - assert False - - -def equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float == float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpOEQ(input.handle, other.handle), _bool_like(input)) - # == int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_icmpEQ(input.handle, other.handle), _bool_like(input)) - assert False - - -def not_equal(input: tl.tensor, - other: tl.tensor, - builder: ir.builder) -> tl.tensor: - input, other = binary_op_type_checking_impl(input, other, builder) - scalar_ty = input.type.scalar - # float == float - if scalar_ty.is_floating(): - return tl.tensor(builder.create_fcmpUNE(input.handle, other.handle), _bool_like(input)) - # == int - elif scalar_ty.is_int(): - return tl.tensor(builder.create_icmpNE(input.handle, other.handle), _bool_like(input)) - assert False - -# ===----------------------------------------------------------------------===// -# Block Creation -# ===----------------------------------------------------------------------===// - - -def arange(start: int, end: int, builder: ir.builder) -> tl.tensor: - if not isinstance(start, int) or not isinstance(end, int): - raise ValueError("arange's arguments must be of type tl.constexpr") - is_start_int64 = bool(start >> 32) - is_end_int64 = bool(end >> 32) - if is_start_int64 or is_end_int64: - raise ValueError("arange must fit in int32") - if end <= start: - raise ValueError("arange's end argument must be greater than the start argument") - - shape = [end - start] - ret_ty = tl.block_type(tl.int32, shape) - return tl.tensor(builder.create_make_range(start, end), ret_ty) - - -def full(shape: List[int], value, dtype: tl.dtype, builder: ir.builder) -> tl.tensor: - if isinstance(value, tl.tensor): - assert value.numel.value == 1, "only accepts size-1 tensor" - value = cast(value, dtype, builder) - ret_ty = tl.block_type(value.dtype, shape) - return tl.tensor(builder.create_splat(value.handle, shape), ret_ty) - else: - # scalar - if value == 0: - value = builder.get_null_value(dtype.to_ir(builder)) - else: - get_value_fn = getattr(builder, f"get_{dtype.name}") - value = get_value_fn(value) - if dtype is None: - raise ValueError("dtype must be specified when value is not a tensor") - ret_ty = tl.block_type(dtype, shape) - return tl.tensor(builder.create_splat(value, shape), ret_ty) - - -# ===----------------------------------------------------------------------===// -# Shape Manipulation -# ===----------------------------------------------------------------------===// - - -def view(input: tl.tensor, - dst_shape: List[int], - builder: ir.builder) -> tl.tensor: - # TODO: disable when TritonToTritonGPU handles views properly - - # assert len(input.shape) == len(dst_shape) - numel = 1 - for s in dst_shape: - numel *= s - if input.type.numel != numel: - raise ValueError("cannot view 
block of different shape") - ret_ty = tl.block_type(input.type.scalar, dst_shape) - return tl.tensor(builder.create_view(input.handle, dst_shape), ret_ty) - - -def reshape(input: tl.tensor, - dst_shape: List[int], - builder: ir.builder) -> tl.tensor: - raise ValueError("`reshape` is not supported yet. Please use `view` instead if applicable. " - "Note that view may reorder elements in an implementation- and context- dependent way.") - - -def expand_dims(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: - dst_shape = list(input.type.shape) - dst_shape.insert(axis, 1) - ret_ty = tl.block_type(input.type.scalar, dst_shape) - return tl.tensor(builder.create_expand_dims(input.handle, axis), ret_ty) - - -def cat(lhs: tl.tensor, rhs: tl.tensor, can_reorder: bool, builder: ir.builder) -> tl.tensor: - assert can_reorder, "current implementation of `cat` always may reorder elements" - assert len(lhs.shape) == 1 - ret_type = tl.block_type(lhs.type.scalar, [lhs.shape[0] + rhs.shape[0]]) - return tl.tensor(builder.create_cat(lhs.handle, rhs.handle), ret_type) - - -def trans(input: tl.tensor, builder: ir.builder) -> tl.tensor: - if len(input.shape) != 2: - raise ValueError("Only 2D tensors can be transposed") - ret_type = tl.block_type(input.type.scalar, [input.shape[1], input.shape[0]]) - return tl.tensor(builder.create_trans(input.handle), ret_type) - - -def broadcast_impl_shape(input: tl.tensor, - shape: List[int], - builder: ir.builder) -> tl.tensor: - if not input.type.is_block(): - ret_ty = tl.block_type(input.type, shape) - return tl.tensor(builder.create_splat(input.handle, shape), ret_ty) - src_shape = input.type.get_block_shapes() - if len(src_shape) != len(shape): - raise ValueError(f"Cannot broadcast, rank mismatch: {src_shape}, {shape}") - if shape == src_shape: - return input - for i, item in enumerate(src_shape): - if shape[i] != item and item != 1: - raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})" - f" must match the existing size ({item}) at non-singleton dimension" - f" {i}: {src_shape}, {shape}") - ret_ty = tl.block_type(input.type.scalar, shape) - return tl.tensor(builder.create_broadcast(input.handle, shape), ret_ty) - - -def broadcast_impl_value(lhs: tl.tensor, - rhs: tl.tensor, - builder: ir.builder) -> tl.tensor: - lhs_ty = lhs.type - rhs_ty = rhs.type - - # make_shape_compatible(block, scalar) - if lhs_ty.is_block() and not rhs_ty.is_block(): - rhs_ty = tl.block_type(rhs_ty.scalar, lhs_ty.shape) - rhs = tl.tensor(builder.create_splat(rhs.handle, lhs_ty.get_block_shapes()), rhs_ty) - # make_shape_compatible(scalar, block) - elif not lhs_ty.is_block() and rhs_ty.is_block(): - lhs_ty = tl.block_type(lhs_ty.scalar, rhs_ty.shape) - lhs = tl.tensor(builder.create_splat(lhs.handle, rhs_ty.get_block_shapes()), lhs_ty) - # make_shape_compatible(block, block) - elif lhs_ty.is_block() and rhs_ty.is_block(): - lhs_shape = lhs_ty.get_block_shapes() - rhs_shape = rhs_ty.get_block_shapes() - - if len(lhs_shape) < len(rhs_shape): - # Add new axes to lhs - for dim in range(len(lhs_shape), len(rhs_shape)): - lhs = tl.tensor(builder.create_expand_dims(lhs.handle, 0), tl.block_type(lhs_ty.scalar, [1] + lhs_shape)) - lhs_ty = lhs.type - lhs_shape = lhs_ty.get_block_shapes() - elif len(rhs_shape) < len(lhs_shape): - # Add new axes to rhs - for dim in range(len(rhs_shape), len(lhs_shape)): - rhs = tl.tensor(builder.create_expand_dims(rhs.handle, 0), tl.block_type(rhs_ty.scalar, [1] + rhs_shape)) - rhs_ty = rhs.type - rhs_shape = 
rhs_ty.get_block_shapes() - assert len(rhs_shape) == len(lhs_shape) - - ret_shape = [] - for i, left in enumerate(lhs_shape): - right = rhs_shape[i] - if left == 1: - ret_shape.append(right) - elif right == 1: - ret_shape.append(left) - elif left == right: - ret_shape.append(left) - else: - raise ValueError("Cannot make_shape_compatible: incompatible dimensions " - "at index " + str(i) + ": " + str(left) + " and " + str(right)) - if lhs_shape != ret_shape: - ret_ty = tl.block_type(lhs_ty.scalar, ret_shape) - lhs = tl.tensor(builder.create_broadcast(lhs.handle, ret_shape), ret_ty) - if rhs_shape != ret_shape: - ret_ty = tl.block_type(rhs_ty.scalar, ret_shape) - rhs = tl.tensor(builder.create_broadcast(rhs.handle, ret_shape), ret_ty) - # (scalar, scalar) => returns original blocks - return lhs, rhs - -####### -# cast -####### - - -def bitcast(input: tl.tensor, - dst_ty: tl.dtype, - builder: ir.builder) -> tl.tensor: - src_ty = input.type - if src_ty.is_block(): - dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) - if src_ty == dst_ty: - return input - src_sca_ty = src_ty.scalar - dst_sca_ty = dst_ty.scalar - if src_sca_ty.is_ptr() or dst_sca_ty.is_ptr(): - return cast(input, dst_ty, builder) - # Bitcast - src_bits = src_sca_ty.primitive_bitwidth - dst_bits = dst_sca_ty.primitive_bitwidth - if src_bits != dst_bits: - raise ValueError("Cannot bitcast data-type of size " + str(src_bits) + " to " - "data-type of size " + str(dst_bits)) - return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), - dst_ty) - - -def cast(input: tl.tensor, - dst_ty: tl.dtype, - builder: ir.builder) -> tl.tensor: - src_ty = input.type - if isinstance(dst_ty, tl.constexpr): - dst_ty = dst_ty.value - if src_ty.is_block(): - dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) - if src_ty == dst_ty: - return input - - src_sca_ty = src_ty.scalar - dst_sca_ty = dst_ty.scalar - - # Casting with customized floating types involved: fp8 <=> bf16, fp16, fp32, fp64 - if (src_sca_ty.is_fp8() and dst_sca_ty.is_floating()) or \ - (src_sca_ty.is_floating() and dst_sca_ty.is_fp8()): - return tl.tensor(builder.create_fp_to_fp(input.handle, dst_ty.to_ir(builder)), - dst_ty) - - # bf16 <=> (not fp32) - if (src_sca_ty.is_fp16() and not dst_sca_ty.is_fp32()) or \ - (src_sca_ty.is_bf16() and not dst_sca_ty.is_fp32()): - return cast(cast(input, tl.float32, builder), dst_sca_ty, builder) - - # Standard floating types' casting: truncation - # fp64 => fp32, fp16, bf16 - # fp32 => fp16, bf16 - truncate_fp = src_sca_ty.is_floating() and \ - dst_sca_ty.is_floating() and \ - src_sca_ty.primitive_bitwidth > dst_sca_ty.primitive_bitwidth - if truncate_fp: - return tl.tensor(builder.create_fp_trunc(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Standard floating types' casting: extension - # fp32 => fp64 - # fp16 => fp32, fp64 - # bf16 => fp32, fp64 - ext_fp = src_sca_ty.is_floating() and \ - dst_sca_ty.is_floating() and \ - src_sca_ty.primitive_bitwidth < dst_sca_ty.primitive_bitwidth - if ext_fp: - return tl.tensor(builder.create_fp_ext(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting between integer types - if src_sca_ty.is_int() and dst_sca_ty.is_int() and \ - (src_sca_ty.int_bitwidth != dst_sca_ty.int_bitwidth or src_sca_ty.int_signedness != dst_sca_ty.int_signedness): - sign_extend = src_sca_ty.is_int_signed() and not src_sca_ty.is_bool() - if dst_sca_ty.is_bool(): - ty = input.dtype.to_ir(builder) - _0 = tl.tensor(builder.get_null_value(ty), input.dtype) - return 
not_equal(input, _0, builder) - else: - return tl.tensor(builder.create_int_cast(input.handle, - dst_ty.to_ir(builder), sign_extend), - dst_ty) - - # Casting standard floating types to integer types - if src_sca_ty.is_standard_floating() and dst_sca_ty.is_int(): - if dst_sca_ty.is_bool(): - ty = input.dtype.to_ir(builder) - _0 = tl.tensor(builder.get_null_value(ty), input.dtype) - return not_equal(input, _0, builder) - elif dst_sca_ty.is_int_signed(): - return tl.tensor(builder.create_fp_to_si(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - else: - return tl.tensor(builder.create_fp_to_ui(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting integer types to standard floating types - if src_sca_ty.is_int() and dst_sca_ty.is_standard_floating(): - if src_sca_ty.is_bool() or not src_sca_ty.is_int_signed(): - return tl.tensor(builder.create_ui_to_fp(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - else: - return tl.tensor(builder.create_si_to_fp(input.handle, - dst_ty.to_ir(builder)), - dst_ty) - - # Casting pointer types to integer types - if src_sca_ty.is_ptr() and dst_sca_ty.is_int(): - bitwidth = dst_sca_ty.int_bitwidth - if bitwidth == 64: - return tl.tensor(builder.create_ptr_to_int(input.handle, dst_ty.to_ir(builder)), - dst_ty) - if bitwidth == 1: - return not_equal(cast(input, tl.int64, builder), - tl.tensor(builder.get_int64(0), tl.int64), - builder) - - # Casting integer types to pointer types - if src_sca_ty.is_int() and dst_sca_ty.is_ptr(): - return tl.tensor(builder.create_int_to_ptr(input.handle, dst_ty.to_ir(builder)), dst_ty) - - # Casting pointer types to pointer types - if src_sca_ty.is_ptr() and dst_sca_ty.is_ptr(): - return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) - - assert False, f'cannot cast {input} to {dst_ty}' - -# ===----------------------------------------------------------------------===// -# Memory Operators -# ===----------------------------------------------------------------------===// - - -def _str_to_cache_modifier(cache_modifier): - cache = ir.CACHE_MODIFIER.NONE # default - if cache_modifier: - if cache_modifier == ".ca": - cache = ir.CACHE_MODIFIER.CA - elif cache_modifier == ".cg": - cache = ir.CACHE_MODIFIER.CG - else: - raise ValueError(f"Cache modifier {cache_modifier} not supported") - return cache - - -def _str_to_eviction_policy(eviction_policy): - eviction = ir.EVICTION_POLICY.NORMAL # default - if eviction_policy: - if eviction_policy == "evict_last": - eviction = ir.EVICTION_POLICY.EVICT_LAST - elif eviction_policy == "evict_first": - eviction = ir.EVICTION_POLICY.EVICT_FIRST - else: - raise ValueError(f"Eviction policy {eviction_policy} not supported") - return eviction - - -def _str_to_padding_option(padding_option): - padding = None # default - if padding_option: - if padding_option == "zero": - padding = ir.PADDING_OPTION.PAD_ZERO - elif padding_option == "nan": - padding = ir.PADDING_OPTION.PAD_NAN - else: - raise ValueError(f"Padding option {padding_option} not supported") - return padding - - -def _canonicalize_boundary_check(boundary_check, block_shape): - if boundary_check: - if not hasattr(boundary_check, "__iter__"): - boundary_check = [boundary_check] - boundary_check = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in boundary_check] - for dim in boundary_check: - assert isinstance(dim, int) and 0 <= dim < len(block_shape) - assert len(boundary_check) > 0 - assert len(boundary_check) == len(set(boundary_check)), "Duplicate dimension in `boundary_check`" - 
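        # At this point `boundary_check` is a non-empty, duplicate-free list of
        # in-range ints; e.g. a single `tl.constexpr(1)` or the tuple (1, 0)
        # have both been unwrapped to plain ints and are returned sorted.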
return sorted(boundary_check) - return tuple() - - -def _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): - # Load by a block pointer: `pointer_type>` - # Block pointer can not have `mask` and `other` arguments - if mask or other: - raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") - - elt_ty = ptr.type.element_ty.element_ty - assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" - if elt_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN: - raise ValueError("Padding option `nan` is not supported for integer block pointers") - - # `dst_ty` is de-referenced type of the pointer type - dst_ty = ptr.type.element_ty - - # Check `boundary_check` argument - boundary_check = _canonicalize_boundary_check(boundary_check, dst_ty.get_block_shapes()) - - # Build IR - return tl.tensor(builder.create_tensor_pointer_load(ptr.handle, boundary_check, padding, cache, eviction, - is_volatile), dst_ty) - - -def _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): - # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - if not ptr.type.scalar.is_ptr(): - raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.load`") - - # Check `mask`, `other`, `boundary_check`, and `padding` arguments - if not mask and other: - raise ValueError("`other` cannot be provided without `mask`") - if padding or boundary_check: - raise ValueError("`padding_option` or `boundary_check` argument is not supported for loading a tensor of" - "pointers or loading a scalar. Because the compiler does not know the boundary; please " - "use block pointers (defined by `make_block_ptr`) instead") - - # For a pointer of scalar, check the type of `mask` and `other` - if not ptr.type.is_block(): - if mask and mask.type.is_block(): - raise ValueError("Mask argument cannot be block type if pointer argument is not a block") - if other and other.type.is_block(): - raise ValueError("Other argument cannot be block type if pointer argument is not a block") - - # Make `mask` and `other` into the same shape as `ptr` - if ptr.type.is_block(): - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - if other: - other = broadcast_impl_shape(other, ptr.type.get_block_shapes(), builder) - - # Get `pointer_type` and `elt_ty` - ptr_ty = ptr.type.scalar - elt_ty = ptr_ty.element_ty - - # Treat `pointer_type` as `pointer_type` - if elt_ty == tl.int1: - elt_ty = tl.int8 - ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) - ptr = cast(ptr, ptr_ty, builder) - - # Cast `other` into `ele_ty` type - if other: - other = cast(other, elt_ty, builder) - - # Create loaded result type `dst_ty` - if ptr.type.is_block(): - shape = ptr.type.get_block_shapes() - dst_ty = tl.block_type(elt_ty, shape) - else: - # Load by de-referencing the pointer of scalar - dst_ty = elt_ty - - # Build IR - if not mask: - return tl.tensor(builder.create_load(ptr.handle, cache, eviction, is_volatile), dst_ty) - else: - return tl.tensor(builder.create_masked_load(ptr.handle, mask.handle, other.handle if other else None, cache, - eviction, is_volatile), dst_ty) - - -def load(ptr: tl.tensor, - mask: Optional[tl.tensor], - other: Optional[tl.tensor], - boundary_check, - padding_option: str, - cache_modifier: str, - eviction_policy: str, - is_volatile: bool, - builder: ir.builder) -> tl.tensor: - # Cache, eviction and padding options - cache = 
_str_to_cache_modifier(cache_modifier) - eviction = _str_to_eviction_policy(eviction_policy) - padding = _str_to_padding_option(padding_option) - - if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): - # Load by a block pointer: `pointer_type>` - return _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) - else: - # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - return _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) - - -def _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder): - # Store by a block pointer: `pointer_type>` - # Block pointers can not have the `mask` argument - if mask: - raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") - - # Check same shape and element type - block_shape = ptr.type.element_ty.get_block_shapes() - if not val.type.is_block(): - val = broadcast_impl_shape(val, block_shape, builder) - assert val.type.is_block(), "Value argument must be block type or a scalar" - assert block_shape == val.type.get_block_shapes(), "Block shape and value shape mismatch" - assert ptr.type.element_ty.element_ty == val.type.element_ty, "Block element type and value element type mismatch" - - elt_ty = ptr.type.element_ty.element_ty - assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" - - # Check `boundary_check` argument - boundary_check = _canonicalize_boundary_check(boundary_check, block_shape) - - # Build IR - return tl.tensor(builder.create_tensor_pointer_store(ptr.handle, val.handle, boundary_check, cache, eviction), - tl.void) - - -def _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder): - # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - if not ptr.type.scalar.is_ptr(): - raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.store`") - - # Check `boundary_check` argument - if boundary_check: - raise ValueError("`boundary_check` argument is not supported for storing a tensor of pointers or storing a " - "scalar. 
Because the compiler does not know the boundary; please use block pointers " - "(defined by `make_block_ptr`) instead") - - # For a pointer of scalar, check the type of `val` and `mask` - if not ptr.type.is_block(): - if val.type.is_block(): - raise ValueError("Value argument cannot be block type if pointer argument is not a block") - if mask and mask.type.is_block(): - raise ValueError("Mask argument cannot be block type if pointer argument is not a block") - - # Make `mask` and `val` into the same shape as `ptr` - if ptr.type.is_block(): - val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - - ptr_ty = ptr.type.scalar - elt_ty = ptr_ty.element_ty - - # Treat `pointer_type` as `pointer_type` - if elt_ty == tl.int1: - elt_ty = tl.int8 - ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) - ptr = cast(ptr, ptr_ty, builder) - - # Cast to target data type - val = cast(val, elt_ty, builder) - - # Build IR - if not mask: - return tl.tensor(builder.create_store(ptr.handle, val.handle, cache, eviction), tl.void) - if not mask.type.scalar.is_bool(): - raise ValueError("Mask must have boolean scalar type") - return tl.tensor(builder.create_masked_store(ptr.handle, val.handle, mask.handle, cache, eviction), tl.void) - - -def store(ptr: tl.tensor, - val: tl.tensor, - mask: Optional[tl.tensor], - boundary_check, - cache_modifier: str, - eviction_policy: str, - builder: ir.builder) -> tl.tensor: - # Cache and eviction options - cache = _str_to_cache_modifier(cache_modifier) - eviction = _str_to_eviction_policy(eviction_policy) - - if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): - # Store by a block pointer: `pointer_type>` - return _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder) - else: - # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` - return _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder) - - -######### -# atomic -######### - - -def atomic_cas(ptr: tl.tensor, - cmp: tl.tensor, - val: tl.tensor, - builder: ir.builder) -> tl.tensor: - element_ty = ptr.type.scalar.element_ty - if element_ty.primitive_bitwidth not in [16, 32, 64]: - raise ValueError("atomic_cas only supports elements with width {16, 32, 64}") - return tl.tensor(builder.create_atomic_cas(ptr.handle, cmp.handle, val.handle), val.type) - - -def atom_red_typechecking_impl(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - op: str, - builder: ir.builder) -> Tuple[tl.tensor, tl.tensor, tl.tensor]: - if not ptr.type.scalar.is_ptr(): - raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__()) - - element_ty = ptr.type.scalar.element_ty - if element_ty is tl.float16 and op != 'add': - raise ValueError("atomic_" + op + " does not support fp16") - if element_ty in [tl.int1, tl.int8, tl.int16, tl.bfloat16]: - raise ValueError("atomic_" + op + " does not support " + str(element_ty)) - if ptr.type.is_block(): - if mask: - mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) - if val: - val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) - val = cast(val, ptr.type.scalar.element_ty, builder) - if not mask: - mask_ir = builder.get_int1(True) - mask_ty = tl.int1 - if ptr.type.is_block(): - mask_ir = builder.create_splat(mask_ir, ptr.type.get_block_shapes()) - mask_ty = tl.block_type(tl.int1, ptr.type.get_block_shapes()) - mask = tl.tensor(mask_ir, mask_ty) - return ptr, val, 
mask - - -def atomic_max(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'max', builder) - sca_ty = val.type.scalar - # direct call to atomic_max for integers - if sca_ty.is_int(): - if sca_ty.is_int_signed(): - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, - ptr.handle, - val.handle, - mask.handle), - val.type) - else: - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, - ptr.handle, - val.handle, - mask.handle), - val.type) - # for float - # return atomic_smax(i_ptr, i_val) if val >= 0 - # return atomic_umin(i_ptr, i_val) if val < 0 - i_val = bitcast(val, tl.int32, builder) - i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) - pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, i_ptr.handle, i_val.handle, and_(mask, pos, builder).handle), i_val.type) - neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, i_ptr.handle, i_val.handle, and_(mask, neg, builder).handle), i_val.type) - return where(pos, pos_ret, neg_ret, builder) - - -def atomic_min(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'min', builder) - sca_ty = val.type.scalar - # direct call to atomic_min for integers - if sca_ty.is_int(): - if sca_ty.is_int_signed(): - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, - ptr.handle, - val.handle, - mask.handle), - val.type) - else: - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, - ptr.handle, - val.handle, - mask.handle), - val.type) - # for float - # return atomic_smin(i_ptr, i_val) if val >= 0 - # return atomic_umax(i_ptr, i_val) if val < 0 - i_val = bitcast(val, tl.int32, builder) - i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) - pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) - pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, - i_ptr.handle, - i_val.handle, - and_(mask, pos, builder).handle), - i_val.type) - neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, - i_ptr.handle, - i_val.handle, - and_(mask, neg, builder).handle), - i_val.type) - return where(pos, pos_ret, neg_ret, builder) - - -def atomic_add(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'add', builder) - sca_ty = val.type.scalar - op = ir.ATOMIC_OP.FADD if sca_ty.is_floating() else ir.ATOMIC_OP.ADD - return tl.tensor(builder.create_atomic_rmw(op, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_and(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'and', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.AND, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_or(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'or', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.OR, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_xor(ptr: tl.tensor, - val: tl.tensor, - mask: 
tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xor', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XOR, ptr.handle, val.handle, mask.handle), val.type) - - -def atomic_xchg(ptr: tl.tensor, - val: tl.tensor, - mask: tl.tensor, - builder: ir.builder) -> tl.tensor: - ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xchg', builder) - return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XCHG, ptr.handle, val.handle, mask.handle), val.type) - -# ===----------------------------------------------------------------------===// -# Linear Algebra -# ===----------------------------------------------------------------------===// - - -def dot(lhs: tl.tensor, - rhs: tl.tensor, - allow_tf32: bool, - out_dtype: tl.dtype, - builder: ir.builder) -> tl.tensor: - assert lhs.type.is_block() and rhs.type.is_block() - assert lhs.dtype == rhs.dtype, "lhs and rhs must have the same dtype!" - assert len(lhs.shape) == 2 and len(rhs.shape) == 2 - assert lhs.shape[1].value == rhs.shape[0].value - assert lhs.shape[0].value >= 16 and lhs.shape[1].value >= 16 \ - and rhs.shape[1].value >= 16,\ - "small blocks not supported!" - if lhs.type.scalar.is_int(): - assert lhs.type.scalar == tl.int8, "only int8 supported!" - # TODO: This is CUDA specific, check if ROCm has the same limitation - assert lhs.shape[1].value >= 32, "small blocks not supported!" - _0 = builder.get_int32(0) - ret_scalar_ty = tl.int32 - elif lhs.type.scalar.is_fp32() or lhs.type.scalar.is_bf16(): - _0 = builder.get_fp32(0) - ret_scalar_ty = tl.float32 - else: - _0 = builder.get_fp16(0) if out_dtype.is_fp16() else builder.get_fp32(0) - ret_scalar_ty = out_dtype - - M = lhs.type.shape[0] - N = rhs.type.shape[1] - _0 = builder.create_splat(_0, [M, N]) - ret_ty = tl.block_type(ret_scalar_ty, [M, N]) - return tl.tensor(builder.create_dot(lhs.handle, rhs.handle, _0, allow_tf32), - ret_ty) - - -# ===----------------------------------------------------------------------===// -# Indexing -# ===----------------------------------------------------------------------===// - -def where(condition: tl.tensor, - x: tl.tensor, - y: tl.tensor, - builder: ir.builder) -> tl.tensor: - condition = cast(condition, tl.int1, builder) - if condition.type.is_block(): - condition, x = broadcast_impl_value(condition, x, builder) - x, y = broadcast_impl_value(x, y, builder) - condition, x = broadcast_impl_value(condition, x, builder) - - x, y = binary_op_type_checking_impl(x, y, builder, True, True) - if not condition.type.is_block(): - condition, _ = broadcast_impl_value(condition, x, builder) - ret_ty = x.type - return tl.tensor(builder.create_select(condition.handle, x.handle, y.handle), ret_ty) - -# ===----------------------------------------------------------------------===// -# Reduction -# ===----------------------------------------------------------------------=== - - -def reduction( - inputs: Sequence[tl.tensor], axis: int, region_builder_fn, builder: ir.builder -) -> Tuple[tl.tensor, ...]: - # get result shape - shape = inputs[0].type.shape - ret_shape = [s for i, s in enumerate(shape) if i != axis] - for t in inputs: - assert t.type.shape == shape - - def wrap_tensor(x, scalar_ty): - if ret_shape: - res_ty = tl.block_type(scalar_ty, ret_shape) - else: - # 0d-tensor -> scalar - res_ty = scalar_ty - return tl.tensor(x, res_ty) - - reduce_op = builder.create_reduce([t.handle for t in inputs], axis) - region_builder_fn(reduce_op) - reduce_op.verify() - - return tuple( - 
wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar) - for i in range(len(inputs)) - ) - - -# ===----------------------------------------------------------------------=== -# Math -# ===----------------------------------------------------------------------=== - -def _check_dtype(dtypes: List[str]) -> T: - """ - We following libdevice's convention to check accepted data types for math functions. - It is not a good practice to support all data types as accelerators/GPUs don't support - many float16 and bfloat16 math operations. - We should let the users know that they are using and invoke explicit cast to convert - the data type to the supported one. - """ - def wrapper(fn): - @wraps(fn) - def check(*args, **kwargs): - # concatenate args and kwargs - all_args = list(args) + list(kwargs.values()) - for arg in [a for a in all_args if isinstance(a, tl.tensor)]: - if arg.type.scalar.name not in dtypes: - raise ValueError(f"Expected dtype {dtypes} but got {arg.type.scalar.name}") - return fn(*args, **kwargs) - return check - - return wrapper - - -def umulhi(x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: - x, y = binary_op_type_checking_impl(x, y, builder) - # FIXME(Keren): not portable, should be fixed - from . import math - return math.mulhi(x, y, _builder=builder) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def floor(x: tl.tensor, builder: ir.builder) -> tl.tensor: - # FIXME(Keren): not portable, should be fixed - from . import math - return math.floor(x, _builder=builder) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def exp(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_exp(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def log(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_log(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def cos(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_cos(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def sin(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_sin(x.handle), x.type) - - -@_check_dtype(dtypes=["fp32", "fp64"]) -def sqrt(x: tl.tensor, builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_sqrt(x.handle), x.type) - - -def abs(x: tl.tensor, builder: ir.builder) -> tl.tensor: - dtype = x.dtype - if dtype.is_floating(): - return tl.tensor(builder.create_fabs(x.handle), x.type) - elif dtype.is_int_signed(): - return tl.tensor(builder.create_iabs(x.handle), x.type) - elif dtype.is_int_unsigned(): - return x # no-op - else: - assert False, f"Unexpected dtype {dtype}" - - -## - - -def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor: - if len(x.shape) != len(values): - raise ValueError("Shape of input to multiple_of does not match the length of values") - x.handle.set_attr("tt.divisibility", ir.make_attr(values, x.handle.get_context())) - return x - - -def max_contiguous(x: tl.tensor, values: List[int]) -> tl.tensor: - if len(x.shape) != len(values): - raise ValueError("Shape of input to max_contiguous does not match the length of values") - x.handle.set_attr("tt.contiguity", ir.make_attr(values, x.handle.get_context())) - return x - - -def debug_barrier(builder: ir.builder) -> tl.tensor: - return tl.tensor(builder.create_barrier(), tl.void) - - -def device_print(prefix: str, args: List[tl.tensor], builder: ir.builder) -> tl.tensor: - new_args = [] - for arg in args: - new_args.append(arg.handle) - return 
tl.tensor(builder.create_print(prefix, new_args), tl.void) - - -def device_assert(cond: tl.tensor, msg: str, file_name: str, func_name, lineno: int, builder: ir.builder) -> tl.tensor: - cond_ty = cond.type - if not cond_ty.is_block(): - cond_ty = tl.block_type(cond_ty.scalar, (1,)) - cond = tl.tensor(builder.create_splat(cond.handle, (1,)), cond_ty) - return tl.tensor(builder.create_assert(cond.handle, msg, file_name, func_name, lineno), tl.void) - - -def _convert_elem_to_ir_value(builder, elem, require_i64): - if isinstance(elem, tl.constexpr): - return builder.get_int64(elem.value) if require_i64 else builder.get_int32(elem.value) - elif isinstance(elem, tl.tensor): - assert elem.numel.value == 1, "Expected a scalar in shape/strides/offsets" - assert elem.dtype.is_int(), "Expected an integer scalar type in shape/strides/offsets" - if elem.dtype != tl.int64 and require_i64: - return builder.create_int_cast(elem.handle, builder.get_int64_ty(), elem.dtype.is_int_signed()) - elif elem.dtype != tl.int32: - return builder.create_int_cast(elem.handle, builder.get_int32_ty(), elem.dtype.is_int_signed()) - return elem.handle - assert False, f"Unsupported element type in shape/strides/offsets: {type(elem)}" - - -def _convert_to_ir_values(builder, list_like, require_i64=True): - if hasattr(list_like, "__iter__"): - return [_convert_elem_to_ir_value(builder, elem, require_i64) for elem in list_like] - return [_convert_elem_to_ir_value(builder, list_like, require_i64)] - - -def make_block_ptr(base: tl.tensor, shape, strides, offsets, block_shape, order, builder: ir.builder) -> tl.tensor: - # Convert dynamic arguments to IR values - # NOTES(Chenggang): current `shape/strides` are `int64_t`, while `offsets/block_shape` are `int32_t` - shape = _convert_to_ir_values(builder, shape) - strides = _convert_to_ir_values(builder, strides) - offsets = _convert_to_ir_values(builder, offsets, require_i64=False) - - # Check `base` type - if not base.type.is_ptr() or base.type.element_ty.is_block(): - raise ValueError("Expected `base` to be a pointer type (but not a block pointer type or others)") - - # Treat `pointer_type` as `pointer_type` - if base.type.element_ty == tl.int1: - base = cast(base, tl.pointer_type(tl.int8, base.type.address_space), builder) - - # Check whether `block_shape` is static - if not hasattr(block_shape, "__iter__"): - block_shape = [block_shape] - block_shape = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in block_shape] - assert all([isinstance(elem, int) and -2**31 <= elem < 2**31 for elem in block_shape]), \ - "Expected a list of constant integers (`int32_t` range) in `block_shape`" - - # Check `order` - if not hasattr(order, "__iter__"): - order = [order] - order = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in order] - assert sorted(order) == list(range(len(order))), "Expected a permutation of (0, 1, ..., len(order)-1) in order" - - # Must have same length - assert all([len(block_shape) == len(list_like) for list_like in [shape, strides, offsets, order]]), \ - "Expected shape/strides/offsets/block_shape to have the same length" - - # Build value, the type is: - # `pointer_type>` in Python - # `tt.ptr>` in MLIR - handle = builder.create_make_block_ptr(base.handle, shape, strides, offsets, block_shape, order) - return tl.tensor(handle, tl.pointer_type(tl.block_type(base.type.element_ty, block_shape))) - - -def advance(base: tl.tensor, offsets, builder: ir.builder) -> tl.tensor: - # Convert dynamic offsets to IR values - offsets = 
_convert_to_ir_values(builder, offsets, require_i64=False) - - # Advanced block pointer type is the same as before - return tl.tensor(builder.create_advance(base.handle, offsets), base.type) diff --git a/python/triton/language/standard.py b/python/triton/language/standard.py deleted file mode 100644 index b997674c91b6..000000000000 --- a/python/triton/language/standard.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -from ..runtime.jit import jit -from . import core - -# ----------------------- -# Standard library -# ----------------------- - - -@jit -def cdiv(x, div): - """ - Computes the ceiling division of :code:`x` by :code:`div` - - :param x: the input number - :type input: Block - :param div: the divisor - :param div: Block - """ - return (x + div - 1) // div - - -@jit -@core._add_math_1arg_docstr("sigmoid") -def sigmoid(x): - return 1 / (1 + core.exp(-x)) - - -@jit -@core._add_math_1arg_docstr("softmax") -def softmax(x, ieee_rounding=False): - z = x - core.max(x, 0) - num = core.exp(z) - den = core.sum(num, 0) - return core.fdiv(num, den, ieee_rounding) - - -@jit -def ravel(x): - """ - Returns a contiguous flattened view of :code:`x`. - - :param x: the input tensor - :type x: Block - """ - return core.view(x, [x.numel]) - - -@jit -def swizzle2d(i, j, size_i, size_j, size_g): - """ - Transforms indices of a row-major size_i*size_j matrix into those - of one where indices are row major for each group of size_j rows. - For example, for size_i = size_j = 4 and size_g = 2, it will transform - [[0 , 1 , 2 , 3 ], - [4 , 5 , 6 , 7 ], - [8 , 9 , 10, 11], - [12, 13, 14, 15]] - into - [[0, 2, 4 , 6 ], - [1, 3, 5 , 7 ], - [8, 10, 12, 14], - [9, 11, 13, 15]] - """ - # "unrolled index in array" - ij = i * size_j + j - # number of elements in `size_g` groups - # of `size_j` columns - size_gj = size_g * size_j - # index of the group in which (i,j) is - group_id = ij // size_gj - # row-index of the first element of this group - off_i = group_id * size_g - # last group may have fewer rows - size_g = core.minimum(size_i - off_i, size_g) - # new row and column indices - new_i = off_i + (ij % size_g) - new_j = (ij % size_gj) // size_g - return new_i, new_j - - -@jit -def zeros(shape, dtype): - """ - Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`. - - :param shape: Shape of the new array, e.g., (8, 16) or (8, ) - :type shape: tuple of ints - :param dtype: Data-type of the new array, e.g., :code:`tl.float16` - :type dtype: DType - """ - return core.full(shape, 0, dtype) - - -@jit -def zeros_like(input): - return zeros(input.shape, input.dtype) diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py deleted file mode 100644 index 6ceec8b56a00..000000000000 --- a/python/triton/ops/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# from .conv import _conv, conv -from . 
import blocksparse -from .cross_entropy import _cross_entropy, cross_entropy -from .flash_attention import attention -from .matmul import _matmul, matmul - -__all__ = [ - "blocksparse", - "_cross_entropy", - "cross_entropy", - "_matmul", - "matmul", - "attention", -] diff --git a/python/triton/ops/blocksparse/__init__.py b/python/triton/ops/blocksparse/__init__.py deleted file mode 100644 index 6b24b5377fab..000000000000 --- a/python/triton/ops/blocksparse/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .matmul import matmul -from .softmax import softmax - -__all__ = [ - "matmul", - "softmax", -] diff --git a/python/triton/ops/blocksparse/matmul.py b/python/triton/ops/blocksparse/matmul.py deleted file mode 100644 index c599af26a055..000000000000 --- a/python/triton/ops/blocksparse/matmul.py +++ /dev/null @@ -1,437 +0,0 @@ -import torch - -import triton -import triton.language as tl - -# ******************************************************** -# -------------------------------------------------------- -# Sparse = Dense x Dense (SDD) -# This operation uses super-blocking to make sure that -# it's done efficiently when small blocks can be grouped -# together -# -------------------------------------------------------- -# ******************************************************** - - -@triton.heuristics({ - 'EVEN_K': lambda nargs: nargs['K'] % nargs['TILE_K'] == 0, -}) -@triton.jit -def _sdd_kernel( - A, B, C, - stride_za, stride_ha, stride_ma, stride_ak, - stride_zb, stride_hb, stride_bk, stride_nb, - stride_zc, stride_hc, stride_mc, stride_nc, - K, grid_offset, lut, - TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, - BLOCK: tl.constexpr, EVEN_K: tl.constexpr -): - # ------------ # - # - Prologue - # - # ------------ # - block_id = tl.program_id(0) + grid_offset - lut += block_id * 3 - # offsets - off_z = tl.program_id(2) # batch - off_h = tl.load(lut + 0) # head - - # initialize pointers to A - start_am = tl.load(lut + 1) - offs_am = start_am * BLOCK + (tl.arange(0, TILE_M) % BLOCK) - offs_ak = tl.arange(0, TILE_K) - a_ptrs = A \ - + off_z * stride_za \ - + off_h * stride_ha \ - + offs_am[:, None] * stride_ma \ - + offs_ak[None, :] * stride_ak - # initialize pointers to B - start_bn = tl.load(lut + 2) - offs_bn = start_bn * BLOCK + (tl.arange(0, TILE_N) % BLOCK) - offs_bk = tl.arange(0, TILE_K) - b_ptrs = B \ - + off_z * stride_zb \ - + off_h * stride_hb \ - + offs_bn[None, :] * stride_nb \ - + offs_bk[:, None] * stride_bk - # ---------------- # - # Inner Loop # - # ---------------- # - acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) - for k in range(K, 0, -TILE_K): - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs) - else: - a = tl.load(a_ptrs, mask=offs_ak[None, :] < k, other=0.) - b = tl.load(b_ptrs, mask=offs_bk[:, None] < k, other=0.) 
- acc += tl.dot(a, b, out_dtype=tl.float32) - a_ptrs += TILE_K * stride_ak - b_ptrs += TILE_K * stride_bk - c = acc.to(C.dtype.element_ty) - # ---------------- # - # Epilogue # - # ---------------- # - offs_cm = tl.arange(0, TILE_M) % BLOCK - offs_cn = tl.arange(0, TILE_N) % BLOCK - pc = C \ - + off_z * stride_zc \ - + block_id * stride_hc \ - + offs_cm[:, None] * stride_mc \ - + offs_cn[None, :] * stride_nc - tl.store(pc, c, mask=True) - - -def sdd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, widths, out=None): - if a.stride(2) != 1 and a.stride(3) != 1: - a = a.contiguous() - if b.stride(2) != 1 and b.stride(3) != 1: - b = b.contiguous() - # (A * B)^T = B^T * A^T - if trans_c: - a, b = b, a - trans_a, trans_b = not trans_b, not trans_a - # shape constraints - a_dim = -2 if trans_a else -1 - b_dim = -1 if trans_b else -2 - Ka, Kb = a.shape[a_dim], b.shape[b_dim] - if Ka != Kb: - raise ValueError(f"Inner dimension mismatch (A: {Ka} vs B: {Kb})") - # allocate output - if out is None: - c = torch.empty((a.shape[0], lut.shape[0], block, block), dtype=a.dtype, device=a.device) - else: - assert out.shape == (a.shape[0], lut.shape[0], block, block) - c = out - grid = [c.shape[1], 1, c.shape[0]] - _sdd_kernel[grid]( - a, b, c, - a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), - b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), - c.stride(0), c.stride(1), c.stride(2), c.stride(3), - Ka, 0, lut, - TILE_M=block, TILE_N=block, TILE_K=32, BLOCK=block, num_stages=4, - num_warps=4, - ) - return c - - -def sdd_lut(layout, block, device): - lut = layout.nonzero(as_tuple=False).to(device).int() - lut = lut.contiguous() - return lut, None - -# ----------------------------- -# Dense = Sparse x Dense (DSD) -# This operation uses a look-up table that contains pre-computed pointer increments -# in order to minimize computations in the inner loop of the matmul kernel. 
-# ----------------------------- - - -@triton.jit -def _dsd_kernel( - A, B, C, - stride_az, stride_ha, stride_am, stride_ak, - stride_zb, stride_hb, stride_bk, stride_bn, - stride_zc, stride_hc, stride_cm, stride_cn, - DS0, DS1, lut, - TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, BLOCK: tl.constexpr -): - # ------------ # - # - Prologue - # - # ------------ # - pid_m = tl.program_id(0) - pid_n = tl.program_id(1) - num_pid_m = tl.num_programs(0) - num_pid_n = tl.num_programs(1) - pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_SIZE_M) - pidz = tl.program_id(2) - header = lut + pid_n * 4 - offset = tl.load(header + 0) - K = tl.load(header + 1) - column = tl.load(header + 2) - off_h = tl.load(header + 3) - pinc = lut + offset - # initialize pointers to A (sparse) - block_id = tl.load(pinc + 1) - block_id = tl.multiple_of(block_id, 8) # compiler hint - offs_am = tl.arange(0, TILE_M) - offs_ak = tl.arange(0, TILE_K) - pa = A + pidz * stride_az \ - + block_id * stride_ha \ - + offs_am[:, None] * stride_am \ - + offs_ak[None, :] * stride_ak - # initialize pointers to B (dense) - offs_bn = pid_m * TILE_N + tl.arange(0, TILE_N) - offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn % DS0, TILE_N), TILE_N) - start_bk = tl.load(pinc) - start_bk = tl.multiple_of(start_bk, 8) # compiler hint - offs_bk = start_bk + tl.arange(0, TILE_K) - pb = B + pidz * stride_zb \ - + off_h * stride_hb \ - + offs_bn[None, :] * stride_bn \ - + offs_bk[:, None] * stride_bk - # ---------------- # - # Inner Loop # - # ---------------- # - acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) - pinc += 2 - inc_a = tl.load(pinc + 1) - inc_a = tl.multiple_of(inc_a, 8) - inc_b = tl.load(pinc) - inc_b = tl.multiple_of(inc_b, 8) - for k in range(K, 0, -TILE_K): - a = tl.load(pa) - b = tl.load(pb) - acc += tl.dot(a, b, out_dtype=tl.float32) - pa += inc_a - pb += inc_b * stride_bk - pinc += 2 - inc_a = tl.load(pinc + 1) - inc_a = tl.multiple_of(inc_a, 8) - inc_b = tl.load(pinc) - inc_b = tl.multiple_of(inc_b, 8) - c = acc.to(C.dtype.element_ty) - # initialize pointers to C - offs_cm = column * TILE_M + tl.arange(0, TILE_M) - offs_cn = pid_m * TILE_N + tl.arange(0, TILE_N) - pc = C \ - + off_h * stride_hc \ - + pidz * stride_zc \ - + offs_cm[:, None] * stride_cm \ - + offs_cn[None, :] * stride_cn - tl.store(pc, c, mask=offs_cn[None, :] < DS0) - - -def dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): - if a.stride(2) != 1 and a.stride(3) != 1: - a = a.contiguous() - if b.stride(2) != 1 and b.stride(3) != 1: - b = b.contiguous() - # shapes / dtypes - AS1 = block * spdims[2 if trans_a else 1] - BS0 = b.size(0) - BS1 = b.size(1) - BS3 = b.size(2 if trans_b else 3) - dtype = a.dtype - # allocate output - CS0 = BS0 - CS1 = BS1 - CS2 = BS3 if trans_c else AS1 - CS3 = AS1 if trans_c else BS3 - if out is None: - c = torch.empty((CS0, CS1, CS2, CS3), dtype=dtype, device=a.device) - else: - assert out.shape == (CS0, CS1, CS2, CS3) - c = out - # meta-parameter heuristics - TILE_N = 128 - # compute output - grid = lambda meta: [triton.cdiv(BS3, meta['TILE_N']), width, BS0] - _dsd_kernel[grid]( - a, b, c, - a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), - b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), - c.stride(0), c.stride(1), c.stride(3 if trans_c else 2), c.stride(2 if trans_c else 3), - BS3, AS1, lut, - TILE_M=block, TILE_N=TILE_N, TILE_K=min(block, 32), 
BLOCK=block, num_stages=4, - num_warps=4, GROUP_SIZE_M=4, - ) - # exit() - return c - - -def dsd_lut(layout, block, step, trans, device): - """ - Generates the look-up table for incrementing pointers in the DSD/DDS matmul. - Example (BLOCK=32, STEP=16) - [[1, 0, 0, 1, 0], - [0, 1, 1, 0, 1], - [1, 0, 1, 0, 0]] - - Then the offsets for A are - [0 , 16, 32, 48] <- row 0 - \\----/ \\----/ - col=0 col=3 - [64, 80, 96, 112, 128, 144] <- row 1 - \\----/ \\----/ \\------/ - col=1 col=2 col=3 - [160, 176, 192, 208] - which leads to increments table - [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16] - - Because B is dense, the offsets are - [0, 16, 96, 112] <- row 0 - [32, 48, 64, 80] <- row 1 - [0, 16, 64, 80] <- row 2 - """ - sizes = torch.sum(layout, 2 if trans else 1) - head_id, col_id = torch.ones_like(sizes).nonzero(as_tuple=True) - sizes = sizes.flatten() - segments = sizes * step - # pointer increments - if trans: - nnz = layout.nonzero(as_tuple=False) - else: - nnz = layout.transpose(1, 2).nonzero(as_tuple=False) - num_blocks = nnz.size(0) - offsets = torch.zeros_like(sizes) - offsets[1:] = torch.cumsum(sizes[:-1], dim=0) - offsets = torch.min(offsets, (num_blocks - 1) * torch.ones_like(offsets)) - # ------------------------------- - # dense input pointer increments - # ------------------------------- - # Note that the inner loop matmul kernel may have a fixed step size (e.g., TILE_K) - # that is smaller than the block size, so we need to do a bit of extra work - # to handle this case - B_idx = nnz[:, 2] * block - B_incs = B_idx.clone() - B_incs[1:] -= B_idx[:-1] - div = block // step - B_incs = B_incs.view(-1, 1).repeat(1, div) - B_incs[:, 1:] = step - B_incs[:, 0] -= (div - 1) * step - # first increment for each reduction is actually the offset - B_incs[offsets[segments > 0], 0] = B_idx[offsets[segments > 0]] - B_incs = B_incs.view(-1) - # ------------------------------- - # sparse input pointer increments - # ------------------------------- - # same as above, except that the increments are in the sparse memory layout - if trans: - A_idx = torch.arange(num_blocks, device=layout.device) - else: - A_idx = torch.tensor([], dtype=torch.int64, device=layout.device) - current_offset = 0 - for z in range(layout.size(0)): - layoutw = layout[z, :, :].clone().long() - msum = layoutw.sum() - layoutw[layoutw > 0] = 1 + torch.arange(msum, device=layout.device) - A_idx = torch.cat((A_idx, current_offset + layoutw.T[layoutw.T > 0] - 1)) - current_offset += msum - A_incs = A_idx * block * block - A_incs[1:] -= A_idx[:-1] * block * block - A_incs = A_incs.view(-1, 1).repeat(1, div) - if trans: - A_incs[:, 1:] = step - A_incs[:, 0] -= (div - 1) * step - else: - A_incs[:, 1:] = step * block - A_incs[:, 0] -= (div - 1) * step * block - A_incs[offsets[segments > 0], 0] = A_idx[offsets[segments > 0]] - A_incs = A_incs.view(-1) - # create header - width = col_id.size(0) - offsets = offsets * 2 * div + 4 * width - segments = segments * div - header = torch.stack((offsets, segments, col_id, head_id), dim=1).view(-1).contiguous() - # create increments - incs = torch.stack((B_incs, A_incs), dim=1).view(-1).contiguous() - # pad by a factor 2*MAX_NUM_STAGES - # to accommodate pre-fetching inside the kernel - pad = torch.zeros(20, device=incs.device, dtype=incs.dtype) - incs = torch.cat((incs, pad)) - # create lut - lut = torch.cat((header, incs)) - lut = lut.type(torch.int32).to(device) - # create locks - return lut, width - -# ----------------------------- -# Dense = Dense x Sparse (DDS) -# 
----------------------------- -# AB = (B^T A^T)^T - - -def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): - return dsd_matmul(b, a, not trans_b, not trans_a, not trans_c, spdims, block, lut, width, out=out) - -############## -# MAIN API # -############## - - -class _matmul(torch.autograd.Function): - - fn = {'sdd': sdd_matmul, 'dsd': dsd_matmul, 'dds': dds_matmul} - - @staticmethod - def forward( - ctx, a, b, trans_a, trans_b, trans_c, mode, spdims, block, - c_lut, c_width, da_lut, da_width, db_lut, db_width, out - ): - c = _matmul.fn[mode](a, b, trans_a, trans_b, trans_c, spdims, block, c_lut, c_width, out=out) - # save for backward - ctx.save_for_backward(a, b) - ctx.da_lut = da_lut - ctx.da_width = da_width - ctx.db_lut = db_lut - ctx.db_width = db_width - ctx.mode = mode - ctx.spdims = spdims - ctx.block = block - ctx.trans_a = trans_a - ctx.trans_b = trans_b - ctx.trans_c = trans_c - ctx.has_out = out is not None - return c - - @staticmethod - def backward(ctx, dc): - # saved for backward - a, b = ctx.saved_tensors - da, db = None, None - mode = ctx.mode - # gradients w.r.t. a - if ctx.needs_input_grad[0]: - mode_da = mode[1] + mode[0] + mode[2] - da = _matmul.fn[mode_da]( - dc, b, ctx.trans_c, not ctx.trans_b, ctx.trans_a, ctx.spdims, ctx.block, ctx.da_lut, ctx.da_width, - ) - # gradients w.r.t. b - if ctx.needs_input_grad[1]: - mode_db = mode[2] + mode[1] + mode[0] - db = _matmul.fn[mode_db]( - a, dc, not ctx.trans_a, ctx.trans_c, ctx.trans_b, ctx.spdims, ctx.block, ctx.db_lut, ctx.db_width, - ) - dout = dc if ctx.has_out else None - return da, db, None, None, None,\ - None, None, None, None,\ - None, None, None, None, None, dout - - -class matmul: - - def __init__(self, layout, block, mode, device, trans_a=False, trans_b=False, trans_c=False): - if mode not in ['sdd', 'dsd', 'dds']: - raise NotImplementedError('Supported modes are: sdd, dsd, dds') - self.block = block - self.mode = mode - self.trans_a = trans_a - self.trans_b = trans_b - self.trans_c = trans_c - self.layout = layout - self.spdims = layout.shape - step = min(block, 32) - if self.mode == 'sdd': - self.c_lut, self.c_width = sdd_lut(layout, block, device) - self.da_lut, self.da_width = dsd_lut(layout, block, step, True, device) - self.db_lut, self.db_width = dsd_lut(layout, block, step, False, device) - if self.mode == 'dsd': - self.c_lut, self.c_width = dsd_lut(layout, block, step, not self.trans_a, device) - self.da_lut, self.da_width = sdd_lut(layout, block, device) - self.db_lut, self.db_width = dsd_lut(layout, block, step, self.trans_a, device) - if self.mode == 'dds': - self.c_lut, self.c_width = dsd_lut(layout, block, step, self.trans_b, device) - self.da_lut, self.da_width = dsd_lut(layout, block, step, not self.trans_b, device) - self.db_lut, self.db_width = sdd_lut(layout, block, device) - - def __call__(self, a, b, out=None): - c = _matmul.apply( - a, b, self.trans_a, self.trans_b, self.trans_c, self.mode, self.spdims, self.block, - self.c_lut, self.c_width, - self.da_lut, self.da_width, - self.db_lut, self.db_width, - out - ) - return c diff --git a/python/triton/ops/blocksparse/softmax.py b/python/triton/ops/blocksparse/softmax.py deleted file mode 100644 index ac2b7c3eb985..000000000000 --- a/python/triton/ops/blocksparse/softmax.py +++ /dev/null @@ -1,239 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def num_warps(n): - if n <= 128: - return 1 - if n <= 256: - return 2 - if n <= 512: - return 4 - if n <= 4096: - return 8 - return 16 - - 
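# A minimal usage sketch of the block-sparse `softmax` wrapper whose implementation is
# removed below in this file (make_lut plus the _blocksparse_softmax_fwd/_bwd kernels).
# It is included only for reference while reading the removal; the layout shape, block
# size, batch size, and dtype are illustrative assumptions, not values taken from this diff.
import torch
from triton.ops import blocksparse

# (heads, block-rows, block-cols) sparsity mask; any 0/1 integer layout works
layout = torch.randint(0, 2, (4, 8, 8), dtype=torch.int64)
sparse_softmax = blocksparse.softmax(layout, block=16, device="cuda")
# block-sparse logits laid out as (batch, nnz_blocks, block, block),
# i.e. the same block layout the SDD matmul above produces
a = torch.randn(2, int(layout.sum()), 16, 16, device="cuda", dtype=torch.float16)
y = sparse_softmax(a, scale=1.0, is_causal=True)  # same shape and layout as `a`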
-@triton.jit -def _blocksparse_softmax_fwd( - Out, A, stride_xz, LUT, - R, extent, stride_zr, stride_hr, # relative attention - scale, is_causal, - ROW_SIZE: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - IS_DENSE: tl.constexpr, -): - h = tl.program_id(0) - m = tl.program_id(1) - z = tl.program_id(2) - # create index ranges - hm = h * tl.num_programs(1) + m - lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE - block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE - # extract information from LUT - header = LUT + (hm // BLOCK_SIZE) * 2 - size = tl.load(header + 0) - offset = tl.load(header + 1) - # pointer offset - off_a = z * stride_xz - off_a += (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE # block indx - off_a += (m % BLOCK_SIZE) * BLOCK_SIZE # row indx - # do not need to read column indices in the dense case - if IS_DENSE: - ns = tl.arange(0, ROW_SIZE) - else: - off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE - start_n = tl.load(LUT + off_lut + block_n, mask=block_n < size, other=0) - ns = start_n * BLOCK_SIZE + lane_n - # load X - mask = block_n < size - a = tl.load(A + off_a + lane_n, mask=mask, other=-float("inf")) - a = a.to(tl.float32) - # compute - out = a - out *= scale - # apply relative attention - if R is not None: - R += z * stride_zr - R += h * stride_hr - off_lo = (extent - m - 1) + ns - mask_lo = (off_lo >= 0) & (off_lo < extent) - rel_logits = tl.load(R + m * extent + off_lo, mask=mask_lo, other=0.0) - out += rel_logits - out = out.to(tl.float32) - # apply causal mask - out = tl.where((ns > m) & is_causal, -float("inf"), out) - # computation - out = tl.softmax(out) - # write-back - tl.store(Out + off_a + lane_n, out, mask=mask) - - -@triton.jit -def _blocksparse_softmax_bwd( - DA, stride_zdx, - DOut, stride_zdout, - Out, stride_zout, - scale, - LUT, - DR, extent, stride_zr, stride_hr, stride_er, - is_causal, - ROW_SIZE: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - IS_DENSE: tl.constexpr, -): - h = tl.program_id(0) - m = tl.program_id(1) - z = tl.program_id(2) - # create index ranges - hm = h * tl.num_programs(1) + m - lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE - block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE - # extract information from LUT - header = LUT + (hm // BLOCK_SIZE) * 2 - size = tl.load(header + 0) - offset = tl.load(header + 1) - # row-col offset - off_mn = (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE - off_mn += (m % BLOCK_SIZE) * BLOCK_SIZE - mask = block_n < size - # pointers - As = Out + z * stride_zout + off_mn - DOuts = DOut + z * stride_zdout + off_mn - # do not need to read column indices in the dense case - if IS_DENSE: - ns = tl.arange(0, ROW_SIZE) - else: - off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE - start_n = tl.load(LUT + off_lut + block_n, mask=mask, other=0) - ns = start_n * BLOCK_SIZE + lane_n - # load data - a = tl.load(As + lane_n, mask=mask, other=0.0) - a = a.to(tl.float32) - dout = tl.load(DOuts + lane_n, mask=mask, other=0.0) - dout = dout.to(tl.float32) - # compute - a = tl.where((ns > m) & is_causal & (a == a), 0., a) - da = a * (dout - tl.sum(a * dout, 0)) - # apply relative attention - if DR is not None: - DR += z * stride_zr - DR += h * stride_hr - off_lo = (extent - m - 1) + ns - mask_lo = (off_lo >= 0) & (off_lo < extent) & mask - tl.store(DR + m * extent + off_lo, da, mask=mask_lo) - da = da * scale - # convert da - # write-back - DAs = DA + z * stride_zdx + off_mn - tl.store(DAs + lane_n, da, mask=mask) - - -class _softmax(torch.autograd.Function): - @staticmethod - def 
make_lut(layout, block, device): - _empty = torch.tensor([], dtype=torch.int64, device=layout.device) - sizes = _empty.clone() - # sizes along rows - for h in range(layout.shape[0]): - sizes = torch.cat((sizes, layout[h, :, :].sum(-1))) - total_sizes = sizes * block - # offsets in block format - offsets = torch.zeros_like(sizes) - offsets[1:] = torch.cumsum(sizes[:-1], dim=0) - # block indices - columns = layout.nonzero(as_tuple=False)[:, 2] - header = torch.stack((sizes, offsets), dim=1).view(-1) - lut = torch.cat((header, columns)).type(torch.int32).to(device) - return lut, int(total_sizes.max()) - - @staticmethod - def forward( - ctx, a, scale, rel_logits, is_causal, - spdims, block, lut, maxlut, is_dense - ): - if scale is not None and isinstance(scale, torch.Tensor): - assert scale.device.type == "cpu" - scale = scale.item() - M = a.shape[0] - grid = [spdims[0], spdims[1] * block, M] - rel_shape = (1, 1, 1, 1) if rel_logits is None else rel_logits.shape - rel_strides = (1, 1, 1, 1) if rel_logits is None else rel_logits.stride() - # enqueue kernel - out = torch.empty_like(a) - _blocksparse_softmax_fwd[grid]( - out, a, a.stride(0), lut, - rel_logits, rel_shape[-1], rel_strides[0], rel_strides[1], # relative attn - scale, - is_causal, - BLOCK_SIZE=block, - ROW_SIZE=triton.next_power_of_2(maxlut), - IS_DENSE=is_dense, - num_warps=num_warps(maxlut) - ) - # save to context - # ctx.mark_dirty(x) - ctx.save_for_backward(out, lut) - ctx.spdims = spdims - ctx.block = block - ctx.maxlut = maxlut - ctx.scale = scale - ctx.rel_shape = rel_shape - ctx.rel_strides = rel_strides - ctx.rel_dtype = a.dtype - ctx.is_dense = is_dense - ctx.is_causal = is_causal - return out - - @staticmethod - def backward(ctx, dout): - # retrieve from context - out, lut = ctx.saved_tensors - # relative logits gradients - dr = None - if ctx.needs_input_grad[3]: - dr = torch.zeros(ctx.rel_shape, dtype=ctx.rel_dtype, device=out.device) - # run kernel - M = out.shape[0] - grid = (ctx.spdims[0], ctx.spdims[1] * ctx.block, M) - da = torch.empty_like(dout) - _blocksparse_softmax_bwd[grid]( - da, da.stride(0), - dout, dout.stride(0), - out, out.stride(0), - ctx.scale, - lut, - dr, ctx.rel_shape[-1], ctx.rel_strides[0], ctx.rel_strides[1], ctx.rel_strides[2], - ctx.is_causal, - BLOCK_SIZE=ctx.block, - ROW_SIZE=triton.next_power_of_2(ctx.maxlut), - IS_DENSE=ctx.is_dense, - num_warps=num_warps(ctx.maxlut) - ) - return (da, None, None, dr, None, - None, None, None, None, None, - None, - None, None, None, - None, - None, None, None - ) - - -class softmax: - def __init__(self, layout, block, device, is_dense=False): - self.spdims = layout.shape - self.layout = layout - self.block = block - self.lut, self.maxlut = _softmax.make_lut(self.layout, self.block, device) - self.is_dense = is_dense - - def __call__(self, a, *, scale=1.0, rel_logits=None, is_causal=False): - if rel_logits is not None and rel_logits.dtype != a.dtype: - raise ValueError(f"relative position embedding must be {a.dtype}") - a = _softmax.apply( - a, scale, rel_logits, is_causal, - self.spdims, self.block, self.lut, self.maxlut, self.is_dense, - ) - return a diff --git a/python/triton/ops/cross_entropy.py b/python/triton/ops/cross_entropy.py deleted file mode 100644 index f66cddf37d21..000000000000 --- a/python/triton/ops/cross_entropy.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch - -import triton -import triton.language as tl - - -def num_warps(N): - if N < 2048: - return 4 - elif N < 8192: - return 8 - return 16 - - -@triton.heuristics({'num_warps': lambda nargs: 
num_warps(nargs['N'])}) -@triton.heuristics({'BLOCK': lambda nargs: triton.next_power_of_2(nargs['N'])}) -@triton.jit -def _forward(LOGITS, PROBS, IDX, LOSS, N, BLOCK: tl.constexpr): - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - idx = tl.load(IDX + row) - # pointers to logit and probs - LOGITS = LOGITS + row * N + cols - WRIT_PROBS = PROBS + row * N + cols - READ_PROBS = PROBS + row * N + idx - # write-back negative log-probs - logits = tl.load(LOGITS, mask=cols < N, other=-float('inf')) - logits = logits.to(tl.float32) - logits = logits - tl.max(logits, 0) - probs = tl.log(tl.sum(tl.exp(logits), 0)) - logits - tl.store(WRIT_PROBS, probs, mask=cols < N) - # There is a bug in the compiler, which fails to insert a barrier here. - # We add it explicitly for now. Will be fixed soon. - tl.debug_barrier() - # write-back loss - probs = tl.load(READ_PROBS) - tl.store(LOSS + row, probs) - - -@triton.heuristics({'num_warps': lambda nargs: num_warps(nargs['N'])}) -@triton.heuristics({'BLOCK': lambda nargs: triton.next_power_of_2(nargs['N'])}) -@triton.jit -def _backward(PROBS, IDX, DPROBS, N, BLOCK: tl.constexpr): - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - idx = tl.load(IDX + row) - # pointers to probs - PROBS = PROBS + row * N + cols - # We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - # and we have -log(p[k]) stored in PROBS, so this is easy - probs = -tl.load(PROBS, mask=cols < N, other=float('inf')) - probs = tl.exp(probs.to(tl.float32)) - delta = cols == idx - # write result in-place in PROBS - dout = tl.load(DPROBS + row) - din = (probs - delta) * dout - tl.store(PROBS, din.to(PROBS.dtype.element_ty), mask=cols < N) - - -class _cross_entropy(torch.autograd.Function): - @classmethod - def forward(cls, ctx, logits, indices): - # make sure we can use triton - assert (indices.dtype == torch.int64), "Indices are expected to be of type long." - # make kernel - device, dtype = logits.device, logits.dtype - n_cols = logits.shape[-1] - # run the kernel - result = torch.empty_like(indices, dtype=dtype, device=device) - neg_logprobs = torch.empty_like(logits, dtype=dtype, device=device) - grid = lambda opt: (logits.numel() // n_cols, ) - _forward[grid](logits, neg_logprobs, indices, result, n_cols) - # save for backward - ctx.save_for_backward(neg_logprobs, indices) - return result - - @classmethod - def backward(cls, ctx, dneg_logprobs): - """We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - so we initialize the gradient as neg_logprobs, so we can just exponentiate - to get p[k], which is most of what we need... 
neg_logprobs will be - modified in place to become the gradient we want - """ - # load saved tensors - neg_logprobs, indices = ctx.saved_tensors - # run the kernel - # neg_logprobs will be modified in place to become our gradient: - n_cols = neg_logprobs.shape[-1] - grid = lambda opt: (neg_logprobs.numel() // n_cols, ) - _backward[grid](neg_logprobs, indices, dneg_logprobs, n_cols) - return neg_logprobs, None - - -cross_entropy = _cross_entropy.apply diff --git a/python/triton/ops/flash_attention.py b/python/triton/ops/flash_attention.py deleted file mode 100644 index 33c0da791fb7..000000000000 --- a/python/triton/ops/flash_attention.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -Fused Attention -=============== -This is a Triton implementation of the Flash Attention algorithm -(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) -""" - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _fwd_kernel( - Q, K, V, sm_scale, - L, M, - Out, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - stride_oz, stride_oh, stride_om, stride_on, - Z, H, N_CTX, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hz = tl.program_id(1) - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk - off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk - off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk - # Initialize pointers to Q, K, V - q_ptrs = Q + off_q - k_ptrs = K + off_k - v_ptrs = V + off_v - # initialize pointer to m and l - m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # load q: it will stay in SRAM throughout - q = tl.load(q_ptrs) - # loop over k, v and update accumulator - for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): - # -- compute qk ---- - k = tl.load(k_ptrs) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) - # compute new m - m_curr = tl.maximum(tl.max(qk, 1), m_prev) - # correct old l - l_prev *= tl.exp(m_prev - m_curr) - # attention weights - p = tl.exp(qk - m_curr[:, None]) - l_curr = tl.sum(p, 1) + l_prev - # rescale operands of matmuls - l_rcp = 1. 
/ l_curr - p *= l_rcp[:, None] - acc *= (l_prev * l_rcp)[:, None] - # update acc - p = p.to(Q.dtype.element_ty) - v = tl.load(v_ptrs) - acc += tl.dot(p, v) - # update m_i and l_i - l_prev = l_curr - m_prev = m_curr - # update pointers - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - # rematerialize offsets to save registers - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # write back l and m - l_ptrs = L + off_hz * N_CTX + offs_m - m_ptrs = M + off_hz * N_CTX + offs_m - tl.store(l_ptrs, l_prev) - tl.store(m_ptrs, m_prev) - # initialize pointers to output - offs_n = tl.arange(0, BLOCK_DMODEL) - off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on - out_ptrs = Out + off_o - tl.store(out_ptrs, acc) - - -@triton.jit -def _bwd_preprocess( - Out, DO, L, - NewDO, Delta, - BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, -): - off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) - off_n = tl.arange(0, D_HEAD) - # load - o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - denom = tl.load(L + off_m).to(tl.float32) - # compute - do = do / denom[:, None] - delta = tl.sum(o * do, axis=1) - # write-back - tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) - tl.store(Delta + off_m, delta) - - -@triton.jit -def _bwd_kernel( - Q, K, V, sm_scale, Out, DO, - DQ, DK, DV, - L, M, - D, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - Z, H, N_CTX, - num_block, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - off_hz = tl.program_id(0) - off_z = off_hz // H - off_h = off_hz % H - # offset pointers for batch/head - Q += off_z * stride_qz + off_h * stride_qh - K += off_z * stride_qz + off_h * stride_qh - V += off_z * stride_qz + off_h * stride_qh - DO += off_z * stride_qz + off_h * stride_qh - DQ += off_z * stride_qz + off_h * stride_qh - DK += off_z * stride_qz + off_h * stride_qh - DV += off_z * stride_qz + off_h * stride_qh - for start_n in range(0, num_block): - lo = start_n * BLOCK_M - # initialize row/col offsets - offs_qm = lo + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = tl.arange(0, BLOCK_N) - offs_k = tl.arange(0, BLOCK_DMODEL) - # initialize pointers to value-like data - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - # pointer to row-wise quantities in value-like data - D_ptrs = D + off_hz * N_CTX - m_ptrs = M + off_hz * N_CTX - # initialize dv amd dk - dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # k and v stay in SRAM throughout - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - # loop over rows - for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): - offs_m_curr = start_m + offs_m - # load q, k, v, do on-chip - q = tl.load(q_ptrs) - # recompute p = softmax(qk, dim=-1).T - # NOTE: `do` is pre-divided by `l`; no normalization here - qk = tl.dot(q, tl.trans(k)) - qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) - m = 
tl.load(m_ptrs + offs_m_curr) - p = tl.exp(qk * sm_scale - m[:, None]) - # compute dv - do = tl.load(do_ptrs) - dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do) - # compute dp = dot(v, do) - Di = tl.load(D_ptrs + offs_m_curr) - dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] - dp += tl.dot(do, tl.trans(v)) - # compute ds = p * (dp - delta[:, None]) - ds = p * dp * sm_scale - # compute dk = dot(ds.T, q) - dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q) - # compute dq - dq = tl.load(dq_ptrs) - dq += tl.dot(ds.to(Q.dtype.element_ty), k) - tl.store(dq_ptrs, dq) - # increment pointers - dq_ptrs += BLOCK_M * stride_qm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_qm - # write-back - dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - - -class _attention(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, k, v, sm_scale): - # only support for Ampere now - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - raise RuntimeError("Flash attention currently only supported for compute capability >= 80") - BLOCK = 128 - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - # assert Lk in {16, 32, 64, 128} - assert Lk in {64} # TODO: fix other cases - o = torch.empty_like(q) - grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) - L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - num_warps = 4 if Lk <= 64 else 8 - - _fwd_kernel[grid]( - q, k, v, sm_scale, - L, m, - o, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - o.stride(0), o.stride(1), o.stride(2), o.stride(3), - q.shape[0], q.shape[1], q.shape[2], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=Lk, num_warps=num_warps, - num_stages=2, - ) - - ctx.save_for_backward(q, k, v, o, L, m) - ctx.grid = grid - ctx.sm_scale = sm_scale - ctx.BLOCK_DMODEL = Lk - return o - - @staticmethod - def backward(ctx, do): - BLOCK = 128 - q, k, v, o, l, m = ctx.saved_tensors - do = do.contiguous() - dq = torch.zeros_like(q, dtype=torch.float32) - dk = torch.empty_like(k) - dv = torch.empty_like(v) - do_scaled = torch.empty_like(do) - delta = torch.empty_like(l) - _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( - o, do, l, - do_scaled, delta, - BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, - ) - _bwd_kernel[(ctx.grid[1],)]( - q, k, v, ctx.sm_scale, - o, do_scaled, - dq, dk, dv, - l, m, - delta, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - q.shape[0], q.shape[1], q.shape[2], - ctx.grid[0], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, - num_stages=1, - ) - return dq, dk, dv, None - - -attention = _attention.apply diff --git a/python/triton/ops/matmul.py b/python/triton/ops/matmul.py deleted file mode 100644 index 688186fef16f..000000000000 --- a/python/triton/ops/matmul.py +++ /dev/null @@ -1,163 +0,0 @@ -import torch - -import triton -import triton.language as tl -from .matmul_perf_model import early_config_prune, estimate_matmul_time - - -def init_to_zero(name): - return lambda nargs: 
nargs[name].zero_() - - -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _kernel(A, B, C, M, N, K, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - dot_out_dtype: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * 
GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=dot_out_dtype) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b, out_dtype=dot_out_dtype) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - acc = acc.to(C.dtype.element_ty) - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -class _matmul(torch.autograd.Function): - kernel = _kernel - - _locks = {} - - @staticmethod - def _call(a, b, dot_out_dtype): - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=a.dtype) - if dot_out_dtype is None: - if a.dtype in [torch.float16, torch.float32, torch.bfloat16]: - dot_out_dtype = tl.float32 - else: - dot_out_dtype = tl.int32 - else: - assert isinstance(dot_out_dtype, torch.dtype), "dot_out_dtype must be a torch.dtype" - if dot_out_dtype == torch.float16: - dot_out_dtype = tl.float16 - elif dot_out_dtype in [torch.float32, torch.bfloat16]: - dot_out_dtype = tl.float32 - else: - dot_out_dtype = tl.int32 - # launch kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel[grid](a, b, c, M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - dot_out_dtype=dot_out_dtype, - GROUP_M=8) - return c - - @staticmethod - def forward(ctx, a, b, dot_out_dtype=None): - return _matmul._call(a, b, dot_out_dtype=dot_out_dtype) - - -matmul = _matmul.apply diff --git a/python/triton/ops/matmul_perf_model.py b/python/triton/ops/matmul_perf_model.py deleted file mode 100644 index 740426b13e34..000000000000 --- a/python/triton/ops/matmul_perf_model.py +++ /dev/null @@ -1,158 +0,0 @@ -import heapq - -import torch - -import triton -import triton._C.libtriton.triton as _triton -from triton.runtime import driver -from triton.testing import get_dram_gbps, get_max_simd_tflops, get_max_tensorcore_tflops - - -def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype): - ''' return compute throughput in TOPS ''' - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = min(num_subcores, total_warps) / 
num_subcores * get_max_tensorcore_tflops(dtype, backend, device) - return tflops - - -def get_simd_tflops(backend, device, num_ctas, num_warps, dtype): - ''' return compute throughput in TOPS ''' - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, backend, device) - return tflops - - -def get_tflops(backend, device, num_ctas, num_warps, dtype): - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8 and dtype == torch.float32: - return get_simd_tflops(backend, device, num_ctas, num_warps, dtype) - return get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype) - - -def estimate_matmul_time( - # backend, device, - num_warps, num_stages, - A, B, C, - M, N, K, - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, - debug=False, **kwargs -): - ''' return estimated running time in ms - = max(compute, loading) + store ''' - backend = _triton.runtime.backend.CUDA - device = torch.cuda.current_device() - dtype = A.dtype - dtsize = A.element_size() - - num_cta_m = triton.cdiv(M, BLOCK_M) - num_cta_n = triton.cdiv(N, BLOCK_N) - num_cta_k = SPLIT_K - num_ctas = num_cta_m * num_cta_n * num_cta_k - - # If the input is smaller than the block size - M, N = max(M, BLOCK_M), max(N, BLOCK_N) - - # time to compute - total_ops = 2 * M * N * K / (1024 * 1024 * 1024) # GOPS - tput = get_tflops(backend, device, num_ctas, num_warps, dtype) - compute_ms = total_ops / tput - - # time to load data - num_sm = driver.utils.get_device_properties(device)["multiprocessor_count"] - active_cta_ratio = min(1, num_ctas / num_sm) - active_cta_ratio_bw1 = min(1, num_ctas / 32) # 32 active ctas are enough to saturate - active_cta_ratio_bw2 = max(min(1, (num_ctas - 32) / (108 - 32)), 0) # 32-108, remaining 5% - dram_bw = get_dram_gbps(backend, device) * (active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05) # in GB/s - l2_bw = dram_bw * 4 # rough estimation (should be 4.7 for A100?) - # assume 80% of (following) loads are in L2 cache - load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1)) - load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1) - load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1)) - load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1) - # total - total_dram = (load_a_dram + load_b_dram) / (1024 * 1024) # MB - total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024) - # loading time in ms - load_ms = total_dram / dram_bw + total_l2 / l2_bw - - # estimate storing time - store_bw = dram_bw * 0.6 # :o - store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024) # MB - if SPLIT_K == 1: - store_ms = store_c_dram / store_bw - else: - reduce_bw = store_bw - store_ms = store_c_dram / reduce_bw - # c.zero_() - zero_ms = M * N * 2 / (1024 * 1024) / store_bw - store_ms += zero_ms - - total_time_ms = max(compute_ms, load_ms) + store_ms - if debug: - print(f'Total time: {total_time_ms}ms, compute time: {compute_ms}ms, ' - f'loading time: {load_ms}ms, store time: {store_ms}ms, ' - f'Activate CTAs: {active_cta_ratio*100}%') - return total_time_ms - - -def early_config_prune(configs, named_args): - device = torch.cuda.current_device() - capability = torch.cuda.get_device_capability() - # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages - dtsize = named_args['A'].element_size() - dtype = named_args['A'].dtype - - # 1. 
make sure we have enough smem - pruned_configs = [] - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, num_stages = \ - kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], config.num_stages - - max_shared_memory = driver.utils.get_device_properties(device)["max_shared_mem"] - required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize - if required_shared_memory <= max_shared_memory: - pruned_configs.append(config) - configs = pruned_configs - - # Some dtypes do not allow atomic_add - if dtype not in [torch.float16, torch.float32]: - configs = [config for config in configs if config.kwargs['SPLIT_K'] == 1] - - # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps) - configs_map = {} - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = \ - kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], kw['SPLIT_K'], config.num_warps, config.num_stages - - key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps) - if key in configs_map: - configs_map[key].append((config, num_stages)) - else: - configs_map[key] = [(config, num_stages)] - - pruned_configs = [] - for k, v in configs_map.items(): - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k - if capability[0] >= 8: - # compute cycles (only works for ampere GPUs) - mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16) - mma_cycles = mmas / min(4, num_warps) * 8 - - ldgsts_latency = 300 # Does this matter? - optimal_num_stages = ldgsts_latency / mma_cycles - - # nearest stages, prefer large #stages - nearest = heapq.nsmallest(2, v, key=lambda x: 10 + abs(x[1] - optimal_num_stages) - if (x[1] - optimal_num_stages) < 0 else x[1] - optimal_num_stages) - - for n in nearest: - pruned_configs.append(n[0]) - else: # Volta & Turing only supports num_stages <= 2 - random_config = v[0][0] - random_config.num_stages = 2 - pruned_configs.append(random_config) - return pruned_configs diff --git a/python/triton/runtime/__init__.py b/python/triton/runtime/__init__.py deleted file mode 100644 index a4291ab31c8e..000000000000 --- a/python/triton/runtime/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from .autotuner import (Autotuner, Config, Heuristics, OutOfResources, autotune, - heuristics) -from .driver import driver -from .jit import (JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret, - version_key) - -__all__ = [ - "driver", - "Config", - "Heuristics", - "autotune", - "heuristics", - "JITFunction", - "KernelInterface", - "version_key", - "reinterpret", - "TensorWrapper", - "OutOfResources", - "MockTensor", - "Autotuner", -] diff --git a/python/triton/runtime/autotuner.py b/python/triton/runtime/autotuner.py deleted file mode 100644 index 3cb9f9dbe862..000000000000 --- a/python/triton/runtime/autotuner.py +++ /dev/null @@ -1,244 +0,0 @@ -from __future__ import annotations - -import builtins -import time -from typing import Dict - -from ..testing import do_bench -from .jit import KernelInterface - - -class OutOfResources(Exception): - def __init__(self, required, limit, name): - self.message = f'out of resource: {name}, '\ - f'Required: {required}, '\ - f'Hardware limit: {limit}' - self.message += '. Reducing block sizes or `num_stages` may help.' 
- self.required = required - self.limit = limit - self.name = name - super().__init__(self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return (type(self), (self.required, self.limit, self.name)) - - -class Autotuner(KernelInterface): - def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None): - ''' - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs. - ''' - if not configs: - self.configs = [Config({}, num_warps=4, num_stages=2)] - else: - self.configs = configs - self.key_idx = [arg_names.index(k) for k in key] - self.cache = {} - # hook to reset all required tensor to zeros before relaunching a kernel - self.hook = lambda args: 0 - if reset_to_zero is not None: - self.reset_idx = [arg_names.index(k) for k in reset_to_zero] - - def _hook(args): - for i in self.reset_idx: - args[i].zero_() - self.hook = _hook - self.arg_names = arg_names - # prune configs - if prune_configs_by: - perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k'] - if 'early_config_prune' in prune_configs_by: - early_config_prune = prune_configs_by['early_config_prune'] - else: - perf_model, top_k, early_config_prune = None, None, None - self.perf_model, self.configs_top_k = perf_model, top_k - self.early_config_prune = early_config_prune - self.fn = fn - - def _bench(self, *args, config, **meta): - # check for conflicts, i.e. meta-parameters both provided - # as kwargs and by the autotuner - conflicts = meta.keys() & config.kwargs.keys() - if conflicts: - raise ValueError( - f"Conflicting meta-parameters: {', '.join(conflicts)}." - " Make sure that you don't re-define auto-tuned symbols." 
- ) - # augment meta-parameters with tunable ones - current = dict(meta, **config.kwargs) - - def kernel_call(): - if config.pre_hook: - config.pre_hook(self.nargs) - self.hook(args) - self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current) - try: - return do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8)) - except OutOfResources: - return [float('inf'), float('inf'), float('inf')] - - def run(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - if len(self.configs) > 1: - all_args = {**self.nargs, **kwargs} - _args = [] - for name in self.arg_names: - if name in all_args: - _args.append(all_args[name]) - key = tuple(_args[i] for i in self.key_idx) - if key not in self.cache: - # prune configs - pruned_configs = self.prune_configs(kwargs) - bench_start = time.time() - timings = {config: self._bench(*args, config=config, **kwargs) - for config in pruned_configs} - bench_end = time.time() - self.bench_time = bench_end - bench_start - self.cache[key] = builtins.min(timings, key=timings.get) - self.hook(args) - self.configs_timings = timings - config = self.cache[key] - else: - config = self.configs[0] - self.best_config = config - if config.pre_hook is not None: - config.pre_hook(self.nargs) - return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs) - - def prune_configs(self, kwargs): - pruned_configs = self.configs - if self.early_config_prune: - pruned_configs = self.early_config_prune(self.configs, self.nargs) - if self.perf_model: - top_k = self.configs_top_k - if isinstance(top_k, float) and top_k <= 1.0: - top_k = int(len(self.configs) * top_k) - if len(pruned_configs) > top_k: - est_timing = { - config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages, - num_warps=config.num_warps) - for config in pruned_configs - } - pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] - return pruned_configs - - def warmup(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - for config in self.prune_configs(kwargs): - self.fn.warmup( - *args, - num_warps=config.num_warps, - num_stages=config.num_stages, - **kwargs, - **config.kwargs, - ) - self.nargs = None - - -class Config: - """ - An object that represents a possible kernel configuration for the auto-tuner to try. - - :ivar meta: a dictionary of meta-parameters to pass to the kernel as keyword arguments. - :type meta: dict[Str, Any] - :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. For example, if - `num_warps=8`, then each kernel instance will be automatically parallelized to - cooperatively execute using `8 * 32 = 256` threads. - :type num_warps: int - :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops. - Mostly useful for matrix multiplication workloads on SM80+ GPUs. - :type num_stages: int - :ivar pre_hook: a function that will be called before the kernel is called. Parameters of this - function are args. 
- """ - - def __init__(self, kwargs, num_warps=4, num_stages=2, pre_hook=None): - self.kwargs = kwargs - self.num_warps = num_warps - self.num_stages = num_stages - self.pre_hook = pre_hook - - def __str__(self): - res = [] - for k, v in self.kwargs.items(): - res.append(f'{k}: {v}') - res.append(f'num_warps: {self.num_warps}') - res.append(f'num_stages: {self.num_stages}') - return ', '.join(res) - - -def autotune(configs, key, prune_configs_by=None, reset_to_zero=None): - """ - Decorator for auto-tuning a :code:`triton.jit`'d function. - - .. highlight:: python - .. code-block:: python - - @triton.autotune(configs=[ - triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), - triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), - ], - key=['x_size'] # the two above configs will be evaluated anytime - # the value of x_size changes - ) - @triton.jit - def kernel(x_ptr, x_size, **META): - BLOCK_SIZE = META['BLOCK_SIZE'] - :note: When all the configurations are evaluated, the kernel will run multiple times. - This means that whatever value the kernel updates will be updated multiple times. - To avoid this undesired behavior, you can use the `reset_to_zero` argument, which - resets the value of the provided tensor to `zero` before running any configuration. - :param configs: a list of :code:`triton.Config` objects - :type configs: list[triton.Config] - :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. - :type key: list[str] - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs. - :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. - :type reset_to_zero: list[str] - """ - def decorator(fn): - return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by) - - return decorator - - -class Heuristics(KernelInterface): - - def __init__(self, fn, arg_names, values) -> None: - self.fn = fn - self.values = values - self.arg_names = arg_names - - def run(self, *args, **kwargs): - for v, heur in self.values.items(): - kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs}) - return self.fn.run(*args, **kwargs) - - -def heuristics(values): - """ - Decorator for specifying how the values of certain meta-parameters may be computed. - This is useful for cases where auto-tuning is prohibitevely expensive, or just not applicable. - - .. highlight:: python - .. code-block:: python - - @triton.heuristics(values={'BLOCK_SIZE': lambda args: 2 ** int(math.ceil(math.log2(args[1])))}) - @triton.jit - def kernel(x_ptr, x_size, **META): - BLOCK_SIZE = META['BLOCK_SIZE'] # smallest power-of-two >= x_size - :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter. - each such function takes a list of positional arguments as input. 
- :type values: dict[str, Callable[[list[Any]], Any]] - """ - def decorator(fn): - return Heuristics(fn, fn.arg_names, values) - - return decorator diff --git a/python/triton/runtime/backends/cuda.c b/python/triton/runtime/backends/cuda.c deleted file mode 100644 index a03297639f32..000000000000 --- a/python/triton/runtime/backends/cuda.c +++ /dev/null @@ -1,124 +0,0 @@ -#include "cuda.h" -#define PY_SSIZE_T_CLEAN -#include - -static inline void gpuAssert(CUresult code, const char *file, int line) { - if (code != CUDA_SUCCESS) { - const char *prefix = "Triton Error [CUDA]: "; - const char *str; - cuGetErrorString(code, &str); - char err[1024] = {0}; - strcat(err, prefix); - strcat(err, str); - PyErr_SetString(PyExc_RuntimeError, err); - } -} - -#define CUDA_CHECK(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - if (PyErr_Occurred()) \ - return NULL; \ - } - -static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { - int device_id; - if (!PyArg_ParseTuple(args, "i", &device_id)) - return NULL; - // Get device handle - CUdevice device; - cuDeviceGet(&device, device_id); - - // create a struct to hold device properties - int max_shared_mem; - int multiprocessor_count; - int sm_clock_rate; - int mem_clock_rate; - int mem_bus_width; - CUDA_CHECK(cuDeviceGetAttribute( - &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - device)); - CUDA_CHECK(cuDeviceGetAttribute( - &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); - CUDA_CHECK(cuDeviceGetAttribute(&sm_clock_rate, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); - CUDA_CHECK(cuDeviceGetAttribute( - &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); - CUDA_CHECK(cuDeviceGetAttribute( - &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); - - return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", - max_shared_mem, "multiprocessor_count", - multiprocessor_count, "sm_clock_rate", sm_clock_rate, - "mem_clock_rate", mem_clock_rate, "mem_bus_width", - mem_bus_width); -} - -static PyObject *loadBinary(PyObject *self, PyObject *args) { - const char *name; - const char *data; - Py_ssize_t data_size; - int shared; - int device; - if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, - &device)) { - return NULL; - } - CUfunction fun; - CUmodule mod; - int32_t n_regs = 0; - int32_t n_spills = 0; - // create driver handles - CUDA_CHECK(cuModuleLoadData(&mod, data)); - CUDA_CHECK(cuModuleGetFunction(&fun, mod, name)); - // get allocated registers and spilled registers from the function - CUDA_CHECK(cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun)); - CUDA_CHECK( - cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun)); - n_spills /= 4; - // set dynamic shared memory if necessary - int shared_optin; - CUDA_CHECK(cuDeviceGetAttribute( - &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - device)); - if (shared > 49152 && shared_optin > 49152) { - CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED)); - int shared_total, shared_static; - CUDA_CHECK(cuDeviceGetAttribute( - &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, - device)); - CUDA_CHECK(cuFuncGetAttribute(&shared_static, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun)); - CUDA_CHECK( - cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - shared_optin - shared_static)); - } - - if (PyErr_Occurred()) { - return NULL; - } - return Py_BuildValue("(KKii)", (uint64_t)mod, 
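For orientation, a sketch of consuming the property dictionary that `getDeviceProperties` builds, assuming it is reached through the lazily constructed `driver.utils` object defined later in this patch (`python/triton/runtime/driver.py`); the device index 0 is arbitrary:

from triton.runtime.driver import driver

props = driver.utils.get_device_properties(0)
# keys mirror the Py_BuildValue call above
print(props["multiprocessor_count"], props["max_shared_mem"])
# same DRAM-bandwidth estimate as triton.testing.get_dram_gbps (kHz * bus bits * DDR factor / 8)
print(props["mem_clock_rate"] * props["mem_bus_width"] * 2 / 1e6 / 8, "GB/s")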
(uint64_t)fun, n_regs, - n_spills); -} - -static PyMethodDef ModuleMethods[] = { - {"load_binary", loadBinary, METH_VARARGS, - "Load provided cubin into CUDA driver"}, - {"get_device_properties", getDeviceProperties, METH_VARARGS, - "Get the properties for a given device"}, - {NULL, NULL, 0, NULL} // sentinel -}; - -static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils", - NULL, // documentation - -1, // size - ModuleMethods}; - -PyMODINIT_FUNC PyInit_cuda_utils(void) { - PyObject *m = PyModule_Create(&ModuleDef); - if (m == NULL) { - return NULL; - } - PyModule_AddFunctions(m, ModuleMethods); - return m; -} diff --git a/python/triton/runtime/backends/hip.c b/python/triton/runtime/backends/hip.c deleted file mode 100644 index 5ed5f19ce837..000000000000 --- a/python/triton/runtime/backends/hip.c +++ /dev/null @@ -1,120 +0,0 @@ -#define __HIP_PLATFORM_AMD__ -#include -#define PY_SSIZE_T_CLEAN -#include -#include -#include - -static inline void gpuAssert(hipError_t code, const char *file, int line) { - { - if (code != HIP_SUCCESS) { - { - const char *prefix = "Triton Error [HIP]: "; - const char *str = hipGetErrorString(code); - char err[1024] = {0}; - snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str); - PyErr_SetString(PyExc_RuntimeError, err); - } - } - } -} - -#define HIP_CHECK(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - if (PyErr_Occurred()) \ - return NULL; \ - } - -static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { - int device_id; - if (!PyArg_ParseTuple(args, "i", &device_id)) - return NULL; - - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, device_id)); - - // create a struct to hold device properties - return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", - props.sharedMemPerBlock, "multiprocessor_count", - props.multiProcessorCount, "sm_clock_rate", - props.clockRate, "mem_clock_rate", props.memoryClockRate, - "mem_bus_width", props.memoryBusWidth); -} - -static PyObject *loadBinary(PyObject *self, PyObject *args) { - const char *name; - const char *data; - Py_ssize_t data_size; - int shared; - int device; - if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, - &device)) { - return NULL; - } - - // Open HSACO file - FILE *hsaco_file; - if ((hsaco_file = fopen(data, "rb")) == NULL) { - return NULL; - } - - // Read HSCAO file into Buffer - fseek(hsaco_file, 0L, SEEK_END); - size_t hsaco_file_size = ftell(hsaco_file); - unsigned char *hsaco = - (unsigned char *)malloc(hsaco_file_size * sizeof(unsigned char)); - rewind(hsaco_file); - fread(hsaco, sizeof(unsigned char), hsaco_file_size, hsaco_file); - fclose(hsaco_file); - - // set HIP options - hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; - const unsigned int errbufsize = 8192; - const unsigned int logbufsize = 8192; - char _err[errbufsize]; - char _log[logbufsize]; - void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err, - (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1}; - - // launch HIP Binary - hipModule_t mod; - hipFunction_t fun; - hipModuleLoadDataEx(&mod, hsaco, 5, opt, optval); - hipModuleGetFunction(&fun, mod, name); - free(hsaco); - - // get allocated registers and spilled registers from the function - int n_regs = 0; - int n_spills = 0; - if (PyErr_Occurred()) { - return NULL; - } - return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, - 
n_spills); -} - -static PyMethodDef ModuleMethods[] = { - {"load_binary", loadBinary, METH_VARARGS, - "Load provided hsaco into HIP driver"}, - {"get_device_properties", getDeviceProperties, METH_VARARGS, - "Get the properties for a given device"}, - {NULL, NULL, 0, NULL} // sentinel -}; - -static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils", - NULL, // documentation - -1, // size - ModuleMethods}; - -PyMODINIT_FUNC PyInit_hip_utils(void) { - PyObject *m = PyModule_Create(&ModuleDef); - if (m == NULL) { - return NULL; - } - PyModule_AddFunctions(m, ModuleMethods); - return m; -} diff --git a/python/triton/runtime/cache.py b/python/triton/runtime/cache.py deleted file mode 100644 index 43e6660a59df..000000000000 --- a/python/triton/runtime/cache.py +++ /dev/null @@ -1,131 +0,0 @@ -import json -import os -import random -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Dict, Optional - - -def default_cache_dir(): - return os.path.join(Path.home(), ".triton", "cache") - - -class CacheManager(ABC): - def __init__(self, key): - pass - - @abstractmethod - def get_file(self, filename) -> Optional[str]: - pass - - @abstractmethod - def has_file(self, filename) -> bool: - pass - - @abstractmethod - def put(self, data, filename, binary=True) -> str: - pass - - @abstractmethod - def get_group(self, filename: str) -> Optional[Dict[str, str]]: - pass - - @abstractmethod - def put_group(self, filename: str, group: Dict[str, str]): - pass - - -class FileCacheManager(CacheManager): - def __init__(self, key): - self.key = key - self.lock_path = None - # create cache directory if it doesn't exist - self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) - if self.cache_dir: - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - os.makedirs(self.cache_dir, exist_ok=True) - - def _make_path(self, filename) -> str: - return os.path.join(self.cache_dir, filename) - - def has_file(self, filename): - if not self.cache_dir: - return False - return os.path.exists(self._make_path(filename)) - - def get_file(self, filename) -> Optional[str]: - if self.has_file(filename): - return self._make_path(filename) - else: - return None - - def get_group(self, filename: str) -> Optional[Dict[str, str]]: - grp_filename = f"__grp__{filename}" - if not self.has_file(grp_filename): - return None - grp_filepath = self._make_path(grp_filename) - with open(grp_filepath) as f: - grp_data = json.load(f) - child_paths = grp_data.get("child_paths", None) - # Invalid group data. 
- if child_paths is None: - return None - result = {} - for c in child_paths: - p = self._make_path(c) - if not os.path.exists(p): - raise Exception(f"Group file {p} does not exist from group {grp_filename} ") - result[c] = p - return result - - # Note a group of pushed files as being part of a group - def put_group(self, filename: str, group: Dict[str, str]): - if not self.cache_dir: - return - grp_contents = json.dumps({"child_paths": sorted(list(group.keys()))}) - grp_filename = f"__grp__{filename}" - return self.put(grp_contents, grp_filename, binary=False) - - def put(self, data, filename, binary=True) -> str: - if not self.cache_dir: - return - binary = isinstance(data, bytes) - if not binary: - data = str(data) - assert self.lock_path is not None - filepath = self._make_path(filename) - # Random ID to avoid any collisions - rnd_id = random.randint(0, 1000000) - # we use the PID incase a bunch of these around so we can see what PID made it - pid = os.getpid() - # use tempfile to be robust against program interruptions - temp_path = f"{filepath}.tmp.pid_{pid}_{rnd_id}" - mode = "wb" if binary else "w" - with open(temp_path, mode) as f: - f.write(data) - # Replace is guaranteed to be atomic on POSIX systems if it succeeds - # so filepath cannot see a partial write - os.replace(temp_path, filepath) - return filepath - - -__cache_cls = FileCacheManager -__cache_cls_nme = "DEFAULT" - - -def get_cache_manager(key) -> CacheManager: - import os - - user_cache_manager = os.environ.get("TRITON_CACHE_MANAGER", None) - global __cache_cls - global __cache_cls_nme - - if user_cache_manager is not None and user_cache_manager != __cache_cls_nme: - import importlib - module_path, clz_nme = user_cache_manager.split(":") - module = importlib.import_module(module_path) - __cache_cls = getattr(module, clz_nme) - __cache_cls_nme = user_cache_manager - - return __cache_cls(key) diff --git a/python/triton/runtime/driver.py b/python/triton/runtime/driver.py deleted file mode 100644 index 3850821536c5..000000000000 --- a/python/triton/runtime/driver.py +++ /dev/null @@ -1,174 +0,0 @@ -import abc -import hashlib -import os -import tempfile -from pathlib import Path - -from ..common.build import _build -from .cache import get_cache_manager - - -class DriverBase(metaclass=abc.ABCMeta): - - CUDA = 0 - HIP = 1 - - @staticmethod - def third_party_dir(): - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "third_party") - - def __init__(self) -> None: - pass -# ----------------------------- -# CUDA -# ----------------------------- - - -class CudaUtils(object): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(CudaUtils, cls).__new__(cls) - return cls.instance - - def __init__(self): - dirname = os.path.dirname(os.path.realpath(__file__)) - src = Path(os.path.join(dirname, "backends", "cuda.c")).read_text() - key = hashlib.md5(src.encode("utf-8")).hexdigest() - cache = get_cache_manager(key) - fname = "cuda_utils.so" - cache_path = cache.get_file(fname) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build("cuda_utils", src_path, tmpdir) - with open(so, "rb") as f: - cache_path = cache.put(f.read(), fname, binary=True) - import importlib.util - spec = importlib.util.spec_from_file_location("cuda_utils", cache_path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - self.load_binary = mod.load_binary - 
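A sketch of the plug-in path taken by `get_cache_manager` above when `TRITON_CACHE_MANAGER` is set; the module `my_pkg.cache` and the `LoggingCacheManager` class are hypothetical:

# my_pkg/cache.py (hypothetical)
from triton.runtime.cache import FileCacheManager

class LoggingCacheManager(FileCacheManager):
    def put(self, data, filename, binary=True):
        # report every artifact written to the per-kernel cache directory
        print(f"caching {filename} under key {self.key}")
        return super().put(data, filename, binary=binary)

# selected before any kernel is compiled, e.g. in the shell:
#   export TRITON_CACHE_MANAGER=my_pkg.cache:LoggingCacheManager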
self.get_device_properties = mod.get_device_properties - - -class CudaDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(CudaDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = CudaUtils() - self.backend = self.CUDA - -# ----------------------------- -# HIP -# ----------------------------- - - -class HIPUtils(object): - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(HIPUtils, cls).__new__(cls) - return cls.instance - - def __init__(self): - dirname = os.path.dirname(os.path.realpath(__file__)) - src = Path(os.path.join(dirname, "backends", "hip.c")).read_text() - key = hashlib.md5(src.encode("utf-8")).hexdigest() - cache = get_cache_manager(key) - fname = "hip_utils.so" - cache_path = cache.get_file(fname) - if cache_path is None: - with tempfile.TemporaryDirectory() as tmpdir: - src_path = os.path.join(tmpdir, "main.c") - with open(src_path, "w") as f: - f.write(src) - so = _build("hip_utils", src_path, tmpdir) - with open(so, "rb") as f: - cache_path = cache.put(f.read(), fname, binary=True) - import importlib.util - spec = importlib.util.spec_from_file_location("hip_utils", cache_path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - self.load_binary = mod.load_binary - self.get_device_properties = mod.get_device_properties - - -class HIPDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(HIPDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = HIPUtils() - self.backend = self.HIP - - -class UnsupportedDriver(DriverBase): - - def __new__(cls): - if not hasattr(cls, 'instance'): - cls.instance = super(UnsupportedDriver, cls).__new__(cls) - return cls.instance - - def __init__(self): - self.utils = None - self.backend = None - -# ----------------------------- -# Driver -# ----------------------------- - - -class LazyProxy: - def __init__(self, init_fn): - self._init_fn = init_fn - self._obj = None - - def _initialize_obj(self): - if self._obj is None: - self._obj = self._init_fn() - - def __getattr__(self, name): - self._initialize_obj() - return getattr(self._obj, name) - - def __setattr__(self, name, value): - if name in ['_init_fn', '_obj']: - super().__setattr__(name, value) - else: - self._initialize_obj() - setattr(self._obj, name, value) - - def __delattr__(self, name): - self._initialize_obj() - delattr(self._obj, name) - - def __repr__(self): - if self._obj is None: - return f"<{self.__class__.__name__} for {self._init_fn} not yet initialized>" - return repr(self._obj) - - def __str__(self): - self._initialize_obj() - return str(self._obj) - - -def initialize_driver(): - import torch - if torch.version.hip is not None: - return HIPDriver() - elif torch.cuda.is_available(): - return CudaDriver() - else: - return UnsupportedDriver() - - -driver = LazyProxy(initialize_driver) diff --git a/python/triton/runtime/errors.py b/python/triton/runtime/errors.py deleted file mode 100644 index 4ff900574c4f..000000000000 --- a/python/triton/runtime/errors.py +++ /dev/null @@ -1,15 +0,0 @@ - -class OutOfResources(Exception): - def __init__(self, required, limit, name): - self.message = f'out of resource: {name}, '\ - f'Required: {required}, '\ - f'Hardware limit: {limit}' - self.message += '. Reducing block sizes or `num_stages` may help.' 
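The `LazyProxy` above defers `initialize_driver()` (and with it the `torch` import and device probing) until the first attribute access; a minimal sketch of the same pattern, reusing the class from `python/triton/runtime/driver.py` with a hypothetical `Expensive` payload:

from triton.runtime.driver import LazyProxy

class Expensive:
    def __init__(self):
        print("initializing")   # does not run at proxy-construction time
        self.value = 42

proxy = LazyProxy(Expensive)    # nothing is constructed yet
print(proxy.value)              # first access: prints "initializing", then 42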
- self.required = required - self.limit = limit - self.name = name - super().__init__(self.message) - - def __reduce__(self): - # this is necessary to make CompilationError picklable - return (type(self), (self.required, self.limit, self.name)) diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py deleted file mode 100644 index 787cd4c0d3ed..000000000000 --- a/python/triton/runtime/jit.py +++ /dev/null @@ -1,532 +0,0 @@ -from __future__ import annotations, division - -import ast -import functools -import hashlib -import inspect -import os -import subprocess -import textwrap -from collections import defaultdict, namedtuple -from typing import Callable, Generic, Iterable, Optional, TypeVar, Union, cast, overload - -import triton - - -def get_cuda_stream(idx=None): - if idx is None: - idx = get_current_device() - try: - from torch._C import _cuda_getCurrentRawStream - return _cuda_getCurrentRawStream(idx) - except ImportError: - import torch - return torch.cuda.current_stream(idx).cuda_stream - - -def get_current_device(): - import torch - return torch.cuda.current_device() - - -def set_current_device(idx): - import torch - torch.cuda.set_device(idx) - - -def get_device_capability(idx): - import torch - return torch.cuda.get_device_capability(idx) - - -T = TypeVar('T') - -# ----------------------------------------------------------------------------- -# Dependencies Finder -# ----------------------------------------------------------------------------- - - -class DependenciesFinder(ast.NodeVisitor): - """ - This AST visitor is used to find dependencies of a JITFunction. This can - be used to invalidate a JITFunction's hash when its source code -- or - that of its dependencies -- changes. - """ - - def __init__(self, globals, src) -> None: - super().__init__() - self.ret = hashlib.md5(src.encode("utf-8")).hexdigest() - self.globals = globals - - def visit_Name(self, node): - return self.globals.get(node.id, None) - - def visit_Attribute(self, node): - lhs = self.visit(node.value) - while isinstance(lhs, ast.Attribute): - lhs = self.visit(lhs.value) - if lhs is None or lhs is triton: - return None - return getattr(lhs, node.attr) - - def visit_Call(self, node): - func = self.visit(node.func) - if func is None: - return - if inspect.isbuiltin(func): - return - if func.__module__ and func.__module__.startswith('triton.'): - return - assert isinstance(func, JITFunction), f"Function \"{func.__name__}\" is being called from a Triton function but is not a Triton function itself. 
Decorate it with @triton.jit to fix this" - if func.hash is None: - tree = ast.parse(func.src) - finder = DependenciesFinder(func.__globals__, func.src) - finder.visit(tree) - func.hash = finder.ret - noinline = str(getattr(func, 'noinline', False)) - self.ret = (self.ret + func.hash + noinline).encode("utf-8") - self.ret = hashlib.md5(self.ret).hexdigest() - -# ----------------------------------------------------------------------------- -# JITFunction -# ----------------------------------------------------------------------------- - - -@functools.lru_cache() -def version_key(): - import pkgutil - contents = [] - # frontend - with open(__file__, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # compiler - compiler_path = os.path.join(*triton.__path__, 'compiler') - for lib in pkgutil.iter_modules([compiler_path]): - with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # backend - with open(triton._C.libtriton.__file__, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # language - language_path = os.path.join(*triton.__path__, 'language') - for lib in pkgutil.iter_modules([language_path]): - with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # ptxas version - try: - ptxas_version = hashlib.md5(subprocess.check_output(["ptxas", "--version"])).hexdigest() - except Exception: - ptxas_version = '' - return '-'.join(triton.__version__) + '-' + ptxas_version + '-' + '-'.join(contents) - - -class KernelInterface(Generic[T]): - run: T - - def __getitem__(self, grid) -> T: - """ - A JIT function is launched with: fn[grid](*args, **kwargs). - Hence JITFunction.__getitem__ returns a callable proxy that - memorizes the grid. 
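A sketch of the `fn[grid](...)` launch path that `KernelInterface.__getitem__` provides, assuming the usual `triton.jit` / `triton.language` surface; `double` is an illustrative kernel:

import torch
import triton
import triton.language as tl

@triton.jit
def double(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    tl.store(x_ptr + offs, 2 * tl.load(x_ptr + offs, mask=mask), mask=mask)

x = torch.ones(10_000, device='cuda')
# __getitem__ returns a proxy that remembers the grid and forwards to run()
grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']),)
double[grid](x, x.numel(), BLOCK_SIZE=1024)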
- """ - return cast(T, functools.partial(cast(Callable, self.run), grid=grid)) - - -class JITFunction(KernelInterface[T]): - - # Hook for inspecting compiled functions and modules - cache_hook = None - divisibility = 16 - - @staticmethod - def _key_of(arg): - if hasattr(arg, "dtype"): - return arg.dtype - elif isinstance(arg, bool): - return "i1" - elif isinstance(arg, int): - if -2**31 <= arg and arg <= 2**31 - 1: - return "i32" - elif 2**63 <= arg and arg <= 2**64 - 1: - return "u64" - else: - return "i64" - elif isinstance(arg, float): - return 'fp32' - elif arg is None: - return None - else: - raise TypeError(f'Unsupported type {type(arg)} for {arg}') - - @staticmethod - def _spec_of(arg): - if hasattr(arg, "data_ptr"): - return (arg.data_ptr() % JITFunction.divisibility == 0) - elif isinstance(arg, int): - return (arg % 16 == 0, arg == 1) - return (arg is None, ) - - def _get_config(self, *args): - def is_divisible_by_16(x): - if hasattr(x, "data_ptr"): - return x.data_ptr() % JITFunction.divisibility == 0 - elif isinstance(x, int): - return x % JITFunction.divisibility == 0 - if x is None: - return True - return False - divisible_by_16 = {i for i, arg in enumerate(args) if is_divisible_by_16(arg) and i not in self.do_not_specialize} - equal_to_1 = {i for i, arg in enumerate(args) if not isinstance(arg, bool) and isinstance(arg, int) and arg == 1 and i not in self.do_not_specialize} - return namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"])(tuple(divisible_by_16), tuple(equal_to_1)) - # return _triton.code_gen.instance_descriptor(divisible_by_16, equal_to_1) - - @staticmethod - def _type_of(key): - # None are nullptr -- implicitly converted to *i8 - if key is None: - return '*i8' - dtype_str = str(key).split(".")[-1] - tys = { - "bool": "i1", - "float8e5": "fp8e5", - "float8e4": "fp8e4", - "float16": "fp16", - "bfloat16": "bf16", - "float32": "fp32", - "float64": "fp64", - "int8": "i8", - "int16": "i16", - "int32": "i32", - "int64": "i64", - "uint8": "u8", - "uint16": "u16", - "uint32": "u32", - "uint64": "u64", - } - # reinterpret can create triton type - for v in list(tys.values()): - tys[v] = v - return key if isinstance(key, str) else f"*{tys[dtype_str]}" - - def _make_signature(self, sig_key): - signature = ",".join([self._type_of(k) for i, k in enumerate(sig_key)]) - return signature - - def _make_constants(self, constexpr_key): - constants = dict(zip(self.constexprs, constexpr_key)) - return constants - - def _call_hook(self, key, signature, device, constants, num_warps, num_stages, extern_libs, configs): - if JITFunction.cache_hook is None: - return False - name = self.fn.__name__ - module = self.fn.__module__ - arg_reprs = ', '.join([f'{name}: {ty}' for name, ty in zip(self.arg_names, key[1])]) - repr = f"{name}[num_warps={num_warps}, num_stages={num_stages}]({arg_reprs})" - key = str(key) - - class LegacyCompiler: - def __init__(self, module, name): - self.module = module - self.name = name - pass - - kwargs = dict(signature=signature, device=device, constants=constants, - num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, - configs=configs) - - return JITFunction.cache_hook(key=key, repr=repr, fn=LegacyCompiler(module, name), compile={"key": key, **kwargs}, is_manual_warmup=False, already_compiled=False) - - def _get_arg_specialization_key(self, arg) -> str: - arg_annotation = self.__annotations__.get(arg, '') - if arg_annotation == '': - return f'({arg}.data_ptr() % {JITFunction.divisibility} == 0) if hasattr({arg}, "data_ptr") \ - else 
({arg} % {JITFunction.divisibility} == 0, {arg} == 1) if isinstance({arg}, int) \ - else (False,)' - elif 'Tensor' in arg_annotation: - return f'({arg}.data_ptr() % {JITFunction.divisibility} == 0)' - elif arg_annotation == 'int': - return f'({arg} % {JITFunction.divisibility} == 0, {arg} == 1)' - else: - return '(False,)' - - def _get_arg_sig_key(self, arg) -> str: - arg_annotation = self.__annotations__.get(arg, '') - if 'Tensor' in arg_annotation: - return f'{arg}.dtype' - elif arg_annotation == 'bool': - return "i1" - elif arg_annotation == 'float': - return 'fp32' - else: - return f'_key_of({arg})' - - def _make_launcher(self): - regular_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i not in self.constexprs] - constexpr_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i in self.constexprs] - args = ', '.join(regular_args) - # cache key for regular argument type - sig_keys = ', '.join([self._get_arg_sig_key(arg) for arg in regular_args]) - # cache key for constexpr argument values - constexpr_keys = ', '.join(constexpr_args) - # cache key for argument specialization - specializations = [] - for i, arg in enumerate(regular_args): - if i in self.do_not_specialize: - continue - specializations += [self._get_arg_specialization_key(arg)] - - spec_keys = ', '.join(specializations) - grid_args = ','.join([f'"{arg}": {arg}' for arg in self.arg_names]) - - src = f""" -def {self.fn.__name__}({', '.join(self.arg_names)}, grid, num_warps=4, num_stages=3, extern_libs=None, stream=None, warmup=False, device=None): - sig_key = {sig_keys}, - constexpr_key = {f'{constexpr_keys},' if len(constexpr_keys) > 0 else ()} - spec_key = {f'{spec_keys},' if len(spec_keys) > 0 else ()} - key = (version_key, sig_key, constexpr_key, spec_key, num_warps, num_stages, self.debug) - if not extern_libs is None: - key = (key, tuple(extern_libs.items())) - assert num_warps > 0 and (num_warps & (num_warps - 1)) == 0, "num_warps must be a power of 2" - if callable(grid): - grid = grid({{{grid_args}}}) - grid_size = len(grid) - grid_0 = grid[0] - grid_1 = grid[1] if grid_size > 1 else 1 - grid_2 = grid[2] if grid_size > 2 else 1 - if device is None: - device = get_current_device() - set_current_device(device) - if stream is None and not warmup: - stream = get_cuda_stream(device) - bin = cache[device].get(key, None) - if bin is not None: - if not warmup: - bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, {args}) - return bin - # kernel not cached -- compile - else: - # build dict of constant values - args = [{args}] - all_args = {', '.join([f'{arg}' for arg in self.arg_names])}, - configs = self._get_config(*all_args), - constants = self._make_constants(constexpr_key) - constants.update({{i: None for i, arg in enumerate(all_args) if arg is None}}) - constants.update({{i: 1 for i in configs[0].equal_to_1}}) - # build kernel signature -- doesn't include specialized arguments - signature = {{ i: self._type_of(_key_of(arg)) for i, arg in enumerate(all_args) if i not in self.constexprs }} - # build stub signature -- includes arguments that are specialized - for i, arg in constants.items(): - if callable(arg): - raise TypeError(f"Callable constexpr at index {{i}} is not supported") - if not self._call_hook(key, signature, device, constants, num_warps, num_stages, extern_libs, configs): - bin = triton.compile(self, signature=signature, device=device, 
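The cache key assembled in the generated launcher above includes `spec_key`, so flipping an integer argument's divisibility-by-16 (or equality to 1) forces a fresh compile; a sketch of opting a single argument out via `do_not_specialize` (given by name here, while `JITFunction.__init__` below also accepts indices); the `fill` kernel is illustrative:

import triton
import triton.language as tl

@triton.jit(do_not_specialize=['n_elements'])
def fill(x_ptr, value, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # n_elements no longer contributes a specialization key, so calls with
    # 1, 16, or 17 elements can reuse one compiled binary (signature permitting)
    tl.store(x_ptr + offs, value, mask=offs < n_elements)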
constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs, debug=self.debug) - if not warmup: - bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, *args) - self.cache[device][key] = bin - return bin - return None -""" - scope = {"version_key": version_key(), "get_cuda_stream": get_cuda_stream, - "self": self, "_spec_of": self._spec_of, "_key_of": self._key_of, - "cache": self.cache, "triton": triton, - "get_current_device": get_current_device, - "set_current_device": set_current_device} - exec(src, scope) - return scope[self.fn.__name__] - - def __init__(self, fn, version=None, do_not_specialize=None, debug=None, noinline=None): - self.fn = fn - self.module = fn.__module__ - self.version = version - # function signature information - signature = inspect.signature(fn) - self.arg_names = [v.name for v in signature.parameters.values()] - self.has_defaults = any(v.default != inspect._empty for v in signature.parameters.values()) - # specialization hints - self.do_not_specialize = [] if do_not_specialize is None else do_not_specialize - self.do_not_specialize = {self.arg_names.index(arg) if isinstance(arg, str) else arg for arg in self.do_not_specialize} - # function source code (without decorators) - self.src = textwrap.dedent(inspect.getsource(fn)) - self.src = self.src[self.src.find("def"):] - # cache of just-in-time compiled kernels - self.cache = defaultdict(dict) - self.hash = None - # JITFunction can be instantiated as kernel - # when called with a grid using __getitem__ - self.kernel_decorators = [] - self.kernel = None - self.debug = True if os.environ.get("TRITON_DEBUG", "0") == "1" else debug - self.noinline = noinline - # annotations - normalize_ty = lambda ty: ty.__name__ if isinstance(ty, type) else ty - self.__annotations__ = {name: normalize_ty(ty) for name, ty in fn.__annotations__.items()} - # index of constexprs - self.constexprs = [self.arg_names.index(name) for name, ty in self.__annotations__.items() if 'constexpr' in ty] - # launcher - self.run = self._make_launcher() - # re-use docs of wrapped function - self.__doc__ = fn.__doc__ - self.__name__ = fn.__name__ - self.__globals__ = fn.__globals__ - self.__module__ = fn.__module__ - - @property - def cache_key(self): - # TODO : hash should be attribute of `self` - if self.hash is None: - dependencies_finder = DependenciesFinder(globals=self.__globals__, src=self.src) - dependencies_finder.visit(self.parse()) - self.hash = dependencies_finder.ret + version_key() - return self.hash - - def warmup(self, *args, **kwargs): - return self.run(*map(MockTensor.wrap_dtype, args), **kwargs, warmup=True) - - # we do not parse `src` in the constructor because - # the user might want to monkey-patch self.src dynamically. - # Our unit tests do this, for example. 
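A sketch of ahead-of-time compilation through `warmup`, which maps plain `torch` dtypes to `MockTensor` stand-ins so no device memory is needed for the arguments; the `copy` kernel and block size are illustrative, and `grid` is required because `warmup` forwards to the generated launcher above:

import torch
import triton
import triton.language as tl

@triton.jit
def copy(dst_ptr, src_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

# compiles (and caches) the kernel without launching it
copy.warmup(torch.float32, torch.float32, 1024, grid=(1,), BLOCK_SIZE=256)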
- def parse(self): - tree = ast.parse(self.src) - assert isinstance(tree, ast.Module) - assert len(tree.body) == 1 - assert isinstance(tree.body[0], ast.FunctionDef) - return tree - - def __call__(self, *args, **kwargs): - raise RuntimeError("Cannot call @triton.jit'd outside of the scope of a kernel") - - def __setattr__(self, name, value): - # - when kernel decorators change, cached kernel - # needs to be cleared - if name == 'kernel_decorators': - self.kernel = None - super(JITFunction, self).__setattr__(name, value) - # - when `.src` attribute is set, cache path needs - # to be reinitialized - if name == 'src': - self.hash = None - - def __repr__(self): - return f"JITFunction({self.module}:{self.fn.__name__})" - - -# ----------------------------------------------------------------------------- -# `jit` decorator -# ----------------------------------------------------------------------------- - - -@overload -def jit(fn: T) -> JITFunction[T]: - ... - - -@overload -def jit( - *, - version=None, - do_not_specialize: Optional[Iterable[int]] = None, - debug: Optional[bool] = None, - noinline: Optional[bool] = None, -) -> Callable[[T], JITFunction[T]]: - ... - - -def jit( - fn: Optional[T] = None, - *, - version=None, - do_not_specialize: Optional[Iterable[int]] = None, - debug: Optional[bool] = None, - noinline: Optional[bool] = None, - interpret: Optional[bool] = None, -) -> Union[JITFunction[T], Callable[[T], JITFunction[T]]]: - """ - Decorator for JIT-compiling a function using the Triton compiler. - - :note: When a jit'd function is called, arguments are - implicitly converted to pointers if they have a :code:`.data_ptr()` method - and a `.dtype` attribute. - - :note: This function will be compiled and run on the GPU. It will only have access to: - - * python primitives, - * builtins within the triton package, - * arguments to this function, - * other jit'd functions - - :param fn: the function to be jit-compiled - :type fn: Callable - """ - - def decorator(fn: T) -> JITFunction[T]: - assert callable(fn) - if interpret: - from ..debugger.debugger import GridSelector - return GridSelector(fn) - else: - return JITFunction( - fn, - version=version, - do_not_specialize=do_not_specialize, - debug=debug, - noinline=noinline, - ) - if fn is not None: - return decorator(fn) - - else: - return decorator - -# ----------------------------------------------------------------------------- -# Utilities for mocking tensors -# ----------------------------------------------------------------------------- - - -class MockTensor: - """ - Can be used in place of real tensors when calling: - kernel.warmup(MockTensor(torch.float32), ...) - """ - @staticmethod - def wrap_dtype(arg): - if arg.__class__.__name__ == "dtype" and\ - arg.__module__ == "torch": - return MockTensor(arg) - return arg - - def __init__(self, dtype): - self.dtype = dtype - - @staticmethod - def data_ptr(): - return 0 # optimistically assumes multiple of 16 - - -class TensorWrapper: - def __init__(self, base, dtype): - self.dtype = dtype - self.base = base - self.is_cuda = base.is_cuda - self.device = base.device - - def data_ptr(self): - return self.base.data_ptr() - - def __str__(self) -> str: - return f'TensorWrapper[{self.dtype}]({self.base})' - - -def reinterpret(tensor, dtype): - if isinstance(tensor, TensorWrapper): - if dtype == tensor.base.dtype: - # Reinterpreting to the original interpretation; return the base. - return tensor.base - else: - # Reinterpreting a wrapped tensor to a different type. 
- return TensorWrapper(tensor.base, dtype) - elif hasattr(tensor, "data_ptr"): - # A new wrapper is needed around an unwrapped tensor. - return TensorWrapper(tensor, dtype) - else: - raise TypeError(f'Cannot reinterpret a {type(tensor)}.') diff --git a/python/triton/testing.py b/python/triton/testing.py deleted file mode 100644 index 321f03dbe8ba..000000000000 --- a/python/triton/testing.py +++ /dev/null @@ -1,423 +0,0 @@ -import functools -import os -import subprocess -import sys -from contextlib import contextmanager - -import triton._C.libtriton.triton as _triton - - -def nvsmi(attrs): - attrs = ','.join(attrs) - cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(',') - ret = [int(x) for x in ret] - return ret - - -def do_bench(fn, warmup=25, rep=100, grad_to_none=None, - quantiles=None, - fast_flush=True, - return_mode="mean"): - assert return_mode in ["min", "max", "mean", "median"] - import torch - """ - Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with - the 20-th and 80-th performance percentile. - - :param fn: Function to benchmark - :type fn: Callable - :param warmup: Warmup time (in ms) - :type warmup: int - :param rep: Repetition time (in ms) - :type rep: int - :param grad_to_none: Reset the gradient of the provided tensor to None - :type grad_to_none: torch.tensor, optional - :param quantiles: Performance percentile to return in addition to the median. - :type quantiles: list[float] - :param fast_flush: Use faster kernel to flush L2 between measurements - :type fast_flush: bool - """ - - fn() - torch.cuda.synchronize() - - # We maintain a buffer of 256 MB that we clear - # before each kernel call to make sure that the L2 - # doesn't contain any input data before the run - if fast_flush: - cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda') - else: - cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda') - - # Estimate the runtime of the function - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - for _ in range(5): - cache.zero_() - fn() - end_event.record() - torch.cuda.synchronize() - estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - n_warmup = max(1, int(warmup / estimate_ms)) - n_repeat = max(1, int(rep / estimate_ms)) - start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] - # Warm-up - for _ in range(n_warmup): - fn() - # Benchmark - for i in range(n_repeat): - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. 
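A sketch of `reinterpret` above, which wraps a tensor in `TensorWrapper` so a kernel sees a different element type over the same storage; the fp8 dtype name follows the `_type_of` table earlier in this file, and the uint8 buffer is illustrative:

import torch
import triton.language as tl
from triton.runtime.jit import reinterpret

raw = torch.randint(0, 256, (1024,), dtype=torch.uint8, device='cuda')
as_fp8 = reinterpret(raw, tl.float8e5)        # TensorWrapper: same storage, new dtype
assert as_fp8.data_ptr() == raw.data_ptr()    # no copy, only a reinterpretation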
So we clear the - # provided gradients - if grad_to_none is not None: - for x in grad_to_none: - x.grad = None - # we clear the L2 cache before each run - cache.zero_() - # record time of `fn` - start_event[i].record() - fn() - end_event[i].record() - # Record clocks - torch.cuda.synchronize() - times = torch.tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)]) - if quantiles is not None: - ret = torch.quantile(times, torch.tensor(quantiles)).tolist() - if len(ret) == 1: - ret = ret[0] - return ret - return getattr(torch, return_mode)(times).item() - - -def assert_close(x, y, atol=None, rtol=None, err_msg=''): - import numpy as np - import torch - - # canonicalize arguments to be tensors - if not isinstance(x, torch.Tensor): - x = torch.tensor(x) - if not isinstance(y, torch.Tensor): - y = torch.tensor(y) - # absolute tolerance - if atol is None: - atol = 1e-2 - atol = atol(x.dtype) if callable(atol) else atol - # relative tolerance hook - if rtol is None: - rtol = 0. - rtol = rtol(x.dtype) if callable(rtol) else rtol - # we use numpy instead of pytorch - # as it seems more memory efficient - # pytorch tends to oom on large tensors - if isinstance(x, torch.Tensor): - if x.dtype == torch.bfloat16: - x = x.float() - x = x.cpu().detach().numpy() - if isinstance(y, torch.Tensor): - if y.dtype == torch.bfloat16: - y = y.float() - y = y.cpu().detach().numpy() - # we handle size==1 case separately as we can - # provide better error message there - if x.size > 1 or y.size > 1: - np.testing.assert_allclose(x, y, atol=atol, rtol=rtol, equal_nan=True) - return - if not np.allclose(x, y, atol=atol, rtol=rtol): - raise AssertionError(f'{err_msg} {x} is not close to {y} (atol={atol}, rtol={rtol})') - - -class Benchmark: - """ - This class is used by the :code:`perf_report` function to generate line plots with a concise API. - """ - - def __init__( - self, - x_names, - x_vals, - line_arg, - line_vals, - line_names, - plot_name, - args, - xlabel='', - ylabel='', - x_log=False, - y_log=False, - color=None, - styles=None, - ): - """ - Constructor - - :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value. - :type x_names: List[str] - :param x_vals: List of values to use for the arguments in :code:`x_names`. - :type x_vals: List[Any] - :param line_arg: Argument name for which different values correspond to different lines in the plot. - :type line_arg: str - :param line_vals: List of values to use for the arguments in :code:`line_arg`. - :type line_vals: List[str] - :param line_names: Label names for the different lines. - :type line_names: List[str] - :param plot_name: Name of the plot. - :type plot_name: str - :param args: List of arguments to remain fixed throughout the benchmark. - :type args: List[str] - :param xlabel: Label for the x axis of the plot. - :type xlabel: str, optional - :param ylabel: Label for the y axis of the plot. - :type ylabel: str, optional - :param x_log: Whether the x axis should be log scale. - :type x_log: bool, optional - :param y_log: Whether the y axis should be log scale. 
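A sketch of `do_bench` with the same quantiles the autotuner requests (median plus the 20th and 80th percentiles); the workload is an arbitrary elementwise op:

import torch
from triton.testing import do_bench

x = torch.randn(1 << 20, device='cuda')
ms, min_ms, max_ms = do_bench(lambda: x * 2, quantiles=[0.5, 0.2, 0.8])
# effective bandwidth: one read and one write of x per call
gbps = 2 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
print(f"median {ms:.3f} ms, ~{gbps:.0f} GB/s")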
- :type y_log: bool, optional - """ - self.x_names = x_names - self.x_vals = x_vals - self.x_log = x_log - self.line_arg = line_arg - self.line_vals = line_vals - self.line_names = line_names - self.y_log = y_log - self.styles = styles - # plot info - self.xlabel = xlabel - self.ylabel = ylabel - self.plot_name = plot_name - self.args = args - - -class Mark: - def __init__(self, fn, benchmarks): - self.fn = fn - self.benchmarks = benchmarks - - def _run(self, bench, save_path, show_plots, print_data): - import os - - import matplotlib.pyplot as plt - import pandas as pd - y_mean = bench.line_names - y_min = [f'{x}-min' for x in bench.line_names] - y_max = [f'{x}-max' for x in bench.line_names] - df = pd.DataFrame(columns=[bench.x_names[0]] + y_mean + y_min + y_max) - for x in bench.x_vals: - x_args = {x_name: x for x_name in bench.x_names} - row_mean, row_min, row_max = [], [], [] - for y in bench.line_vals: - ret = self.fn(**x_args, **{bench.line_arg: y}, **bench.args) - try: - y_mean, y_min, y_max = ret - except TypeError: - y_mean, y_min, y_max = ret, None, None - row_mean += [y_mean] - row_min += [y_min] - row_max += [y_max] - df.loc[len(df)] = [x] + row_mean + row_min + row_max - if bench.plot_name: - plt.figure() - ax = plt.subplot() - x = bench.x_names[0] - for i, y in enumerate(bench.line_names): - y_min, y_max = df[y + '-min'], df[y + '-max'] - col = bench.styles[i][0] if bench.styles else None - sty = bench.styles[i][1] if bench.styles else None - ax.plot(df[x], df[y], label=y, color=col, ls=sty) - if y_min is not None and y_max is not None: - ax.fill_between(df[x], y_min, y_max, alpha=0.15, color=col) - ax.legend() - xlabel = bench.xlabel if bench.xlabel else " = ".join(bench.x_names) - ax.set_xlabel(xlabel) - ax.set_ylabel(bench.ylabel) - # ax.set_title(bench.plot_name) - ax.set_xscale("log" if bench.x_log else "linear") - ax.set_yscale("log" if bench.y_log else "linear") - if show_plots: - plt.show() - if save_path: - plt.savefig(os.path.join(save_path, f"{bench.plot_name}.png")) - df = df[[bench.x_names[0]] + bench.line_names] - if print_data: - print(bench.plot_name + ':') - print(df) - if save_path: - df.to_csv(os.path.join(save_path, f"{bench.plot_name}.csv"), float_format='%.1f', index=False) - - def run(self, show_plots=False, print_data=False, save_path=''): - has_single_bench = isinstance(self.benchmarks, Benchmark) - benchmarks = [self.benchmarks] if has_single_bench else self.benchmarks - if save_path: - html = open(os.path.join(save_path, "results.html"), "w") - html.write("\n") - for bench in benchmarks: - self._run(bench, save_path, show_plots, print_data) - if save_path: - html.write(f"\n") - if save_path: - html.write("\n") - - -def perf_report(benchmarks): - """ - Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value. - - :param benchmarks: Benchmarking configurations. 
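A sketch of tying `Benchmark`, `Mark.run`, and `perf_report` together; `vector_add_bench` and the single 'torch' provider are illustrative:

import torch
import triton.testing

@triton.testing.perf_report(triton.testing.Benchmark(
    x_names=['n_elements'], x_vals=[2**i for i in range(12, 24)],
    line_arg='provider', line_vals=['torch'], line_names=['PyTorch'],
    plot_name='vector-add', args={}, ylabel='GB/s', x_log=True,
))
def vector_add_bench(n_elements, provider):
    x = torch.randn(n_elements, device='cuda')
    y = torch.randn(n_elements, device='cuda')
    ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y,
                                                 quantiles=[0.5, 0.2, 0.8])
    gbps = lambda t: 3 * n_elements * x.element_size() * 1e-9 / (t * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)  # (center, min, max) as Mark._run unpacks

vector_add_bench.run(print_data=True)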
- :type benchmarks: List of :class:`Benchmark` - """ - wrapper = lambda fn: Mark(fn, benchmarks) - return wrapper - - -def get_dram_gbps(backend=None, device=None): - ''' return DRAM bandwidth in GB/s ''' - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - mem_clock_khz = driver.utils.get_device_properties(device)["mem_clock_rate"] # in kHz - bus_width = driver.utils.get_device_properties(device)["mem_bus_width"] - bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8 # In GB/s - return bw_gbps - - -def get_max_tensorcore_tflops(dtype, backend=None, device=None, clock_rate=None): - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 - if not clock_rate: - clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8: - assert dtype == torch.float16 - ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores - else: - if dtype == torch.float32: - ops_per_sub_core = 256 - elif dtype in [torch.float16, torch.bfloat16]: - ops_per_sub_core = 512 - elif dtype == torch.int8: - ops_per_sub_core = 1024 - else: - raise RuntimeError("dtype not supported") - tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 - return tflops - -# create decorator that wraps test function into -# a cuda-memcheck system call - - -def cuda_memcheck(**target_kwargs): - def decorator(test_fn): - @functools.wraps(test_fn) - def wrapper(*args, **kwargs): - import psutil - ppid_name = psutil.Process(os.getppid()).name() - run_cuda_memcheck = target_kwargs.items() <= kwargs.items() - if run_cuda_memcheck and ppid_name != "cuda-memcheck": - path = os.path.realpath(test_fn.__globals__["__file__"]) - # get path of current file - env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"} - assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture" - test_id = kwargs['request'].node.callspec.id - cmd = f"{path}::{test_fn.__name__}[{test_id}]" - out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env) - assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed" - assert "ERROR SUMMARY: 0 errors" in str(out.stdout) - else: - test_fn(*args, **kwargs) - return wrapper - return decorator - - -def nvsmi_attr(attrs): - attrs = ",".join(attrs) - cmd = [ - "nvidia-smi", - "-i", - "0", - "--query-gpu=" + attrs, - "--format=csv,noheader,nounits", - ] - out = subprocess.check_output(cmd) - ret = out.decode(sys.stdout.encoding).split(",") - ret = [int(x) for x in ret] - return ret - - -@contextmanager -def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215): - try: - subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "1"]) - subprocess.check_output( - [ - "nvidia-smi", - "-i", - "0", - f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}", - ] - ) - subprocess.check_output( - [ - "nvidia-smi", - "-i", - "0", - f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}", - ] - ) - cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0] - cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0] - assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz" - assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU SMs 
must run at {ref_mem_clock} MHz" - tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock - gbps = 640 * 2 * ref_mem_clock * 1e-3 - yield tflops, gbps - finally: - subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "0"]) - subprocess.check_output(["nvidia-smi", "-i", "0", "-rgc"]) - subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"]) - - -def get_max_simd_tflops(dtype, backend=None, device=None): - import torch - - from .runtime import driver - if not backend: - backend = _triton.runtime.backend.CUDA - if not device: - device = torch.cuda.current_device() - - num_subcores = driver.utils.get_device_properties(device)["multiprocessor_count"] * 4 - clock_rate = driver.utils.get_device_properties(device)["sm_clock_rate"] # in kHz - capability = torch.cuda.get_device_capability() - if capability[0] < 8: - if dtype == torch.float32: - ops_per_sub_core = 32 # 2*16 - elif dtype == torch.float16: - ops_per_sub_core = 64 - else: - raise RuntimeError("dtype not supported") - else: - if dtype == torch.float32: - ops_per_sub_core = 32 - elif dtype in [torch.float16, torch.bfloat16]: - ops_per_sub_core = 64 - else: - raise RuntimeError("dtype not supported") - tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 - return tflops diff --git a/python/triton/third_party/cuda/bin/ptxas b/python/triton/third_party/cuda/bin/ptxas deleted file mode 100755 index 8b47936ea212..000000000000 Binary files a/python/triton/third_party/cuda/bin/ptxas and /dev/null differ diff --git a/python/triton/third_party/cuda/include/cuda.h b/python/triton/third_party/cuda/include/cuda.h deleted file mode 100755 index c713bf316a16..000000000000 --- a/python/triton/third_party/cuda/include/cuda.h +++ /dev/null @@ -1,19348 +0,0 @@ -/* - * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. 
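A worked instance of the peak-throughput formula used by `get_max_tensorcore_tflops` / `get_max_simd_tflops` above, assuming A100-like figures (108 SMs, ~1410 MHz SM clock, fp16 inputs); the device numbers are illustrative, not queried:

multiprocessor_count = 108       # get_device_properties(...)["multiprocessor_count"]
sm_clock_khz = 1_410_000         # get_device_properties(...)["sm_clock_rate"], in kHz
num_subcores = multiprocessor_count * 4
ops_per_sub_core = 512           # fp16/bf16 tensor-core ops on SM80+, per the table above
tflops = num_subcores * sm_clock_khz * ops_per_sub_core * 1e-9
print(tflops)                    # ~311.9 peak dense fp16 TFLOP/s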
These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#ifndef __cuda_cuda_h__ -#define __cuda_cuda_h__ - - - -#include -#ifdef _MSC_VER -typedef unsigned __int32 cuuint32_t; -typedef unsigned __int64 cuuint64_t; -#else -#include -typedef uint32_t cuuint32_t; -typedef uint64_t cuuint64_t; -#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) -#define __CUDA_DEPRECATED -#elif defined(_MSC_VER) -#define __CUDA_DEPRECATED __declspec(deprecated) -#elif defined(__GNUC__) -#define __CUDA_DEPRECATED __attribute__((deprecated)) -#else -#define __CUDA_DEPRECATED -#endif - -#if defined(CUDA_FORCE_API_VERSION) -#error "CUDA_FORCE_API_VERSION is no longer supported." -#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define __CUDA_API_PER_THREAD_DEFAULT_STREAM - #define __CUDA_API_PTDS(api) api ## _ptds - #define __CUDA_API_PTSZ(api) api ## _ptsz -#else - #define __CUDA_API_PTDS(api) api - #define __CUDA_API_PTSZ(api) api -#endif - -#define cuDeviceTotalMem cuDeviceTotalMem_v2 -#define cuCtxCreate cuCtxCreate_v2 -#define cuCtxCreate_v3 cuCtxCreate_v3 -#define cuModuleGetGlobal cuModuleGetGlobal_v2 -#define cuMemGetInfo cuMemGetInfo_v2 -#define cuMemAlloc cuMemAlloc_v2 -#define cuMemAllocPitch cuMemAllocPitch_v2 -#define cuMemFree cuMemFree_v2 -#define cuMemGetAddressRange cuMemGetAddressRange_v2 -#define cuMemAllocHost cuMemAllocHost_v2 -#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 -#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) -#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) -#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) -#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) -#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) -#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) -#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) -#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) -#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) -#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) -#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) -#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) -#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) -#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) -#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) -#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) -#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) -#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) -#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) -#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) -#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) -#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) -#define cuMemsetD2D16 
__CUDA_API_PTDS(cuMemsetD2D16_v2) -#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) -#define cuArrayCreate cuArrayCreate_v2 -#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 -#define cuArray3DCreate cuArray3DCreate_v2 -#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 -#define cuTexRefSetAddress cuTexRefSetAddress_v2 -#define cuTexRefGetAddress cuTexRefGetAddress_v2 -#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 -#define cuCtxDestroy cuCtxDestroy_v2 -#define cuCtxPopCurrent cuCtxPopCurrent_v2 -#define cuCtxPushCurrent cuCtxPushCurrent_v2 -#define cuStreamDestroy cuStreamDestroy_v2 -#define cuEventDestroy cuEventDestroy_v2 -#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 -#define cuLinkCreate cuLinkCreate_v2 -#define cuLinkAddData cuLinkAddData_v2 -#define cuLinkAddFile cuLinkAddFile_v2 -#define cuMemHostRegister cuMemHostRegister_v2 -#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 -#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) -#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 -#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 -#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 -#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 -#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 -#define cuGraphInstantiate cuGraphInstantiate_v2 - -#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) - #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) - #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) - #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) - #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) - #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) - #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) - - #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) - #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) - #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) - #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) - #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) - #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) - - #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) - #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) - #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) - #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) - #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) - #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) - #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) - #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) - #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) - #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) - #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) - #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) - #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) - #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) - #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) - #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) - - - - #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) - #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) - #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) - - #define cuStreamWriteValue32 
__CUDA_API_PTSZ(cuStreamWriteValue32) - #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) - #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) - #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) - #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) - - #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) - - #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) - #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) - - #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) - #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) - #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) - #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) - #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) - #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) - - #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) - #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) - #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) -#endif - -/** - * \file cuda.h - * \brief Header file for the CUDA Toolkit application programming interface. - * - * \file cudaGL.h - * \brief Header file for the OpenGL interoperability functions of the - * low-level CUDA driver application programming interface. - * - * \file cudaD3D9.h - * \brief Header file for the Direct3D 9 interoperability functions of the - * low-level CUDA driver application programming interface. - */ - -/** - * \defgroup CUDA_TYPES Data types used by CUDA driver - * @{ - */ - -/** - * CUDA API version number - */ -#define CUDA_VERSION 11060 - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * CUDA device pointer - * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. 
- */ -#if defined(_WIN64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr_v2; -#else -typedef unsigned int CUdeviceptr_v2; -#endif -typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ - -typedef int CUdevice_v1; /**< CUDA device */ -typedef CUdevice_v1 CUdevice; /**< CUDA device */ -typedef struct CUctx_st *CUcontext; /**< CUDA context */ -typedef struct CUmod_st *CUmodule; /**< CUDA module */ -typedef struct CUfunc_st *CUfunction; /**< CUDA function */ -typedef struct CUarray_st *CUarray; /**< CUDA array */ -typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ -typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ -typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ -typedef struct CUevent_st *CUevent; /**< CUDA event */ -typedef struct CUstream_st *CUstream; /**< CUDA stream */ -typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ -typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ -typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ -typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ -typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ -typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ -typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ -typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ -typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ -typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ -typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ -typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ - -#ifndef CU_UUID_HAS_BEEN_DEFINED -#define CU_UUID_HAS_BEEN_DEFINED -typedef struct CUuuid_st { /**< CUDA definition of UUID */ - char bytes[16]; -} CUuuid; -#endif - -/** - * CUDA IPC handle size - */ -#define CU_IPC_HANDLE_SIZE 64 - -/** - * CUDA IPC event handle - */ -typedef struct CUipcEventHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcEventHandle_v1; -typedef CUipcEventHandle_v1 CUipcEventHandle; - -/** - * CUDA IPC mem handle - */ -typedef struct CUipcMemHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcMemHandle_v1; -typedef CUipcMemHandle_v1 CUipcMemHandle; - -/** - * CUDA Ipc Mem Flags - */ -typedef enum CUipcMem_flags_enum { - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ -} CUipcMem_flags; - - -/** - * CUDA Mem Attach Flags - */ -typedef enum CUmemAttach_flags_enum { - CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ - CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ - CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ -} CUmemAttach_flags; - -/** - * Context creation flags - */ -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ - CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ - CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ - CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling - 
* \deprecated This flag was deprecated as of CUDA 4.0 - * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ - CU_CTX_SCHED_MASK = 0x07, - CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 - * and it no longer has any effect. All contexts - * as of CUDA 3.2 behave as though the flag is enabled. */ - CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ - CU_CTX_FLAGS_MASK = 0x1f -} CUctx_flags; - -/** - * Stream creation flags - */ -typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ - CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ -} CUstream_flags; - -/** - * Legacy stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with legacy synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_LEGACY ((CUstream)0x1) - -/** - * Per-thread stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with per-thread synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_PER_THREAD ((CUstream)0x2) - -/** - * Event creation flags - */ -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ - CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ - CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ - CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ -} CUevent_flags; - -/** - * Event record flags - */ -typedef enum CUevent_record_flags_enum { - CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ - CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node - * instead of the default behavior. This flag is invalid - * when used outside of capture. */ -} CUevent_record_flags; - -/** - * Event wait flags - */ -typedef enum CUevent_wait_flags_enum { - CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ - CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node - * instead of the default behavior. This flag is invalid - * when used outside of capture.*/ -} CUevent_wait_flags; - -/** - * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 - */ -typedef enum CUstreamWaitValue_flags_enum { - CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit - values). Note this is a cyclic comparison which ignores wraparound. - (Default behavior.) */ - CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ - CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ - CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be - queried with ::cuDeviceGetAttribute() and - ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ - CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This - means that, if a remote write operation is guaranteed to have reached the - device before the wait can be satisfied, that write is guaranteed to be - visible to downstream device work. The device is permitted to reorder - remote writes internally. For example, this flag would be required if - two remote writes arrive in a defined order, the wait is satisfied by the - second write, and downstream work needs to observe the first write. 
- Support for this operation is restricted to selected platforms and can be - queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ -} CUstreamWaitValue_flags; - -/** - * Flags for ::cuStreamWriteValue32 - */ -typedef enum CUstreamWriteValue_flags_enum { - CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ - CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued - before it, as a performance optimization. Normally, - ::cuStreamWriteValue32 will provide a memory fence before the - write, which has similar semantics to - __threadfence_system() but is scoped to the stream - rather than a CUDA thread. */ -} CUstreamWriteValue_flags; - -/** - * Operations for ::cuStreamBatchMemOp - */ -typedef enum CUstreamBatchMemOpType_enum { - CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ - CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ - CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a - standalone operation. */ -} CUstreamBatchMemOpType; - -/** - * Per-operation parameters for ::cuStreamBatchMemOp - */ -typedef union CUstreamBatchMemOpParams_union { - CUstreamBatchMemOpType operation; - struct CUstreamMemOpWaitValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ - } waitValue; - struct CUstreamMemOpWriteValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. 
*/ - } writeValue; - struct CUstreamMemOpFlushRemoteWritesParams_st { - CUstreamBatchMemOpType operation; - unsigned int flags; - } flushRemoteWrites; - cuuint64_t pad[6]; -} CUstreamBatchMemOpParams_v1; -typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; - -/** - * Occupancy calculator flag - */ -typedef enum CUoccupancy_flags_enum { - CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ - CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ -} CUoccupancy_flags; - -/** - * Flags for ::cuStreamUpdateCaptureDependencies - */ -typedef enum CUstreamUpdateCaptureDependencies_flags_enum { - CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ - CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ -} CUstreamUpdateCaptureDependencies_flags; - -/** - * Array formats - */ -typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ - CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ - CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ - CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ - CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ - CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ - CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ - CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ - CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed 
normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ - CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ -} CUarray_format; - -/** - * Texture reference addressing modes - */ -typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ - CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ - CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ - CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ -} CUaddress_mode; - -/** - * Texture reference filtering modes - */ -typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ - CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ -} CUfilter_mode; - -/** - * Device properties - */ -typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. 
*/ - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ - CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. 
*/ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. 
*/ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. 
See \ref CUDA_MEMOP for additional details. */ - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ - CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ - CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ - CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ - CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ - CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ - CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ - CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ - CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ - CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. 
*/ - CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ - - - - - CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ - - CU_DEVICE_ATTRIBUTE_MAX -} CUdevice_attribute; - -/** - * Legacy device properties - */ -typedef struct CUdevprop_st { - int maxThreadsPerBlock; /**< Maximum number of threads per block */ - int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ - int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ - int sharedMemPerBlock; /**< Shared memory available per block in bytes */ - int totalConstantMemory; /**< Constant memory available on device in bytes */ - int SIMDWidth; /**< Warp size in threads */ - int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ - int regsPerBlock; /**< 32-bit registers available per block */ - int clockRate; /**< Clock frequency in kilohertz */ - int textureAlign; /**< Alignment requirement for textures */ -} CUdevprop_v1; -typedef CUdevprop_v1 CUdevprop; - -/** - * Pointer information - */ -typedef enum CUpointer_attribute_enum { - CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ - CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ - CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ - CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ - CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ - CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ - CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ - CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ - CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ - CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ -} CUpointer_attribute; - -/** - * Function properties - */ -typedef enum CUfunction_attribute_enum { - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. 
This number depends on both the function and the - * device on which the function is currently loaded. - */ - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - - /** - * The size in bytes of statically-allocated shared memory required by - * this function. This does not include dynamically-allocated shared - * memory requested by the user at runtime. - */ - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - - /** - * The size in bytes of user-allocated constant memory required by this - * function. - */ - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - - /** - * The size in bytes of local memory used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - - /** - * The number of registers used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, - - /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. - * Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - */ - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. Note that - * this will return a value of 10 for legacy cubins that do not have a - * properly-encoded binary architecture version. - */ - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - - /** - * The attribute to indicate whether the function has been compiled with - * user specified option "-Xptxas --dlcm=ca" set . - */ - CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, - - /** - * The maximum size in bytes of dynamically-allocated shared memory that can be used by - * this function. If the user-specified dynamic shared memory size is larger than this - * value, the launch will fail. - * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, - - /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total shared memory. - * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. - * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
- * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - CU_FUNC_ATTRIBUTE_MAX -} CUfunction_attribute; - -/** - * Function cache configurations - */ -typedef enum CUfunc_cache_enum { - CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ - CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ - CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ - CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ -} CUfunc_cache; - -/** - * Shared memory configurations - */ -typedef enum CUsharedconfig_enum { - CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ - CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ - CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ -} CUsharedconfig; - -/** - * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute - */ -typedef enum CUshared_carveout_enum { - CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ - CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ - CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ -} CUshared_carveout; - -/** - * Memory types - */ -typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ - CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ - CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ - CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ -} CUmemorytype; - -/** - * Compute Modes - */ -typedef enum CUcomputemode_enum { - CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ - CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ - CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ -} CUcomputemode; - -/** - * Memory advise values - */ -typedef enum CUmem_advise_enum { - CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */ - CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ - CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ - CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ - CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ -} CUmem_advise; - -typedef enum CUmem_range_attribute_enum { - CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */ - CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ - CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ - 
CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ -} CUmem_range_attribute; - -/** - * Online compiler and linker options - */ -typedef enum CUjit_option_enum -{ - /** - * Max number of registers that a thread may use.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_MAX_REGISTERS = 0, - - /** - * IN: Specifies minimum number of threads per block to target compilation - * for\n - * OUT: Returns the number of threads the compiler actually targeted. - * This restricts the resource utilization fo the compiler (e.g. max - * registers) such that a block with the given number of threads should be - * able to launch based on register limitations. Note, this option does not - * currently take into account any other resource limitations, such as - * shared memory utilization.\n - * Cannot be combined with ::CU_JIT_TARGET.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_THREADS_PER_BLOCK, - - /** - * Overwrites the option value with the total wall clock time, in - * milliseconds, spent in the compiler and linker\n - * Option type: float\n - * Applies to: compiler and linker - */ - CU_JIT_WALL_TIME, - - /** - * Pointer to a buffer in which to print any log messages - * that are informational in nature (the buffer size is specified via - * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - - /** - * Pointer to a buffer in which to print any log messages that - * reflect errors (the buffer size is specified via option - * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - - /** - * Level of optimizations to apply to generated code (0 - 4), with 4 - * being the default and highest level of optimizations.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_OPTIMIZATION_LEVEL, - - /** - * No option value required. Determines the target based on the current - * attached context (default)\n - * Option type: No option value needed\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET_FROM_CUCONTEXT, - - /** - * Target is chosen based on supplied ::CUjit_target. Cannot be - * combined with ::CU_JIT_THREADS_PER_BLOCK.\n - * Option type: unsigned int for enumerated type ::CUjit_target\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET, - - /** - * Specifies choice of fallback strategy if matching cubin is not found. - * Choice is based on supplied ::CUjit_fallback. 
This option cannot be - * used with cuLink* APIs as the linker requires exact matches.\n - * Option type: unsigned int for enumerated type ::CUjit_fallback\n - * Applies to: compiler only - */ - CU_JIT_FALLBACK_STRATEGY, - - /** - * Specifies whether to create debug information in output (-g) - * (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_GENERATE_DEBUG_INFO, - - /** - * Generate verbose log messages (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LOG_VERBOSE, - - /** - * Generate line number information (-lineinfo) (0: false, default)\n - * Option type: int\n - * Applies to: compiler only - */ - CU_JIT_GENERATE_LINE_INFO, - - /** - * Specifies whether to enable caching explicitly (-dlcm) \n - * Choice is based on supplied ::CUjit_cacheMode_enum.\n - * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n - * Applies to: compiler only - */ - CU_JIT_CACHE_MODE, - - /** - * The below jit options are used for internal purposes only, in this version of CUDA - */ - CU_JIT_NEW_SM3X_OPT, - CU_JIT_FAST_COMPILE, - - /** - * Array of device symbol names that will be relocated to the corresponing - * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * When loding a device module, driver will relocate all encountered - * unresolved symbols to the host addresses.\n - * It is only allowed to register symbols that correspond to unresolved - * global variables.\n - * It is illegal to register the same device symbol at multiple addresses.\n - * Option type: const char **\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_NAMES, - - /** - * Array of host addresses that will be used to relocate corresponding - * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * Option type: void **\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_ADDRESSES, - - /** - * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and - * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n - * Option type: unsigned int\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_COUNT, - - /** - * Enable link-time optimization (-dlto) for device code (0: false, default).\n - * This option is not supported on 32-bit platforms.\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LTO, - - /** - * Control single-precision denormals (-ftz) support (0: false, default). - * 1 : flushes denormal values to zero - * 0 : preserves denormal values - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FTZ, - - /** - * Control single-precision floating-point division and reciprocals - * (-prec-div) support (1: true, default). - * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_DIV, - - /** - * Control single-precision floating-point square root - * (-prec-sqrt) support (1: true, default). 
- * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_SQRT, - - /** - * Enable/Disable the contraction of floating-point multiplies - * and adds/subtracts into floating-point multiply-add (-fma) - * operations (1: Enable, default; 0: Disable). - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FMA, - - CU_JIT_NUM_OPTIONS - -} CUjit_option; - -/** - * Online compilation targets - */ -typedef enum CUjit_target_enum -{ - - CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ - CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ - - - CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ - CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ - CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ - CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ - - - CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ - CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ - CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ - - - CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ - CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ - CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - - - CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ - CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ - - CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ - - CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ - CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ - -} CUjit_target; - -/** - * Cubin matching fallback strategies - */ -typedef enum CUjit_fallback_enum -{ - CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ - - CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ - -} CUjit_fallback; - -/** - * Caching modes for dlcm - */ -typedef enum CUjit_cacheMode_enum -{ - CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ - CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ - CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ -} CUjit_cacheMode; - -/** - * Device code formats - */ -typedef enum CUjitInputType_enum -{ - /** - * Compiled device-class-specific device code\n - * Applicable options: none - */ - CU_JIT_INPUT_CUBIN = 0, - - /** - * PTX source code\n - * Applicable options: PTX compiler options - */ - CU_JIT_INPUT_PTX, - - /** - * Bundle of multiple cubins and/or PTX of some device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_FATBINARY, - - /** - * Host object with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_OBJECT, - - /** - * Archive of host objects with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_LIBRARY, - - /** - * High-level intermediate code for link-time optimization\n - * Applicable options: NVVM compiler options, PTX compiler options - */ - CU_JIT_INPUT_NVVM, - - CU_JIT_NUM_INPUT_TYPES -} CUjitInputType; - -typedef struct CUlinkState_st *CUlinkState; - -/** - * Flags to register a graphics resource - */ -typedef enum CUgraphicsRegisterFlags_enum { - CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, - CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, - 
CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, - CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, - CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 -} CUgraphicsRegisterFlags; - -/** - * Flags for mapping and unmapping interop resources - */ -typedef enum CUgraphicsMapResourceFlags_enum { - CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 -} CUgraphicsMapResourceFlags; - -/** - * Array indices for cube faces - */ -typedef enum CUarray_cubemap_face_enum { - CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ -} CUarray_cubemap_face; - -/** - * Limits - */ -typedef enum CUlimit_enum { - CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ - CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ - CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ - CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ - CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ - CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ - CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ - CU_LIMIT_MAX -} CUlimit; - -/** - * Resource types - */ -typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ -} CUresourcetype; - -#ifdef _WIN32 -#define CUDA_CB __stdcall -#else -#define CUDA_CB -#endif - -/** - * CUDA host function - * \param userData Argument value passed to the function - */ -typedef void (CUDA_CB *CUhostFn)(void *userData); - -/** - * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. - */ -typedef enum CUaccessProperty_enum { - CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ - CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. */ - CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ -} CUaccessProperty; - -/** - * Specifies an access policy for a window, a contiguous extent of memory - * beginning at base_ptr and ending at base_ptr + num_bytes. - * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. - * Partition into many segments and assign segments such that: - * sum of "hit segments" / window == approx. ratio. - * sum of "miss segments" / window == approx 1-ratio. - * Segments and ratio specifications are fitted to the capabilities of - * the architecture. - * Accesses in a hit segment apply the hitProp access policy. - * Accesses in a miss segment apply the missProp access policy. - */ -typedef struct CUaccessPolicyWindow_st { - void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ - size_t num_bytes; /**< Size in bytes of the window policy. 
CUDA driver may restrict the maximum size and alignment. */ - float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ - CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ - CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ -} CUaccessPolicyWindow_v1; -typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; - -/** - * GPU kernel node parameters - */ -typedef struct CUDA_KERNEL_NODE_PARAMS_st { - CUfunction func; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - void **kernelParams; /**< Array of pointers to kernel parameters */ - void **extra; /**< Extra options */ -} CUDA_KERNEL_NODE_PARAMS_v1; -typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; - -/** - * Memset node parameters - */ -typedef struct CUDA_MEMSET_NODE_PARAMS_st { - CUdeviceptr dst; /**< Destination device pointer */ - size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ - unsigned int value; /**< Value to be set */ - unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ - size_t width; /**< Width of the row in elements */ - size_t height; /**< Number of rows */ -} CUDA_MEMSET_NODE_PARAMS_v1; -typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; - -/** - * Host node parameters - */ -typedef struct CUDA_HOST_NODE_PARAMS_st { - CUhostFn fn; /**< The function to call when the node executes */ - void* userData; /**< Argument to pass to the function */ -} CUDA_HOST_NODE_PARAMS_v1; -typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; - -/** - * Graph node types - */ -typedef enum CUgraphNodeType_enum { - CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ - CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ - CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ - CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ - CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ - CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ - CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ - CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ - CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ - CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ -} CUgraphNodeType; - -typedef enum CUsynchronizationPolicy_enum { - CU_SYNC_POLICY_AUTO = 1, - CU_SYNC_POLICY_SPIN = 2, - CU_SYNC_POLICY_YIELD = 3, - CU_SYNC_POLICY_BLOCKING_SYNC = 4 -} CUsynchronizationPolicy; - -/** - * Graph kernel node Attributes - */ -typedef enum CUkernelNodeAttrID_enum { - CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ - CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). 
*/ -} CUkernelNodeAttrID; - -/** - * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute - */ -typedef union CUkernelNodeAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ -} CUkernelNodeAttrValue_v1; -typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; - -/** - * Possible stream capture statuses returned by ::cuStreamIsCapturing - */ -typedef enum CUstreamCaptureStatus_enum { - CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ - CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ - CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that - has been invalidated, but not terminated */ -} CUstreamCaptureStatus; - -/** - * Possible modes for stream capture thread interactions. For more details see - * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode - */ -typedef enum CUstreamCaptureMode_enum { - CU_STREAM_CAPTURE_MODE_GLOBAL = 0, - CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, - CU_STREAM_CAPTURE_MODE_RELAXED = 2 -} CUstreamCaptureMode; - -/** - * Stream Attributes - */ -typedef enum CUstreamAttrID_enum { - CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ - CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< ::CUsynchronizationPolicy for work queued up in this stream */ -} CUstreamAttrID; - -/** - * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute - */ -typedef union CUstreamAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ -} CUstreamAttrValue_v1; -typedef CUstreamAttrValue_v1 CUstreamAttrValue; - -/** - * Flags to specify search options. For more details see ::cuGetProcAddress - */ -typedef enum CUdriverProcAddress_flags_enum { - CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ - CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ -} CUdriverProcAddress_flags; - -/** - * Execution Affinity Types - */ -typedef enum CUexecAffinityType_enum { - CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ - CU_EXEC_AFFINITY_TYPE_MAX -} CUexecAffinityType; - -/** - * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT - */ -typedef struct CUexecAffinitySmCount_st { - unsigned int val; /**< The number of SMs the context is limited to use. */ -} CUexecAffinitySmCount_v1; -typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; - -/** - * Execution Affinity Parameters - */ -typedef struct CUexecAffinityParam_st { - CUexecAffinityType type; - union { - CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ - } param; -} CUexecAffinityParam_v1; -typedef CUexecAffinityParam_v1 CUexecAffinityParam; - -/** - * Error codes - */ -typedef enum cudaError_enum { - /** - * The API call returned with no errors. In the case of query calls, this - * also means that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). 
- */ - CUDA_SUCCESS = 0, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - CUDA_ERROR_INVALID_VALUE = 1, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - CUDA_ERROR_OUT_OF_MEMORY = 2, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. - */ - CUDA_ERROR_NOT_INITIALIZED = 3, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - CUDA_ERROR_DEINITIALIZED = 4, - - /** - * This indicates profiler is not initialized for this run. This can - * happen when the application is running with external profiling tools - * like visual profiler. - */ - CUDA_ERROR_PROFILER_DISABLED = 5, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to attempt to enable/disable the profiling via ::cuProfilerStart or - * ::cuProfilerStop without initialization. - */ - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStart() when profiling is already enabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStop() when profiling is already disabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - - /** - * This indicates that the CUDA driver that the application has loaded is a - * stub library. Applications that run with the stub rather than a real - * driver loaded will result in CUDA API returning this error. - */ - CUDA_ERROR_STUB_LIBRARY = 34, - - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - CUDA_ERROR_NO_DEVICE = 100, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device or that the action requested is - * invalid for the specified device. - */ - CUDA_ERROR_INVALID_DEVICE = 101, - - /** - * This error indicates that the Grid license is not applied. - */ - CUDA_ERROR_DEVICE_NOT_LICENSED = 102, - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - CUDA_ERROR_INVALID_IMAGE = 200, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - CUDA_ERROR_INVALID_CONTEXT = 201, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - - /** - * This indicates that a map or register operation has failed. - */ - CUDA_ERROR_MAP_FAILED = 205, - - /** - * This indicates that an unmap or unregister operation has failed. 
- */ - CUDA_ERROR_UNMAP_FAILED = 206, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - - /** - * This indicates that the resource is already mapped. - */ - CUDA_ERROR_ALREADY_MAPPED = 208, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - - /** - * This indicates that a resource has already been acquired. - */ - CUDA_ERROR_ALREADY_ACQUIRED = 210, - - /** - * This indicates that a resource is not mapped. - */ - CUDA_ERROR_NOT_MAPPED = 211, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, - - /** - * This indicates that a PTX JIT compilation failed. - */ - CUDA_ERROR_INVALID_PTX = 218, - - /** - * This indicates an error with OpenGL or DirectX context. - */ - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, - - /** - * This indicates that an uncorrectable NVLink error was detected during the - * execution. - */ - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, - - /** - * This indicates that the PTX JIT compiler library was not found. - */ - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, - - /** - * This indicates that the provided PTX was compiled with an unsupported toolchain. - */ - - CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, - - /** - * This indicates that the PTX JIT compilation was disabled. - */ - CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, - - /** - * This indicates that the ::CUexecAffinityType passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, - - /** - * This indicates that the device kernel source is invalid. This includes - * compilation/linker errors encountered in device code or user error. - */ - CUDA_ERROR_INVALID_SOURCE = 300, - - /** - * This indicates that the file specified was not found. - */ - CUDA_ERROR_FILE_NOT_FOUND = 301, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - - /** - * This indicates that initialization of a shared object failed. - */ - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - - /** - * This indicates that an OS call failed. - */ - CUDA_ERROR_OPERATING_SYSTEM = 304, - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. 
- */ - CUDA_ERROR_INVALID_HANDLE = 400, - - /** - * This indicates that a resource required by the API call is not in a - * valid state to perform the requested operation. - */ - CUDA_ERROR_ILLEGAL_STATE = 401, - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, driver function names, texture names, - * and surface names. - */ - CUDA_ERROR_NOT_FOUND = 500, - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - CUDA_ERROR_NOT_READY = 600, - - /** - * While executing a kernel, the device encountered a - * load or store instruction on an invalid memory address. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_ADDRESS = 700, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_TIMEOUT = 702, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy, or is a primary context which - * has not yet been initialized. - */ - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. 
- */ - CUDA_ERROR_ASSERT = 710, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_TOO_MANY_PEERS = 711, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, - - /** - * While executing a kernel, the device encountered a stack error. - * This can be due to stack corruption or exceeding the stack size limit. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_HARDWARE_STACK_ERROR = 714, - - /** - * While executing a kernel, the device encountered an illegal instruction. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, - - /** - * While executing a kernel, the device encountered a load or store instruction - * on a memory address which is not aligned. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_MISALIGNED_ADDRESS = 716, - - /** - * While executing a kernel, the device encountered an instruction - * which can only operate on memory locations in certain address spaces - * (global, shared, or local), but was supplied a memory address not - * belonging to an allowed address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, - - /** - * While executing a kernel, the device program counter wrapped its address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_PC = 718, - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. Less common cases can be system specific - more - * information about these cases can be found in the system specific user guide. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_FAILED = 719, - - /** - * This error indicates that the number of blocks launched per grid for a kernel that was - * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice - * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor - * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. 
- */ - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, - - /** - * This error indicates that the attempted operation is not permitted. - */ - CUDA_ERROR_NOT_PERMITTED = 800, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - CUDA_ERROR_NOT_SUPPORTED = 801, - - /** - * This error indicates that the system is not yet ready to start any CUDA - * work. To continue using CUDA, verify the system configuration is in a - * valid state and all required driver daemons are actively running. - * More information about this error can be found in the system specific - * user guide. - */ - CUDA_ERROR_SYSTEM_NOT_READY = 802, - - /** - * This error indicates that there is a mismatch between the versions of - * the display driver and the CUDA driver. Refer to the compatibility documentation - * for supported versions. - */ - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, - - /** - * This error indicates that the system was upgraded to run with forward compatibility - * but the visible hardware detected by CUDA does not support this configuration. - * Refer to the compatibility documentation for the supported hardware matrix or ensure - * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES - * environment variable. - */ - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, - - /** - * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. - */ - CUDA_ERROR_MPS_CONNECTION_FAILED = 805, - - /** - * This error indicates that the remote procedural call between the MPS server and the MPS client failed. - */ - CUDA_ERROR_MPS_RPC_FAILURE = 806, - - /** - * This error indicates that the MPS server is not ready to accept new MPS client requests. - * This error can be returned when the MPS server is in the process of recovering from a fatal failure. - */ - CUDA_ERROR_MPS_SERVER_NOT_READY = 807, - - /** - * This error indicates that the hardware resources required to create MPS client have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, - - /** - * This error indicates the the hardware resources required to support device connections have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, - - /** - * This error indicates that the operation is not permitted when - * the stream is capturing. - */ - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, - - /** - * This error indicates that the current capture sequence on the stream - * has been invalidated due to a previous error. - */ - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, - - /** - * This error indicates that the operation would have resulted in a merge - * of two independent capture sequences. - */ - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, - - /** - * This error indicates that the capture was not initiated in this stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, - - /** - * This error indicates that the capture sequence contains a fork that was - * not joined to the primary stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, - - /** - * This error indicates that a dependency would have been created which - * crosses the capture sequence boundary. Only implicit in-stream ordering - * dependencies are allowed to cross the boundary. - */ - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, - - /** - * This error indicates a disallowed implicit dependency on a current capture - * sequence from cudaStreamLegacy. 
- */ - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, - - /** - * This error indicates that the operation is not permitted on an event which - * was last recorded in a capturing stream. - */ - CUDA_ERROR_CAPTURED_EVENT = 907, - - /** - * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED - * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a - * different thread. - */ - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, - - /** - * This error indicates that the timeout specified for the wait operation has lapsed. - */ - CUDA_ERROR_TIMEOUT = 909, - - /** - * This error indicates that the graph update was not performed because it included - * changes which violated constraints specific to instantiated graph update. - */ - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, - - /** - * This indicates that an async error has occurred in a device outside of CUDA. - * If CUDA was waiting for an external device's signal before consuming shared data, - * the external device signaled an error indicating that the data is not valid for - * consumption. This leaves the process in an inconsistent state and any further CUDA - * work will return the same error. To continue using CUDA, the process must be - * terminated and relaunched. - */ - CUDA_ERROR_EXTERNAL_DEVICE = 911, - - - - - - - - - /** - * This indicates that an unknown internal error has occurred. - */ - CUDA_ERROR_UNKNOWN = 999 -} CUresult; - -/** - * P2P Attributes - */ -typedef enum CUdevice_P2PAttribute_enum { - CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ - CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ - CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ -} CUdevice_P2PAttribute; - - - - - - - - - - - - -/** - * CUDA stream callback - * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. - * \param status ::CUDA_SUCCESS or any persistent error on the stream. - * \param userData User parameter provided at registration. - */ -typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); - -/** - * Block size to per-block dynamic shared memory mapping for a certain - * kernel \param blockSize Block size of the kernel. - * - * \return The dynamic shared memory needed by a block. - */ -typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); - -/** - * If set, host memory is portable between CUDA contexts. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 - -/** - * If set, host memory is allocated as write-combined - fast to write, - * faster to DMA, slow to read except via SSE4 streaming load instruction - * (MOVNTDQA). - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 - -/** - * If set, host memory is portable between CUDA contexts. 
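A common way to consume CUresult in application code (a sketch, not part of the header) is a check macro built on cuGetErrorName()/cuGetErrorString(), together with treating ::CUDA_ERROR_NOT_READY from the query calls as "still running" rather than as a failure:

#include <stdio.h>
#include <cuda.h>

/* Report the symbolic name and description of a failing call and propagate it.
 * Intended for use inside functions that themselves return CUresult. */
#define CU_CHECK(call)                                                     \
    do {                                                                   \
        CUresult _e = (call);                                              \
        if (_e != CUDA_SUCCESS) {                                          \
            const char *name = NULL, *desc = NULL;                         \
            cuGetErrorName(_e, &name);                                     \
            cuGetErrorString(_e, &desc);                                   \
            fprintf(stderr, "%s failed: %s (%s)\n", #call,                 \
                    name ? name : "?", desc ? desc : "?");                 \
            return _e;                                                     \
        }                                                                  \
    } while (0)

/* CUDA_ERROR_NOT_READY from cuStreamQuery()/cuEventQuery() is not a failure;
 * it only means the queried work has not completed yet. */
static int stream_is_done(CUstream s) {
    return cuStreamQuery(s) == CUDA_SUCCESS;
}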
- * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 - -/** - * If set, the passed memory pointer is treated as pointing to some - * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. - * On Windows the flag is a no-op. - * On Linux that memory is marked as non cache-coherent for the GPU and - * is expected to be physically contiguous. It may return - * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, - * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. - * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED - * is returned. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 - -/** -* If set, the passed memory pointer is treated as pointing to memory that is -* considered read-only by the device. On platforms without -* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is -* required in order to register memory mapped to the CPU as read-only. Support -* for the use of this flag can be queried from the device attribute -* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with -* a current context associated with a device that does not have this attribute -* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. -*/ -#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 - -/** - * 2D memory copy parameters - */ -typedef struct CUDA_MEMCPY2D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - - size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ - size_t Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D_v2; -typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; - -/** - * 3D memory copy parameters - */ -typedef struct CUDA_MEMCPY3D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, 
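Putting the host-allocation flags and ::CUDA_MEMCPY2D together, a sketch of a host-to-device pitched copy (assumes `dDst`/`dstPitch` came from cuMemAllocPitch(); error paths shortened):

#include <string.h>
#include <cuda.h>

/* Copy `height` rows of `widthBytes` each from freshly allocated, mapped host
 * memory into a pitched device allocation. */
static CUresult copy_rows_h2d(CUdeviceptr dDst, size_t dstPitch,
                              size_t widthBytes, size_t height) {
    void *hSrc = NULL;
    CUresult err = cuMemHostAlloc(&hSrc, widthBytes * height,
                                  CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
    if (err != CUDA_SUCCESS) return err;

    /* ... fill hSrc ... */

    CUDA_MEMCPY2D c;
    memset(&c, 0, sizeof(c));
    c.srcMemoryType = CU_MEMORYTYPE_HOST;
    c.srcHost       = hSrc;
    c.srcPitch      = widthBytes;        /* tightly packed host rows */
    c.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    c.dstDevice     = dDst;
    c.dstPitch      = dstPitch;
    c.WidthInBytes  = widthBytes;
    c.Height        = height;

    err = cuMemcpy2D(&c);
    cuMemFreeHost(hSrc);
    return err;
}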
array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_v2; -typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; - -/** - * 3D memory cross-context copy parameters - */ -typedef struct CUDA_MEMCPY3D_PEER_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_PEER_v1; -typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; - -/** - * Array descriptor - */ -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - size_t Width; /**< Width of array */ - size_t Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR_v2; -typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; - -/** - * 3D array descriptor - */ -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - size_t Width; /**< Width of 3D array */ - size_t Height; /**< Height of 3D array */ - size_t Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR_v2; -typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; - -/** - * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - */ -#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 - -/** - * CUDA array sparse properties - */ -typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { - struct { - unsigned int width; /**< Width of sparse tile in elements */ - unsigned int height; /**< Height of sparse tile in elements */ - unsigned int depth; /**< Depth of sparse 
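A short sketch of feeding ::CUDA_ARRAY3D_DESCRIPTOR to cuArray3DCreate() (single-channel float array, no special flags):

#include <cuda.h>

/* Allocate a single-channel float 3-D CUDA array described by
 * CUDA_ARRAY3D_DESCRIPTOR. */
static CUresult make_3d_array(size_t w, size_t h, size_t d, CUarray *out) {
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Width       = w;
    desc.Height      = h;
    desc.Depth       = d;
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Flags       = 0;
    return cuArray3DCreate(out, &desc);
}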
tile in elements */ - } tileExtent; - - /** - * First mip level at which the mip tail begins. - */ - unsigned int miptailFirstLevel; - /** - * Total size of the mip tail. - */ - unsigned long long miptailSize; - /** - * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL - */ - unsigned int flags; - unsigned int reserved[4]; -} CUDA_ARRAY_SPARSE_PROPERTIES_v1; -typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; - - -/** - * CUDA array memory requirements - */ -typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { - size_t size; /**< Total required memory size */ - size_t alignment; /**< alignment requirement */ - unsigned int reserved[4]; -} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; -typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; - - -/** - * CUDA Resource descriptor - */ -typedef struct CUDA_RESOURCE_DESC_st -{ - CUresourcetype resType; /**< Resource type */ - - union { - struct { - CUarray hArray; /**< CUDA array */ - } array; - struct { - CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ - } mipmap; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - struct { - int reserved[32]; - } reserved; - } res; - - unsigned int flags; /**< Flags (must be zero) */ -} CUDA_RESOURCE_DESC_v1; -typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; - -/** - * Texture descriptor - */ -typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; /**< Address modes */ - CUfilter_mode filterMode; /**< Filter mode */ - unsigned int flags; /**< Flags */ - unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ - CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ - float mipmapLevelBias; /**< Mipmap level bias */ - float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ - float borderColor[4]; /**< Border Color */ - int reserved[12]; -} CUDA_TEXTURE_DESC_v1; -typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; - -/** - * Resource view format - */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit 
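A sketch of combining ::CUDA_RESOURCE_DESC and ::CUDA_TEXTURE_DESC into a texture object via cuTexObjectCreate() (assumes `arr` is an existing CUarray; no resource view is supplied, so the array's own format is used):

#include <string.h>
#include <cuda.h>

/* Create a texture object over an existing CUDA array with clamped
 * addressing, linear filtering and normalized coordinates. */
static CUresult make_texture(CUarray arr, CUtexObject *texOut) {
    CUDA_RESOURCE_DESC res;
    memset(&res, 0, sizeof(res));
    res.resType          = CU_RESOURCE_TYPE_ARRAY;
    res.res.array.hArray = arr;

    CUDA_TEXTURE_DESC tex;
    memset(&tex, 0, sizeof(tex));
    tex.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;
    tex.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    tex.flags          = CU_TRSF_NORMALIZED_COORDINATES;

    return cuTexObjectCreate(texOut, &res, &tex, /*pResViewDesc=*/NULL);
}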
integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ -} CUresourceViewFormat; - -/** - * Resource view descriptor - */ -typedef struct CUDA_RESOURCE_VIEW_DESC_st -{ - CUresourceViewFormat format; /**< Resource view format */ - size_t width; /**< Width of the resource view */ - size_t height; /**< Height of the resource view */ - size_t depth; /**< Depth of the resource view */ - unsigned int firstMipmapLevel; /**< First defined mipmap level */ - unsigned int lastMipmapLevel; /**< Last defined mipmap level */ - unsigned int firstLayer; /**< First layer index */ - unsigned int lastLayer; /**< Last layer index */ - unsigned int reserved[16]; -} CUDA_RESOURCE_VIEW_DESC_v1; -typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; - -/** - * GPU Direct v3 tokens - */ -typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { - unsigned long long p2pToken; - unsigned int vaSpaceToken; -} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; -typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; - -/** -* Access flags that specify the level of access the current context's device has -* on the memory referenced. -*/ -typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. 
*/ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ -} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; - -/** - * Kernel launch parameters - */ -typedef struct CUDA_LAUNCH_PARAMS_st { - CUfunction function; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - CUstream hStream; /**< Stream identifier */ - void **kernelParams; /**< Array of pointers to kernel parameters */ -} CUDA_LAUNCH_PARAMS_v1; -typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; - -/** - * External memory handle types - */ -typedef enum CUexternalMemoryHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is an opaque shared NT handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a D3D12 heap object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - /** - * Handle is a D3D12 committed resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - /** - * Handle is a shared NT handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - /** - * Handle is a globally shared handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - /** - * Handle is an NvSciBuf object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 -} CUexternalMemoryHandleType; - -/** - * Indicates that the external memory object is a dedicated resource - */ -#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS - * contains this flag, it indicates that signaling an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. - */ -#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS - * contains this flag, it indicates that waiting on an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. - */ -#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs signaler specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. 
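A sketch of how ::CUDA_LAUNCH_PARAMS is consumed by cuLaunchCooperativeKernelMultiDevice() for a two-device cooperative launch. The entry point is deprecated in recent toolkits in favor of per-device cuLaunchCooperativeKernel() calls; `f`, `s`, and `args` hold one function handle, stream, and argument list per device and are assumed to exist:

#include <string.h>
#include <cuda.h>

/* Fill one CUDA_LAUNCH_PARAMS entry per device and launch the same kernel
 * cooperatively on both. f[i], s[i] and args[i] must belong to device i's
 * context, and the kernel binaries must be identical. */
static CUresult coop_launch_2dev(CUfunction f[2], CUstream s[2], void **args[2]) {
    CUDA_LAUNCH_PARAMS lp[2];
    memset(lp, 0, sizeof(lp));
    for (int i = 0; i < 2; ++i) {
        lp[i].function       = f[i];
        lp[i].gridDimX       = 64;
        lp[i].gridDimY       = 1;
        lp[i].gridDimZ       = 1;
        lp[i].blockDimX      = 256;
        lp[i].blockDimY      = 1;
        lp[i].blockDimZ      = 1;
        lp[i].sharedMemBytes = 0;
        lp[i].hStream        = s[i];
        lp[i].kernelParams   = args[i];
    }
    return cuLaunchCooperativeKernelMultiDevice(lp, 2, /*flags=*/0);
}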
- */ -#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs waiter specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. - */ -#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 -/** - * External memory handle descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalMemoryHandleType type; - union { - /** - * File descriptor referencing the memory object. Valid - * when type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid memory object. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * A handle representing an NvSciBuf Object. Valid when type - * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF - */ - const void *nvSciBufObject; - } handle; - /** - * Size of the memory allocation - */ - unsigned long long size; - /** - * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - -/** - * External memory buffer descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - /** - * Offset into the memory object where the buffer's base is - */ - unsigned long long offset; - /** - * Size of the buffer - */ - unsigned long long size; - /** - * Flags reserved for future use. Must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - -/** - * External memory mipmap descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - /** - * Offset into the memory object where the base level of the - * mipmap chain is. 
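A sketch of importing an allocation exported by another API as an opaque POSIX file descriptor and mapping it as a device pointer, using the two descriptors above (error handling trimmed):

#include <string.h>
#include <cuda.h>

/* Import an allocation exported as an opaque POSIX fd and map `size` bytes of
 * it as a device pointer. On success the driver owns the fd. */
static CUresult import_fd_buffer(int fd, unsigned long long size,
                                 CUexternalMemory *extMemOut, CUdeviceptr *dptrOut) {
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC h;
    memset(&h, 0, sizeof(h));
    h.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    h.handle.fd = fd;
    h.size      = size;

    CUresult err = cuImportExternalMemory(extMemOut, &h);
    if (err != CUDA_SUCCESS) return err;

    CUDA_EXTERNAL_MEMORY_BUFFER_DESC b;
    memset(&b, 0, sizeof(b));
    b.offset = 0;
    b.size   = size;
    return cuExternalMemoryGetMappedBuffer(dptrOut, *extMemOut, &b);
}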
- */ - unsigned long long offset; - /** - * Format, dimension and type of base level of the mipmap chain - */ - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - /** - * Total number of levels in the mipmap chain - */ - unsigned int numLevels; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - -/** - * External semaphore handle types - */ -typedef enum CUexternalSemaphoreHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is an opaque shared NT handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a shared NT handle referencing a D3D12 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - /** - * Handle is a shared NT handle referencing a D3D11 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - /** - * Opaque handle to NvSciSync Object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - /** - * Handle is a shared NT handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - /** - * Handle is a globally shared handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - /** - * Handle is an opaque file descriptor referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - /** - * Handle is an opaque shared NT handle referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 -} CUexternalSemaphoreHandleType; - -/** - * External semaphore handle descriptor - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalSemaphoreHandleType type; - union { - /** - * File descriptor referencing the semaphore object. Valid - * when type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid synchronization primitive. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * Valid NvSciSyncObj. Must be non NULL - */ - const void* nvSciSyncObj; - } handle; - /** - * Flags reserved for the future. Must be zero. 
- */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - -/** - * External semaphore signal parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be signaled - */ - unsigned long long value; - } fence; - union { - /** - * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType - * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. - */ - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to release the mutex with - */ - unsigned long long key; - } keyedMutex; - unsigned int reserved[12]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to - * signal a ::CUexternalSemaphore of type - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates - * that while signaling the ::CUexternalSemaphore, no memory synchronization - * operations should be performed for any external memory object imported - * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; - -/** - * External semaphore wait parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be waited on - */ - unsigned long long value; - } fence; - /** - * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType - * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. - */ - union { - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to acquire the mutex with - */ - unsigned long long key; - /** - * Timeout in milliseconds to wait to acquire the mutex - */ - unsigned int timeoutMs; - } keyedMutex; - unsigned int reserved[10]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * which indicates that while waiting for the ::CUexternalSemaphore, no memory - * synchronization operations should be performed for any external memory - * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; - -/** - * Semaphore signal node parameters - */ -typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
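A sketch of the matching signal/wait calls for the parameter structures above (assumes `extSem` was created with cuImportExternalSemaphore() from a fence-style handle):

#include <string.h>
#include <cuda.h>

/* Queue a signal of `value` on an imported fence/timeline semaphore, then a
 * wait for that same value, both ordered on `stream`. */
static CUresult signal_then_wait(CUexternalSemaphore extSem,
                                 unsigned long long value, CUstream stream) {
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    memset(&sig, 0, sizeof(sig));
    sig.params.fence.value = value;

    CUresult err = cuSignalExternalSemaphoresAsync(&extSem, &sig, 1, stream);
    if (err != CUDA_SUCCESS) return err;

    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wp;
    memset(&wp, 0, sizeof(wp));
    wp.params.fence.value = value;
    return cuWaitExternalSemaphoresAsync(&extSem, &wp, 1, stream);
}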
*/ -} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; - -/** - * Semaphore wait node parameters - */ -typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ -} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; - -typedef unsigned long long CUmemGenericAllocationHandle_v1; -typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; - -/** - * Flags for specifying particular handle types - */ -typedef enum CUmemAllocationHandleType_enum { - CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ - CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ - CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ - CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationHandleType; - -/** - * Specifies the memory protection flags for mapping. - */ -typedef enum CUmemAccess_flags_enum { - CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ - CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF -} CUmemAccess_flags; - -/** - * Specifies the type of location - */ -typedef enum CUmemLocationType_enum { - CU_MEM_LOCATION_TYPE_INVALID = 0x0, - CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ - CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemLocationType; - -/** -* Defines the allocation types available -*/ -typedef enum CUmemAllocationType_enum { - CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, - - /** This allocation type is 'pinned', i.e. cannot migrate from its current - * location while the application is actively using it - */ - CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, - CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationType; - -/** -* Flag for requesting different optimal and required granularities for an allocation. 
-*/ -typedef enum CUmemAllocationGranularity_flags_enum { - CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ - CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ -} CUmemAllocationGranularity_flags; - -/** - * Sparse subresource types - */ -typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 -} CUarraySparseSubresourceType; - -/** - * Memory operation types - */ -typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 -} CUmemOperationType; - -/** - * Memory handle types - */ -typedef enum CUmemHandleType_enum { - CU_MEM_HANDLE_TYPE_GENERIC = 0 -} CUmemHandleType; - -/** - * Specifies the CUDA array or CUDA mipmapped array memory mapping information - */ -typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; /**< Resource type */ - - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ - - union { - struct { - unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ - unsigned int offsetX; /**< Starting X offset in elements */ - unsigned int offsetY; /**< Starting Y offset in elements */ - unsigned int offsetZ; /**< Starting Z offset in elements */ - unsigned int extentWidth; /**< Width in elements */ - unsigned int extentHeight; /**< Height in elements */ - unsigned int extentDepth; /**< Depth in elements */ - } sparseLevel; - struct { - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ - unsigned long long offset; /**< Offset within mip tail */ - unsigned long long size; /**< Extent in bytes */ - } miptail; - } subresource; - - CUmemOperationType memOperationType; /**< Memory operation type */ - CUmemHandleType memHandleType; /**< Memory handle type */ - - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; /**< Offset within the memory */ - unsigned int deviceBitMask; /**< Device ordinal bit mask */ - unsigned int flags; /**< flags for future use, must be zero now. */ - unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ -} CUarrayMapInfo_v1; -typedef CUarrayMapInfo_v1 CUarrayMapInfo; - -/** - * Specifies a memory location. - */ -typedef struct CUmemLocation_st { - CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ - int id; /**< identifier for a given this location's ::CUmemLocationType. */ -} CUmemLocation_v1; -typedef CUmemLocation_v1 CUmemLocation; - -/** - * Specifies compression attribute for an allocation. - */ -typedef enum CUmemAllocationCompType_enum { - CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ - CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ -} CUmemAllocationCompType; - -/** - * This flag if set indicates that the memory will be used as a tile pool. - */ -#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 - -/** -* Specifies the allocation properties for a allocation. 
-*/ -typedef struct CUmemAllocationProp_st { - /** Allocation type */ - CUmemAllocationType type; - /** requested ::CUmemAllocationHandleType */ - CUmemAllocationHandleType requestedHandleTypes; - /** Location of allocation */ - CUmemLocation location; - /** - * Windows-specific POBJECT_ATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure - * includes security attributes that define - * the scope of which exported allocations may be transferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32HandleMetaData; - struct { - /** - * Allocation hint for requesting compressible memory. - * On devices that support Compute Data Compression, compressible - * memory can be used to accelerate accesses to data with unstructured - * sparsity and other compressible data patterns. Applications are - * expected to query allocation property of the handle obtained with - * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to - * validate if the obtained allocation is compressible or not. Note that - * compressed memory may not be mappable on all devices. - */ - unsigned char compressionType; - unsigned char gpuDirectRDMACapable; - /** Bitmask indicating intended usage for this allocation */ - unsigned short usage; - unsigned char reserved[4]; - } allocFlags; -} CUmemAllocationProp_v1; -typedef CUmemAllocationProp_v1 CUmemAllocationProp; - -/** - * Memory access descriptor - */ -typedef struct CUmemAccessDesc_st { - CUmemLocation location; /**< Location on which the request is to change its accessibility */ - CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ -} CUmemAccessDesc_v1; -typedef CUmemAccessDesc_v1 CUmemAccessDesc; - -typedef enum CUgraphExecUpdateResult_enum { - CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ - CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ -} CUgraphExecUpdateResult; - -/** - * CUDA memory pool attributes - */ -typedef enum CUmemPool_attribute_enum { - /** - * (value type = int) - * Allow cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * CUDA events and null stream interactions can create the required - * stream ordered dependencies.
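A sketch of the low-level virtual memory flow built on ::CUmemAllocationProp, ::CUmemLocation and ::CUmemAccessDesc: pick a granularity, create physical memory, reserve a VA range, map it, and grant read-write access on one device (cleanup on failure omitted):

#include <string.h>
#include <cuda.h>

/* Reserve a VA range, back it with physical memory from cuMemCreate, and make
 * it read-write on `device`. Sizes are rounded up to the minimum granularity. */
static CUresult vmm_alloc_rw(int device, size_t bytes, CUdeviceptr *vaOut,
                             CUmemGenericAllocationHandle *handleOut) {
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    size_t gran = 0;
    CUresult err = cuMemGetAllocationGranularity(&gran, &prop,
                                                 CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (err != CUDA_SUCCESS) return err;
    size_t padded = ((bytes + gran - 1) / gran) * gran;

    err = cuMemCreate(handleOut, padded, &prop, 0);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemAddressReserve(vaOut, padded, /*alignment=*/0, /*addr=*/0, 0);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemMap(*vaOut, padded, /*offset=*/0, *handleOut, 0);
    if (err != CUDA_SUCCESS) return err;

    CUmemAccessDesc access;
    memset(&access, 0, sizeof(access));
    access.location = prop.location;
    access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    return cuMemSetAccess(*vaOut, padded, &access, 1);
}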
(default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, - - /** - * (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - - /** - * (value type = int) - * Allow cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by cuFreeAsync (default enabled). - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, - - /** - * (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - */ - CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - - /** - * (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - */ - CU_MEMPOOL_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application since - * the last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_USED_MEM_HIGH -} CUmemPool_attribute; - -/** - * Specifies the properties of allocations made from the pool. - */ -typedef struct CUmemPoolProps_st { - CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ - CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ - CUmemLocation location; /**< Location where allocations should reside. */ - /** - * Windows-specific LPSECURITYATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines - * the scope of which exported allocations may be tranferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32SecurityAttributes; - unsigned char reserved[64]; /**< reserved for future use, must be 0 */ -} CUmemPoolProps_v1; -typedef CUmemPoolProps_v1 CUmemPoolProps; - -/** - * Opaque data for exporting a pool allocation - */ -typedef struct CUmemPoolPtrExportData_st { - unsigned char reserved[64]; -} CUmemPoolPtrExportData_v1; -typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; - -/** - * Memory allocation node parameters - */ -typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { - /** - * in: location where the allocation should reside (specified in ::location). - * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. - */ - CUmemPoolProps poolProps; - const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ - size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. 
*/ - size_t bytesize; /**< in: size in bytes of the requested allocation */ - CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ -} CUDA_MEM_ALLOC_NODE_PARAMS; - -typedef enum CUgraphMem_attribute_enum { - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently associated with graphs - */ - CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH -} CUgraphMem_attribute; - -/** - * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only - * waits for prior work in the stream corresponding to that GPU to complete before the - * kernel begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 - -/** - * If set, any subsequent work pushed in a stream that participated in a call to - * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on - * the GPU corresponding to that stream to complete before it begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 - -/** - * If set, the CUDA array is a collection of layers, where each layer is either a 1D - * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number - * of layers, not the depth of a 3D array. - */ -#define CUDA_ARRAY3D_LAYERED 0x01 - -/** - * Deprecated, use CUDA_ARRAY3D_LAYERED - */ -#define CUDA_ARRAY3D_2DARRAY 0x01 - -/** - * This flag must be set in order to bind a surface reference - * to the CUDA array - */ -#define CUDA_ARRAY3D_SURFACE_LDST 0x02 - -/** - * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The - * width of such a CUDA array must be equal to its height, and Depth must be six. - * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps - * and Depth must be a multiple of six. - */ -#define CUDA_ARRAY3D_CUBEMAP 0x04 - -/** - * This flag must be set in order to perform texture gather operations - * on a CUDA array. - */ -#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 - -/** - * This flag if set indicates that the CUDA - * array is a DEPTH_TEXTURE. - */ -#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 - -/** - * This flag indicates that the CUDA array may be bound as a color target - * in an external graphics API - */ -#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 - -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * is a sparse CUDA array or CUDA mipmapped array respectively - */ -#define CUDA_ARRAY3D_SPARSE 0x40 - - -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * will allow deferred memory mapping - */ -#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 - - -/** - * Override the texref format with a format inferred from the array. - * Flag for ::cuTexRefSetArray() - */ -#define CU_TRSA_OVERRIDE_FORMAT 0x01 - -/** - * Read the texture as integers rather than promoting the values to floats - * in the range [0,1]. 
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_READ_AS_INTEGER 0x01 - -/** - * Use normalized texture coordinates in the range [0,1) instead of [0,dim). - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_NORMALIZED_COORDINATES 0x02 - -/** - * Perform sRGB->linear conversion during texture read. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_SRGB 0x10 - - /** - * Disable any trilinear filtering optimizations. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 - -/** - * Enable seamless cube map filtering. - * Flag for ::cuTexObjectCreate() - */ -#define CU_TRSF_SEAMLESS_CUBEMAP 0x40 - -/** - * End of array terminator for the \p extra parameter to - * ::cuLaunchKernel - */ -#define CU_LAUNCH_PARAM_END ((void*)0x00) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a buffer containing all kernel - * parameters used for launching kernel \p f. This buffer needs to - * honor all alignment/padding requirements of the individual parameters. - * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the - * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no - * effect. - */ -#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a size_t which contains the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. - * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified - * in the \p extra array if the value associated with - * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. - */ -#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) - -/** - * For texture references loaded into the module, use default texunit from - * texture reference. - */ -#define CU_PARAM_TR_DEFAULT -1 - -/** - * Device that represents the CPU - */ -#define CU_DEVICE_CPU ((CUdevice)-1) - -/** - * Device that represents an invalid device - */ -#define CU_DEVICE_INVALID ((CUdevice)-2) - -/** - * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS - */ -typedef enum CUflushGPUDirectRDMAWritesOptions_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ -} CUflushGPUDirectRDMAWritesOptions; - -/** - * Platform native ordering for GPUDirect RDMA writes - */ -typedef enum CUGPUDirectRDMAWritesOrdering_enum { - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. 
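The ::CU_LAUNCH_PARAM_BUFFER_POINTER / ::CU_LAUNCH_PARAM_BUFFER_SIZE markers described above let all kernel arguments be passed through the \p extra parameter of ::cuLaunchKernel instead of \p kernelParams. A minimal sketch, assuming a kernel that takes a CUdeviceptr and an int (the argument struct and its layout are illustrative):

#include <cuda.h>
#include <stddef.h>

CUresult launch_with_packed_args(CUfunction kernel, CUdeviceptr data, int n,
                                 CUstream stream) {
    /* The buffer must honor the alignment/padding of the kernel's parameters. */
    struct { CUdeviceptr data; int n; } args = { data, n };
    size_t args_size = sizeof(args);
    void *extra[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
        CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
        CU_LAUNCH_PARAM_END
    };
    /* kernelParams is NULL because extra carries the arguments. */
    return cuLaunchKernel(kernel, 1, 1, 1, 128, 1, 1, 0, stream, NULL, extra);
}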
*/ -} CUGPUDirectRDMAWritesOrdering; - -/** - * The scopes for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesScope_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ -} CUflushGPUDirectRDMAWritesScope; - -/** - * The targets for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesTarget_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ -} CUflushGPUDirectRDMAWritesTarget; - -/** - * The additional write options for ::cuGraphDebugDotPrint - */ -typedef enum CUgraphDebugDot_flags_enum { - CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ - CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ -} CUgraphDebugDot_flags; - -/** - * Flags for user objects for graphs - */ -typedef enum CUuserObject_flags_enum { - CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ -} CUuserObject_flags; - -/** - * Flags for retaining user object references for graphs - */ -typedef enum CUuserObjectRetain_flags_enum { - CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ -} CUuserObjectRetain_flags; - -/** - * Flags for instantiating a graph - */ -typedef enum CUgraphInstantiate_flags_enum { - CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. 
*/ -} CUgraphInstantiate_flags; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -/** @} */ /* END CUDA_TYPES */ - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility push(default) - #endif -#endif - -#ifdef _WIN32 -#define CUDAAPI __stdcall -#else -#define CUDAAPI -#endif - -/** - * \defgroup CUDA_ERROR Error Handling - * - * ___MANBRIEF___ error handling functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the error handling functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Gets the string description of an error code - * - * Sets \p *pStr to the address of a NULL-terminated string description - * of the error code \p error. - * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorString - */ -CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); - -/** - * \brief Gets the string representation of an error code enum name - * - * Sets \p *pStr to the address of a NULL-terminated string representation - * of the name of the enum error code \p error. - * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorName - */ -CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); - -/** @} */ /* END CUDA_ERROR */ - -/** - * \defgroup CUDA_INITIALIZE Initialization - * - * ___MANBRIEF___ initialization functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the initialization functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Initialize the CUDA driver API - * - * Initializes the driver API and must be called before any other function from - * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() - * has not been called, any function from the driver API will return - * ::CUDA_ERROR_NOT_INITIALIZED. - * - * \param Flags - Initialization flag for CUDA. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE - * \notefnerr - */ -CUresult CUDAAPI cuInit(unsigned int Flags); - -/** @} */ /* END CUDA_INITIALIZE */ - -/** - * \defgroup CUDA_VERSION Version Management - * - * ___MANBRIEF___ version management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the version management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the latest CUDA version supported by driver - * - * Returns in \p *driverVersion the version of CUDA supported by - * the driver. The version is returned as - * (1000 × major + 10 × minor). For example, CUDA 9.2 - * would be represented by 9020. 
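Taken together, ::cuInit, ::cuGetErrorString and ::cuDriverGetVersion form the usual start-up sequence for a driver API client. A small sketch (the CHECK macro is illustrative, not part of the API):

#include <cuda.h>
#include <stdio.h>

#define CHECK(call) do {                                                    \
    CUresult err_ = (call);                                                 \
    if (err_ != CUDA_SUCCESS) {                                             \
        const char *msg_ = NULL;                                            \
        cuGetErrorString(err_, &msg_);                                      \
        fprintf(stderr, "%s failed: %s\n", #call, msg_ ? msg_ : "unknown"); \
    }                                                                       \
} while (0)

int main(void) {
    CHECK(cuInit(0));                  /* Flags must currently be 0 */
    int version = 0;
    CHECK(cuDriverGetVersion(&version));
    /* Version is encoded as 1000 * major + 10 * minor, e.g. 9020 for CUDA 9.2. */
    printf("driver supports CUDA %d.%d\n", version / 1000, (version % 1000) / 10);
    return 0;
}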
- * - * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if - * \p driverVersion is NULL. - * - * \param driverVersion - Returns the CUDA driver version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cudaDriverGetVersion, - * ::cudaRuntimeGetVersion - */ -CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); - -/** @} */ /* END CUDA_VERSION */ - -/** - * \defgroup CUDA_DEVICE Device Management - * - * ___MANBRIEF___ device management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given an ordinal in the range [0, - * ::cuDeviceGetCount()-1]. - * - * \param device - Returned device handle - * \param ordinal - Device number to get handle for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport - */ -CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); - -/** - * \brief Returns the number of compute-capable devices - * - * Returns in \p *count the number of devices with compute capability greater - * than or equal to 2.0 that are available for execution. If there is no such - * device, ::cuDeviceGetCount() returns 0. - * - * \param count - Returned number of compute-capable devices - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceCount - */ -CUresult CUDAAPI cuDeviceGetCount(int *count); - -/** - * \brief Returns an identifer string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p name. \p len specifies the maximum length of the - * string that may be returned. - * - * \param name - Returned identifier string for the device - * \param len - Maximum length of string to store in \p name - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGetCount, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); - -/** - * \brief Return an UUID for the device - * - * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. 
- * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetUuid_v2 - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an UUID for the device (11.4+) - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. If the device is in MIG mode, returns its - * MIG UUID which uniquely identifies the subscribed MIG compute instance. - * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an LUID and device node mask for the device - * - * Return identifying information (\p luid and \p deviceNodeMask) to allow - * matching device with graphics APIs. - * - * \param luid - Returned LUID - * \param deviceNodeMask - Returned device node mask - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); - -/** - * \brief Returns the total amount of memory on the device - * - * Returns in \p *bytes the total amount of memory available on the device - * \p dev in bytes. - * - * \param bytes - Returned memory available on device in bytes - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaMemGetInfo - */ -CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); - -/** - * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. - * - * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture - * for given \p format and \p numChannels. - * - * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. - * \param format - Texture format. - * \param numChannels - Number of channels per texture element. - * \param dev - Device handle. 
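The device query entry points declared above (::cuDeviceGetCount, ::cuDeviceGet, ::cuDeviceGetName, ::cuDeviceTotalMem) are typically combined as in the following sketch, which assumes ::cuInit(0) has already succeeded:

#include <cuda.h>
#include <stdio.h>

void list_devices(void) {
    int count = 0;
    cuDeviceGetCount(&count);
    for (int i = 0; i < count; ++i) {
        CUdevice dev;
        char name[256];
        size_t bytes = 0;
        cuDeviceGet(&dev, i);                       /* ordinal -> handle */
        cuDeviceGetName(name, (int)sizeof(name), dev);
        cuDeviceTotalMem(&bytes, dev);
        printf("device %d: %s, %zu MiB\n", i, name, bytes >> 20);
    }
}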
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cudaMemGetInfo, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); - -/** - * \brief Returns information about the device - * - * Returns in \p *pi the integer value of the attribute \p attrib on device - * \p dev. The supported attributes are: - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per - * block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared memory available to a thread block in bytes - * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes - * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads - * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated through - * ::cuMemAllocPitch() - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width - * for a 1D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum - * mipmapped 1D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch - * in bytes for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum - * mipmapped 2D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum - * mipmapped 2D texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D - * texture depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: - * Alternate maximum 3D texture width, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: - * Alternate maximum 3D texture height, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: - * Alternate maximum 3D texture depth, 0 if no alternate - * maximum 3D texture size is supported - * - 
::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: - * Maximum cubemap texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: - * Maximum 1D layered texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: - * Maximum 2D layered texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: - * Maximum 2D layered texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: - * Maximum 1D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: - * Maximum 2D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: - * Maximum 2D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: - * Maximum 3D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: - * Maximum 3D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: - * Maximum 3D surface depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: - * Maximum 1D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: - * Maximum 2D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: - * Maximum 2D layered surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: - * Maximum cubemap surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers available to a thread block - * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture - * base addresses aligned to ::textureAlign bytes do not need an offset - * applied to texture fetches - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement - * for 2D texture references bound to pitched memory - * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device - * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and - * can have multiple CUDA contexts present at a single time. 
- * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. - * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device - * can have only one context used by a single process at a time. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports - * executing multiple kernels within the same context simultaneously, or 0 if - * not. It is not guaranteed that multiple kernels will be resident - * on the device concurrently so this feature should not be relied upon for - * correctness. - * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the - * device, 0 if error correction is disabled or not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier - * of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC - * is only available on Tesla hardware running Windows Vista or later - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits - * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with - * the host, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals - * in L1 cache, 0 if caching globals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals - * in L1 cache, 0 if caching locals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of - * shared memory available to a multiprocessor in bytes; this amount is shared - * by all thread blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit - * registers available to a multiprocessor; this number is shared by all thread - * blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory - * on this system, 0 if allocating managed memory is not supported by the device on this system. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices - * associated with the same board. Devices on the same multi-GPU board will share the same identifier. - * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host - * supports native atomic operations. - * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance - * (in floating-point operations per second) to double precision performance. 
- * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently accessing - * pageable memory without calling cudaHostRegister on it. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory - * concurrently with the CPU. - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. - * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered - * memory at the same virtual address as the CPU. - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size - * suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. - * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES - * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's - * page tables. - * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. - * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes - * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. - * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes - * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. - * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC - - * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. - - * - * \param pi - Returned device attribute value - * \param attrib - Device attribute to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaDeviceGetAttribute, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); - -/** - * \brief Return NvSciSync attributes that this device can support. - * - * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that - * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList - * can be used to create an NvSciSync object that matches this device's capabilities. - * - * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is - * already set this API will return ::CUDA_ERROR_INVALID_VALUE. - * - * The applications should set \p nvSciSyncAttrList to a valid - * NvSciSyncAttrList failing which this API will return - * ::CUDA_ERROR_INVALID_HANDLE. - * - * The \p flags controls how applications intends to use - * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: - * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to - * signal an NvSciSync on this CUDA device. - * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to - * wait on an NvSciSync on this CUDA device. - * - * At least one of these flags must be set, failing which the API - * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal - * to one another: a developer may set both these flags that allows to - * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. - * - * \param nvSciSyncAttrList - Return NvSciSync attributes supported. - * \param dev - Valid Cuda Device to get NvSciSync attributes for. - * \param flags - flags describing NvSciSync usage. 
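In practice ::cuDeviceGetAttribute is queried once per device and the results cached. A sketch reading a few of the attributes listed above (the DeviceLimits struct is illustrative, not part of the API):

#include <cuda.h>

typedef struct {
    int sm_count;
    int max_threads_per_block;
    int max_shared_optin;   /* bytes, opted into via cuFuncSetAttribute */
} DeviceLimits;

void query_limits(CUdevice dev, DeviceLimits *out) {
    cuDeviceGetAttribute(&out->sm_count,
                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
    cuDeviceGetAttribute(&out->max_threads_per_block,
                         CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
    cuDeviceGetAttribute(&out->max_shared_optin,
                         CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
}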
- * - * \return - * - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa - * ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); - -/** - * \brief Sets the current memory pool of a device - * - * The memory pool must be local to the specified device. - * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. - * By default, a device's current memory pool is its default memory pool. - * - * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different - * than the one the stream runs on. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync - */ -CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); - -/** - * \brief Gets the current mempool for a device - * - * Returns the last pool provided to ::cuDeviceSetMemPool for this device - * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. - * By default the current mempool is the default mempool for a device. - * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool - */ -CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); - -/** - * \brief Returns the default mempool of a device - * - * The default mempool of a device contains device memory from that device. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); - -/** - * \brief Blocks until remote writes are visible to the specified scope - * - * Blocks until GPUDirect RDMA writes to the target context via mappings - * created through APIs like nvidia_p2p_get_pages (see - * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are - * visible to the specified scope. - * - * If the scope equals or lies within the scope indicated by - * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call - * will be a no-op and can be safely omitted for performance. This can be - * determined by comparing the numerical values between the two enums, with - * smaller scopes having smaller values. - * - * Users may query support for this API via - * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. 
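The per-device pool accessors above are usually paired with ::cuMemAllocAsync. A sketch that raises the release threshold of the default pool so freed memory is retained for reuse (::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD is documented earlier in this header; error handling omitted):

#include <cuda.h>

/* Keep up to 64 MiB of freed memory cached in the device's default pool
 * instead of returning it to the OS at each synchronization point. */
void configure_default_pool(CUdevice dev, CUstream stream) {
    CUmemoryPool pool;
    cuDeviceGetDefaultMemPool(&pool, dev);

    cuuint64_t threshold = 64ull << 20;
    cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    CUdeviceptr p;
    cuMemAllocAsync(&p, 1 << 20, stream);   /* allocates from the current pool */
    cuMemFreeAsync(p, stream);              /* freed bytes may be reused later */
    cuStreamSynchronize(stream);
}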
- * - * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget - * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - */ -CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); - -/** @} */ /* END CUDA_DEVICE */ - -/** - * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated device management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns properties for a selected device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). - * - * Returns in \p *prop the properties of device \p dev. The ::CUdevprop - * structure is defined as: - * - * \code - typedef struct CUdevprop_st { - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int sharedMemPerBlock; - int totalConstantMemory; - int SIMDWidth; - int memPitch; - int regsPerBlock; - int clockRate; - int textureAlign - } CUdevprop; - * \endcode - * where: - * - * - ::maxThreadsPerBlock is the maximum number of threads per block; - * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; - * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; - * - ::sharedMemPerBlock is the total amount of shared memory available per - * block in bytes; - * - ::totalConstantMemory is the total amount of constant memory available on - * the device in bytes; - * - ::SIMDWidth is the warp size; - * - ::memPitch is the maximum pitch allowed by the memory copy functions that - * involve memory regions allocated through ::cuMemAllocPitch(); - * - ::regsPerBlock is the total number of registers available per block; - * - ::clockRate is the clock frequency in kilohertz; - * - ::textureAlign is the alignment requirement; texture base addresses that - * are aligned to ::textureAlign bytes do not need an offset applied to - * texture fetches. - * - * \param prop - Returned properties of device - * \param dev - Device to get properties for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); - -/** - * \brief Returns the compute capability of the device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and its functionality superceded - * by ::cuDeviceGetAttribute(). - * - * Returns in \p *major and \p *minor the major and minor revision numbers that - * define the compute capability of the device \p dev. 
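Since ::cuDeviceGetProperties and ::cuDeviceComputeCapability are deprecated, the same information is read through ::cuDeviceGetAttribute. A sketch of the replacement for the compute capability query:

#include <cuda.h>

/* Non-deprecated equivalent of cuDeviceComputeCapability. */
void compute_capability(CUdevice dev, int *major, int *minor) {
    cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
    cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
}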
- * - * \param major - Major revision number - * \param minor - Minor revision number - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); - -/** @} */ /* END CUDA_DEVICE_DEPRECATED */ - -/** - * \defgroup CUDA_PRIMARY_CTX Primary Context Management - * - * ___MANBRIEF___ primary context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the primary context management functions of the low-level - * CUDA driver application programming interface. - * - * The primary context is unique per device and shared with the CUDA runtime API. - * These functions allow integration with other libraries using CUDA. - * - * @{ - */ - -/** - * \brief Retain the primary context on the GPU - * - * Retains the primary context on the device. - * Once the user successfully retains the primary context, the primary context - * will be active and available to the user until the user releases it - * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). - * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. - * - * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN - * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function - * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to - * determine the compute mode of the device. - * The nvidia-smi tool can be used to set the compute mode for - * devices. Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * Please note that the primary context always supports pinned allocations. Other - * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). - * - * \param pctx - Returned context handle of the new context - * \param dev - Device for which primary context is requested - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRelease, - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); - -/** - * \brief Release the primary context on the GPU - * - * Releases the primary context interop on the device. - * A retained context should always be released once the user is done using - * it. The context is automatically reset once the last reference to it is - * released. This behavior is different when the primary context was retained - * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary - * context remains always active. 
- * - * Releasing a primary context that has not been previously retained will - * fail with ::CUDA_ERROR_INVALID_CONTEXT. - * - * Please note that unlike ::cuCtxDestroy() this method does not pop the context - * from the stack under any circumstances. - * - * \param dev - Device whose primary context is released - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - -/** - * \brief Set flags for the primary context - * - * Sets the flags for the primary context on the device, overwriting previously - * set ones. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * - * \param dev - Device for which the primary context flags are set - * \param flags - New flags for the device - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxGetState, - * ::cuCtxCreate, - * ::cuCtxGetFlags, - * ::cudaSetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - -/** - * \brief Get the state of the primary context - * - * Returns in \p *flags the flags for the primary context of \p dev, and in - * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag - * values. - * - * \param dev - Device to get primary context flags for - * \param flags - Pointer to store flags - * \param active - Pointer to store context state; 0 = inactive, 1 = active - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxGetFlags, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); - -/** - * \brief Destroy all allocations and reset all state on the primary context - * - * Explicitly destroys and cleans up all resources associated with the current - * device in the current process. - * - * Note that it is responsibility of the calling function to ensure that no - * other module in the process is using the device any more. For that reason - * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. - * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() - * even after resetting the device. - * Resetting the primary context does not release it, an application that has - * retained the primary context should explicitly release its usage. - * - * \param dev - Device for which primary context is destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxRelease, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceReset - */ -CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - -/** @} */ /* END CUDA_PRIMARY_CTX */ - -/** - * \brief Returns information about the execution affinity support of the device. - * - * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. 
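A typical library-style use of the primary context functions above: set the scheduling flags, retain the context, bind it to the calling thread, and release it when done. A sketch (error handling omitted):

#include <cuda.h>
#include <stddef.h>

void run_on_primary_context(CUdevice dev) {
    /* Setting flags before retaining is the safe order on all driver versions. */
    cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC);

    CUcontext ctx = NULL;
    cuDevicePrimaryCtxRetain(&ctx, dev);   /* does not push onto the stack */
    cuCtxSetCurrent(ctx);                  /* make it current for this thread */

    /* ... launch work here ... */

    cuDevicePrimaryCtxRelease(dev);        /* every retain needs a matching release */
}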
- * The supported types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, - * or 0 if not; - * - * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not - * \param type - Execution affinity type to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); - -/** - * \defgroup CUDA_CTX Context Management - * - * ___MANBRIEF___ context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the context management functions of the low-level - * CUDA driver application programming interface. - * - * Please note that some functions are described in - * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. - * - * @{ - */ - -/** - * \brief Create a CUDA context - * - * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. - * - * Creates a new CUDA context and associates it with the calling thread. The - * \p flags parameter is described below. The context is created with a usage - * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or - * when done using the context. If a context is already current to the thread, - * it is supplanted by the newly created context and may be restored by a subsequent - * call to ::cuCtxPopCurrent(). - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for * devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - -/** - * \brief Create a CUDA context with execution affinity - * - * Creates a new CUDA context with execution affinity and associates it with - * the calling thread. The \p paramsArray and \p flags parameter are described below. - * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must - * call ::cuCtxDestroy() or when done using the context. If a context is already - * current to the thread, it is supplanted by the newly created context and may - * be restored by a subsequent call to ::cuCtxPopCurrent(). - * - * The type and the amount of execution resource the context can use is limited by \p paramsArray - * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams - * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, - * the latter execution affinity parameter overrides the former execution affinity parameter. - * The supported execution affinity types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion - * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally - * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution - * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute - * is only supported under Volta+ MPS. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. 
This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for * devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param paramsArray - Execution affinity parameters - * \param numParams - Number of execution affinity parameters - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); - -/** - * \brief Destroy a CUDA context - * - * Destroys the CUDA context specified by \p ctx. The context \p ctx will be - * destroyed regardless of how many threads it is current to. - * It is the responsibility of the calling function to ensure that no API - * call issues using \p ctx while ::cuCtxDestroy() is executing. - * - * Destroys and cleans up all resources associated with the context. - * It is the caller's responsibility to ensure that the context or its resources - * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. - * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, - * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, - * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. - * - * If \p ctx is current to the calling thread then \p ctx will also be - * popped from the current thread's context stack (as though ::cuCtxPopCurrent() - * were called). If \p ctx is current to other threads, then \p ctx will - * remain current to those threads, and attempting to access \p ctx from - * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. 
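- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * a minimal create/use/destroy lifecycle for a context owned by the caller.
- * It assumes ::cuInit() has already succeeded, uses device ordinal 0, and
- * omits error checking.
- * \code
-   CUdevice dev;
-   CUcontext ctx;
-   cuDeviceGet(&dev, 0);
-   cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev);
-   /* ... allocate memory, load modules, launch work ... */
-   cuCtxSynchronize();
-   cuCtxDestroy(ctx);
- * \endcode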
- * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); - -/** - * \brief Pushes a context on the current CPU thread - * - * Pushes the given context \p ctx onto the CPU thread's stack of current - * contexts. The specified context becomes the CPU thread's current context, so - * all CUDA functions that operate on the current context are affected. - * - * The previous current context may be made current again by calling - * ::cuCtxDestroy() or ::cuCtxPopCurrent(). - * - * \param ctx - Context to push - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - -/** - * \brief Pops the current CUDA context from the current CPU thread. - * - * Pops the current CUDA context from the CPU thread and passes back the - * old context handle in \p *pctx. That context may then be made current - * to a different CPU thread by calling ::cuCtxPushCurrent(). - * - * If a context was current to the CPU thread before ::cuCtxCreate() or - * ::cuCtxPushCurrent() was called, this function makes that context current to - * the CPU thread again. - * - * \param pctx - Returned popped context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - -/** - * \brief Binds the specified CUDA context to the calling CPU thread - * - * Binds the specified CUDA context to the calling CPU thread. - * If \p ctx is NULL then the CUDA context previously bound to the - * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. - * - * If there exists a CUDA context stack on the calling CPU thread, this - * will replace the top of that stack with \p ctx. - * If \p ctx is NULL then this will be equivalent to popping the top - * of the calling CPU thread's CUDA context stack (or a no-op if the - * calling CPU thread's CUDA context stack is empty). - * - * \param ctx - Context to bind to the calling CPU thread - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa - * ::cuCtxGetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaSetDevice - */ -CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); - -/** - * \brief Returns the CUDA context bound to the calling CPU thread. 
- * - * Returns in \p *pctx the CUDA context bound to the calling CPU thread. - * If no context is bound to the calling CPU thread then \p *pctx is - * set to NULL and ::CUDA_SUCCESS is returned. - * - * \param pctx - Returned context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * \notefnerr - * - * \sa - * ::cuCtxSetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); - -/** - * \brief Returns the device ID for the current context - * - * Returns in \p *device the ordinal of the current context's device. - * - * \param device - Returned device ID for the current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); - -/** - * \brief Returns the flags for the current context - * - * Returns in \p *flags the flags of the current context. See ::cuCtxCreate - * for flag values. - * - * \param flags - Pointer to store flags of current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetCurrent, - * ::cuCtxGetDevice, - * ::cuCtxGetLimit, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxGetStreamPriorityRange, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); - -/** - * \brief Block for a context's tasks to complete - * - * Blocks until the device has completed all preceding requested tasks. - * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the - * CPU thread will block until the GPU context has finished its work. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cudaDeviceSynchronize - */ -CUresult CUDAAPI cuCtxSynchronize(void); - -/** - * \brief Set resource limits - * - * Setting \p limit to \p value is a request by the application to update - * the current limit maintained by the context. The driver is free to - * modify the requested value to meet h/w requirements (this could be - * clamping to minimum or maximum values, rounding up to nearest element - * size, etc). The application can use ::cuCtxGetLimit() to find out exactly - * what the limit has been set to. - * - * Setting each ::CUlimit has its own specific restrictions, so each is - * discussed here. - * - * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. - * The driver automatically increases the per-thread stack size - * for each kernel launch as needed. 
This size isn't reset back to the
- * original value after each launch. Setting this value will take effect
- * immediately, and if necessary, the device will block until all preceding
- * requested tasks are complete.
- *
- * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
- * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
- * must be performed before launching any kernel that uses the ::printf()
- * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
- * by the ::malloc() and ::free() device system calls. Setting
- * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
- * that uses the ::malloc() or ::free() device system calls, otherwise
- * ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
- * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
- * this limit must be performed before any launch of a kernel that uses the
- * device runtime and calls ::cudaDeviceSynchronize() above the default sync
- * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
- * with error code ::cudaErrorSyncDepthExceeded if the limitation is
- * violated. This limit can be set smaller than the default or up to the maximum
- * launch depth of 24. When setting this limit, keep in mind that additional
- * levels of sync depth require the driver to reserve large amounts of device
- * memory which can no longer be used for user allocations. If these
- * reservations of device memory fail, ::cuCtxSetLimit() will return
- * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- * This limit is only applicable to devices of compute capability 3.5 and
- * higher. Attempting to set this limit on devices of compute capability less
- * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- * returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
- * outstanding device runtime launches that can be made from the current
- * context. A grid is outstanding from the point of launch up until the grid
- * is known to have been completed. Device runtime launches which violate
- * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
- * ::cudaGetLastError() is called after launch. If more pending launches than
- * the default (2048 launches) are needed for a module using the device
- * runtime, this limit can be increased. Keep in mind that being able to
- * sustain additional pending launches will require the driver to reserve
- * larger amounts of device memory upfront which can no longer be used for
- * allocations. If these reservations fail, ::cuCtxSetLimit() will return
- * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- * This limit is only applicable to devices of compute capability 3.5 and
- * higher. Attempting to set this limit on devices of compute capability less
- * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- * returned.
- *
- * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
- * Values can range from 0B to 128B. This is purely a performance hint and
- * it can be ignored or clamped depending on the platform.
- *
- * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls the size in bytes available for
- * persisting L2 cache.
This is purely a performance hint and it can be - * ignored or clamped depending on the platform. - * - * \param limit - Limit to set - * \param value - Size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSynchronize, - * ::cudaDeviceSetLimit - */ -CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); - -/** - * \brief Returns resource limits - * - * Returns in \p *pvalue the current size of \p limit. The supported - * ::CUlimit values are: - * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. - * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the - * ::printf() device system call. - * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the - * ::malloc() and ::free() device system calls. - * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread - * can issue the device runtime call ::cudaDeviceSynchronize() to wait on - * child grid launches to complete. - * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding - * device runtime launches that can be made from this context. - * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. - * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes - * - * \param limit - Limit to query - * \param pvalue - Returned size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetLimit - */ -CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); - -/** - * \brief Returns the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this function returns through \p pconfig the preferred cache configuration - * for the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute functions. - * - * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices - * where the size of the L1 cache and shared memory are fixed. 
- * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param pconfig - Returned cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetCacheConfig - */ -CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); - -/** - * \brief Sets the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute the function. Any function preference - * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide - * setting. Setting the context-wide cache configuration to - * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer - * to not change the cache configuration unless required to launch the kernel. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. - * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetCacheConfig - */ -CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); - -/** - * \brief Returns the current shared memory configuration for the current context. - * - * This function will return in \p pConfig the current size of shared memory banks - * in the current context. On devices with configurable shared memory banks, - * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all - * subsequent kernel launches will by default use the new bank size. 
When - * ::cuCtxGetSharedMemConfig is called on devices without configurable shared - * memory, it will return the fixed bank size of the hardware. - * - * The returned bank configurations can be either: - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is - * four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will - * eight bytes. - * - * \param pConfig - returned shared memory configuration - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetSharedMemConfig - */ -CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); - -/** - * \brief Sets the shared memory configuration for the current context. - * - * On devices with configurable shared memory banks, this function will set - * the context's shared memory bank size which is used for subsequent kernel - * launches. - * - * Changed the shared memory configuration between launches may insert a device - * side synchronization point between those launches. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. - * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial - * setting (currently, four bytes). - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes. - * - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetSharedMemConfig - */ -CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); - -/** - * \brief Gets the context's API version. - * - * Returns a version number in \p version corresponding to the capabilities of - * the context (e.g. 3010 or 3020), which library developers can use to direct - * callers to a specific API version. If \p ctx is NULL, returns the API version - * used to create the currently bound context. - * - * Note that new API versions are only introduced when context capabilities are - * changed that break binary compatibility, so the API version and driver version - * may be different. 
For example, it is valid for the API version to be 3020 while - * the driver version is 4020. - * - * \param ctx - Context to check - * \param version - Pointer to version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); - -/** - * \brief Returns numerical values that correspond to the least and - * greatest stream priorities. - * - * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond - * to the least and greatest stream priorities respectively. Stream priorities - * follow a convention where lower numbers imply greater priorities. The range of - * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. - * If the user attempts to create a stream with a priority value that is - * outside the meaningful range as specified by this API, the priority is - * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority - * respectively. See ::cuStreamCreateWithPriority for details on creating a - * priority stream. - * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value - * is not desired. - * - * This function will return '0' in both \p *leastPriority and \p *greatestPriority if - * the current context's device does not support stream priorities - * (see ::cuDeviceGetAttribute). - * - * \param leastPriority - Pointer to an int in which the numerical value for least - * stream priority is returned - * \param greatestPriority - Pointer to an int in which the numerical value for greatest - * stream priority is returned - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetStreamPriorityRange - */ -CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); - -/** - * \brief Resets all persisting lines in cache to normal status. - * - * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal - * status. Takes effect on function return. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); - -/** - * \brief Returns the execution affinity setting for the current context. - * - * Returns in \p *pExecAffinity the current value of \p type. The supported - * ::CUexecAffinityType values are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. 
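- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * querying the SM-count affinity of the current context. A current context is
- * assumed; on configurations without execution affinity support the call
- * simply returns an error, which the sketch leaves unhandled.
- * \code
-   CUexecAffinityParam affinity;
-   if (cuCtxGetExecAffinity(&affinity, CU_EXEC_AFFINITY_TYPE_SM_COUNT) == CUDA_SUCCESS) {
-       unsigned int smCount = affinity.param.smCount.val;
-       /* smCount is the number of SMs this context may use (Volta+ MPS). */
-   }
- * \endcode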
- * - * \param type - Execution affinity type to query - * \param pExecAffinity - Returned execution affinity - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY - * \notefnerr - * - * \sa - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); - - -/** @} */ /* END CUDA_CTX */ - -/** - * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated context management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated context management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Increment a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Increments the usage count of the context and passes back a context handle - * in \p *pctx that must be passed to ::cuCtxDetach() when the application is - * done with the context. ::cuCtxAttach() fails if there is no context current - * to the thread. - * - * Currently, the \p flags parameter must be 0. - * - * \param pctx - Returned context handle of the current context - * \param flags - Context attach flags (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxDetach, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); - -/** - * \brief Decrement a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Decrements the usage count of the context \p ctx, and destroys the context - * if the usage count goes to 0. The context must be a handle that was passed - * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the - * calling thread. - * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); - -/** @} */ /* END CUDA_CTX_DEPRECATED */ - - -/** - * \defgroup CUDA_MODULE Module Management - * - * ___MANBRIEF___ module management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the module management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Loads a compute module - * - * Takes a filename \p fname and loads the corresponding module \p module into - * the current context. 
The CUDA driver API does not attempt to lazily - * allocate the resources needed by a module; if the memory for functions and - * data (constant and global) needed by the module cannot be allocated, - * ::cuModuleLoad() fails. The file should be a \e cubin file as output by - * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or - * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. - * - * \param module - Returned module - * \param fname - Filename of module to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_FILE_NOT_FOUND, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); - -/** - * \brief Load a module's data - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. - * - * \param module - Returned module - * \param image - Module data to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); - -/** - * \brief Load a module's data with options - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. Options are passed as - * an array via \p options and any corresponding parameters are passed in - * \p optionValues. The number of total options is supplied via \p numOptions. - * Any outputs will be returned via \p optionValues. 
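- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * loading a NULL-terminated PTX string with a JIT error-log buffer attached.
- * The PTX text 'ptx' is an assumption of the sketch; other errors are ignored.
- * \code
-   char errorLog[8192];
-   CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
-   void *optVals[]     = { errorLog, (void *)(size_t)sizeof(errorLog) };
-   CUmodule mod;
-   if (cuModuleLoadDataEx(&mod, ptx, 2, opts, optVals) != CUDA_SUCCESS) {
-       /* errorLog now contains the JIT compiler's diagnostics. */
-   }
- * \endcode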
- * - * \param module - Returned module - * \param image - Module data to load - * \param numOptions - Number of options - * \param options - Options for JIT - * \param optionValues - Option values for JIT - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Load a module's data - * - * Takes a pointer \p fatCubin and loads the corresponding module \p module - * into the current context. The pointer represents a fat binary object, - * which is a collection of different \e cubin and/or \e PTX files, all - * representing the same device code, but compiled and optimized for different - * architectures. - * - * Prior to CUDA 4.0, there was no documented API for constructing and using - * fat binary objects by programmers. Starting with CUDA 4.0, fat binary - * objects can be constructed by providing the -fatbin option to \b nvcc. - * More information can be found in the \b nvcc document. - * - * \param module - Returned module - * \param fatCubin - Fat binary to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); - -/** - * \brief Unloads a module - * - * Unloads a module \p hmod from the current context. - * - * \param hmod - Module to unload - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_destroy_ub - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary - */ -CUresult CUDAAPI cuModuleUnload(CUmodule hmod); - -/** - * \brief Returns a function handle - * - * Returns in \p *hfunc the handle of the function of name \p name located in - * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() - * returns ::CUDA_ERROR_NOT_FOUND. 
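- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * looking up a kernel in a loaded module. The file name and the kernel name
- * are placeholders; error checking other than the NOT_FOUND case is omitted.
- * \code
-   CUmodule mod;
-   CUfunction kernel;
-   cuModuleLoad(&mod, "kernels.cubin");
-   if (cuModuleGetFunction(&kernel, mod, "my_kernel") == CUDA_ERROR_NOT_FOUND) {
-       /* The module exports no function named "my_kernel". */
-   }
- * \endcode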
- * - * \param hfunc - Returned function handle - * \param hmod - Module to retrieve function from - * \param name - Name of function to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); - -/** - * \brief Returns a global pointer from a module - * - * Returns in \p *dptr and \p *bytes the base pointer and size of the - * global of name \p name located in module \p hmod. If no variable of that name - * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both - * parameters \p dptr and \p bytes are optional. If one of them is - * NULL, it is ignored. - * - * \param dptr - Returned global device pointer - * \param bytes - Returned global size in bytes - * \param hmod - Module to retrieve global from - * \param name - Name of global to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSymbolAddress, - * ::cudaGetSymbolSize - */ -CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a texture reference - * - * Returns in \p *pTexRef the handle of the texture reference of name \p name - * in the module \p hmod. If no texture reference of that name exists, - * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference - * handle should not be destroyed, since it will be destroyed when the module - * is unloaded. - * - * \param pTexRef - Returned texture reference - * \param hmod - Module to retrieve texture reference from - * \param name - Name of texture reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetSurfRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetTextureReference - */ -CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a surface reference - * - * Returns in \p *pSurfRef the handle of the surface reference of name \p name - * in the module \p hmod. If no surface reference of that name exists, - * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. 
- * - * \param pSurfRef - Returned surface reference - * \param hmod - Module to retrieve surface reference from - * \param name - Name of surface reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSurfaceReference - */ -CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); - -/** - * \brief Creates a pending JIT linker invocation. - * - * If the call is successful, the caller owns the returned CUlinkState, which - * should eventually be destroyed with ::cuLinkDestroy. The - * device code machine size (32 or 64 bit) will match the calling application. - * - * Both linker and compiler options may be specified. Compiler options will - * be applied to inputs to this linker action which must be compiled from PTX. - * The options ::CU_JIT_WALL_TIME, - * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES - * will accumulate data until the CUlinkState is destroyed. - * - * \p optionValues must remain valid for the life of the CUlinkState if output - * options are used. No other references to inputs are maintained after this - * call returns. - * - * \param numOptions Size of options arrays - * \param options Array of linker and compiler options - * \param optionValues Array of option values, each cast to void * - * \param stateOut On success, this will contain a CUlinkState to specify - * and complete this action - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - -/** - * \brief Add an input to a pending linker invocation - * - * Ownership of \p data is retained by the caller. No reference is retained to any - * inputs after this call returns. - * - * This method accepts only compiler options, which are used if the data must - * be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * \param state A pending linker action. - * \param type The type of the input data. - * \param data The input data. PTX must be NULL-terminated. - * \param size The length of the input data. - * \param name An optional name for this input in log messages. - * \param numOptions Size of options. - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). - * \param optionValues Array of option values, each cast to void *. 
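- *
- * Illustrative sketch (editorial addition, not part of the original header) of
- * the overall linker flow, i.e. ::cuLinkCreate, ::cuLinkAddData,
- * ::cuLinkComplete and ::cuLinkDestroy used together. The PTX string 'ptx' and
- * its size 'ptxSize' (including the trailing NUL) are assumptions; error
- * checking is omitted.
- * \code
-   CUlinkState link;
-   void *cubin;
-   size_t cubinSize;
-   CUmodule mod;
-   cuLinkCreate(0, NULL, NULL, &link);
-   cuLinkAddData(link, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "my_ptx",
-                 0, NULL, NULL);
-   cuLinkComplete(link, &cubin, &cubinSize);
-   cuModuleLoadData(&mod, cubin);  /* load before destroying the link state */
-   cuLinkDestroy(link);
- * \endcode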
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Add a file input to a pending linker invocation - * - * No reference is retained to any inputs after this call returns. - * - * This method accepts only compiler options, which are used if the input - * must be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * This method is equivalent to invoking ::cuLinkAddData on the contents - * of the file. - * - * \param state A pending linker action - * \param type The type of the input data - * \param path Path to the input file - * \param numOptions Size of options - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) - * \param optionValues Array of option values, each cast to void * - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_FILE_NOT_FOUND - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Complete a pending linker invocation - * - * Completes the pending linker action and returns the cubin image for the linked - * device code, which can be used with ::cuModuleLoadData. The cubin is owned by - * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. - * This call does not destroy \p state. - * - * \param state A pending linker invocation - * \param cubinOut On success, this will point to the output image - * \param sizeOut Optional parameter to receive the size of the generated image - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkDestroy, - * ::cuModuleLoadData - */ -CUresult CUDAAPI -cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); - -/** - * \brief Destroys state for a JIT linker invocation. - * - * \param state State object for the linker invocation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \sa ::cuLinkCreate - */ -CUresult CUDAAPI -cuLinkDestroy(CUlinkState state); - -/** @} */ /* END CUDA_MODULE */ - - -/** - * \defgroup CUDA_MEM Memory Management - * - * ___MANBRIEF___ memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the memory management functions of the low-level CUDA - * driver application programming interface. 
- *
- * @{
- */
-
-/**
- * \brief Gets free and total memory
- *
- * Returns in \p *total the total amount of memory available to the current context.
- * Returns in \p *free the amount of memory on the device that is free according to the OS.
- * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
- * In a multi-tenant situation, the free estimate returned is prone to a race condition:
- * a new allocation or free performed by a different process, or by a different thread in
- * the same process, between the time when free memory was estimated and the time it is
- * reported can cause the reported value to deviate from the memory that is actually free.
- *
- * The integrated GPU on Tegra shares memory with the CPU and other components
- * of the SoC. The free and total values returned by the API exclude
- * the SWAP memory space maintained by the OS on some platforms.
- * The OS may move some of the memory pages into the swap area as the GPU or
- * CPU allocate or access memory. See the Tegra app note on how to calculate
- * total and free memory on Tegra.
- *
- * \param free - Returned free memory in bytes
- * \param total - Returned total memory in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemGetInfo
- */
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
-
-/**
- * \brief Allocates device memory
- *
- * Allocates \p bytesize bytes of linear memory on the device and returns in
- * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
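- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * allocating a device buffer, copying data in and out, and freeing it. A
- * current context and a host array 'hostData' of 'n' floats are assumed;
- * error checking is omitted.
- * \code
-   CUdeviceptr dptr;
-   cuMemAlloc(&dptr, n * sizeof(float));
-   cuMemcpyHtoD(dptr, hostData, n * sizeof(float));
-   /* ... launch kernels that read or write dptr ... */
-   cuMemcpyDtoH(hostData, dptr, n * sizeof(float));
-   cuMemFree(dptr);
- * \endcode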
- * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc - */ -CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); - -/** - * \brief Allocates pitched device memory - * - * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on - * the device and returns in \p *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. \p ElementSizeBytes - * specifies the size of the largest reads and writes that will be performed - * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced - * memory transactions are not possible on other data sizes). If - * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, - * the kernel will run correctly, but possibly at reduced speed. The pitch - * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the - * allocation. The intended usage of pitch is as a separate parameter of the - * allocation, used to compute addresses within the 2D array. Given the row - * and column of an array element of type \b T, the address is computed as: - * \code - T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; - * \endcode - * - * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with - * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is - * recommended that programmers consider performing pitch allocations using - * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is - * especially true if the application will be performing 2D memory copies - * between different regions of device memory (whether linear memory or CUDA - * arrays). - * - * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed - * to match or exceed the alignment requirement for texture binding with - * ::cuTexRefSetAddress2D(). 
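- *
- * Illustrative sketch (editorial addition, not part of the original header):
- * allocating a pitched 2D buffer of 'width' x 'height' floats and clearing it
- * row by row with a 2D memset. 'width' and 'height' are assumptions; error
- * checking is omitted.
- * \code
-   CUdeviceptr dptr;
-   size_t pitch;
-   cuMemAllocPitch(&dptr, &pitch, width * sizeof(float), height, sizeof(float));
-   cuMemsetD2D8(dptr, pitch, 0, width * sizeof(float), height);
-   /* ... use the buffer; rows start at dptr + row * pitch ... */
-   cuMemFree(dptr);
- * \endcode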
- * - * \param dptr - Returned device pointer - * \param pPitch - Returned pitch of allocation in bytes - * \param WidthInBytes - Requested allocation width in bytes - * \param Height - Requested allocation height in rows - * \param ElementSizeBytes - Size of largest reads/writes for range - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocPitch - */ -CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); - -/** - * \brief Frees device memory - * - * Frees the memory space pointed to by \p dptr, which must have been returned - * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). - * - * \param dptr - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFree - */ -CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); - -/** - * \brief Get information on memory allocations - * - * Returns the base address in \p *pbase and size in \p *psize of the - * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input - * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one - * of them is NULL, it is ignored. 
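A sketch of recovering the containing allocation from an interior device address (assuming a current context; error checking omitted):

\code
    CUdeviceptr dptr = 0, base = 0;
    size_t size = 0;
    cuMemAlloc(&dptr, 4096);
    cuMemGetAddressRange(&base, &size, dptr + 100);  /* any address inside the allocation */
    /* base now equals dptr and size equals 4096 */
    cuMemFree(dptr);
\endcode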
- * - * \param pbase - Returned base address - * \param psize - Returned size of device memory allocation - * \param dptr - Device pointer to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 - */ -CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as ::cuMemcpy(). Since the memory can be accessed directly by - * the device, it can be read or written with much higher bandwidth than - * pageable memory obtained with functions such as ::malloc(). Allocating - * excessive amounts of memory with ::cuMemAllocHost() may degrade system - * performance, since it reduces the amount of memory available to the system - * for paging. As a result, this function is best used sparingly to allocate - * staging areas for data exchange between host and device. - * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * The device pointer that may be used to access this host memory from those - * contexts is always equal to the returned host pointer \p *pp. - * See \ref CUDA_UNIFIED for additional details. 
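A sketch of using a page-locked allocation as a staging buffer for a host-to-device transfer (assuming a current context; error checking omitted):

\code
    void *staging = NULL;
    CUdeviceptr dev = 0;
    cuMemAllocHost(&staging, 1 << 20);    /* pinned host staging buffer */
    cuMemAlloc(&dev, 1 << 20);
    /* ... fill staging on the CPU ... */
    cuMemcpyHtoD(dev, staging, 1 << 20);  /* copy benefits from the pinned source */
    cuMemFree(dev);
    cuMemFreeHost(staging);
\endcode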
- * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocHost - */ -CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); - -/** - * \brief Frees page-locked host memory - * - * Frees the memory space pointed to by \p p, which must have been returned by - * a previous call to ::cuMemAllocHost(). - * - * \param p - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeHost - */ -CUresult CUDAAPI cuMemFreeHost(void *p); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, - * it can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * pinned memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to allocate staging areas for data exchange between - * host and device. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address - * space. 
The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). - * - * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system configurations, but cannot be read efficiently by - * most CPUs. WC memory is a good option for buffers that will be written by - * the CPU and read by the GPU via mapped pinned memory or host->device - * transfers. - * - * All of these flags are orthogonal to one another: a developer may allocate - * memory that is portable, mapped and/or write-combined with no restrictions. - * - * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. - * - * The memory allocated by this function must be freed with ::cuMemFreeHost(). - * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer - * that may be used to access this host memory from those contexts is always equal - * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED - * is specified, then the function ::cuMemHostGetDevicePointer() must be used - * to query the device pointer, even if the context supports unified addressing. - * See \ref CUDA_UNIFIED for additional details. - * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * \param Flags - Flags for allocation request - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaHostAlloc - */ -CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); - -/** - * \brief Passes back device pointer of mapped pinned memory - * - * Passes back the device pointer \p pdptr corresponding to the mapped, pinned - * host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP - * flag was not specified at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. 
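A sketch of allocating mapped pinned memory and retrieving the corresponding device pointer (assuming a current context on a device that supports mapped pinned memory; error checking omitted):

\code
    void *host = NULL;
    CUdeviceptr dev = 0;
    cuMemHostAlloc(&host, 4096, CU_MEMHOSTALLOC_DEVICEMAP);
    cuMemHostGetDevicePointer(&dev, host, 0);  /* Flags must be 0 */
    /* ... kernels may now access the buffer through dev ... */
    cuMemFreeHost(host);
\endcode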
- * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. - * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not - * match the original host pointer \p p and depends on the devices visible to the - * application. If all devices visible to the application have a non-zero value for the - * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() - * will match the original pointer \p p. If any device visible to the application - * has a zero value for the device attribute, the device pointer returned by - * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, - * but it will be suitable for use on all devices provided Unified Virtual Addressing - * is enabled. In such systems, it is valid to access the memory using either pointer - * on devices that have a non-zero value for the device attribute. Note however that - * such devices should access the memory using only one of the two pointers and not both. - * - * \p Flags provides for future releases. For now, it must be set to 0. - * - * \param pdptr - Returned device pointer - * \param p - Host pointer - * \param Flags - Options (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaHostGetDevicePointer - */ -CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); - -/** - * \brief Passes back flags that were used for a pinned allocation - * - * Passes back the flags \p pFlags that were specified when allocating - * the pinned host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetFlags() will fail if the pointer does not reside in - * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). - * - * \param pFlags - Returned flags word - * \param p - Host pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuMemAllocHost, - * ::cuMemHostAlloc, - * ::cudaHostGetFlags - */ -CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); - -/** - * \brief Allocates memory that will be automatically managed by the Unified Memory system - * - * Allocates \p bytesize bytes of managed memory on the device and returns in - * \p *dptr a pointer to the allocated memory. If the device doesn't support - * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. 
Support
- * for managed memory can be queried using the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
- * is valid on the CPU and on all GPUs in the system that support managed memory.
- * All accesses to this pointer must obey the Unified Memory programming model.
- *
- * \p flags specifies the default stream association for this allocation.
- * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
- * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
- * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
- * allocation should not be accessed from devices that have a zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
- * ::cuStreamAttachMemAsync will be required to enable access on such devices.
- *
- * If the association is later changed via ::cuStreamAttachMemAsync to
- * a single stream, the default association as specified during ::cuMemAllocManaged
- * is restored when that stream is destroyed. For __managed__ variables, the
- * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
- * stream is an asynchronous operation, and as a result, the change to default
- * association won't happen until all work in the stream has completed.
- *
- * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
- *
- * Device memory oversubscription is possible for GPUs that have a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
- * such GPUs may be evicted from device memory to host memory at any time by the Unified
- * Memory driver in order to make room for other allocations.
- *
- * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
- * API returns and instead may be populated on access. In such systems, managed memory can
- * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
- * maintain data locality and prevent excessive page faults to the extent possible. The application
- * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
- * can also explicitly migrate memory to a desired processor's memory via
- * ::cuMemPrefetchAsync.
- *
- * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
- * with each other, the physical storage for managed memory is created on the GPU which is active
- * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
- * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
- * memory among such GPUs.
- *
- * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
- * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
- * is zero for at least one of those GPUs, the location chosen for physical storage of managed
- * memory is system-dependent.
- * - On Linux, the location chosen will be device memory as long as the current set of active
- * contexts are on devices that either have peer-to-peer support with each other or have a
- * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- * If there is an active context on a GPU that does not have a non-zero value for that device
- * attribute and it does not have peer-to-peer support with the other devices that have active
- * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
- * Note that this means that managed memory that is located in device memory is migrated to
- * host memory if a new context is created on a GPU that doesn't have a non-zero value for
- * the device attribute and does not support peer-to-peer with at least one of the other devices
- * that has an active context. This in turn implies that context creation may fail if there is
- * insufficient host memory to migrate all managed allocations.
- * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
- * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
- * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
- * restrict CUDA to only use those GPUs that have peer-to-peer support.
- * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
- * non-zero value to force the driver to always use device memory for physical storage.
- * When this environment variable is set to a non-zero value, all contexts created in
- * that process on devices that support managed memory have to be peer-to-peer compatible
- * with each other. Context creation will fail if a context is created on a device that
- * supports managed memory and is not peer-to-peer compatible with any of the other
- * managed memory supporting devices on which contexts were previously created, even if
- * those contexts have been destroyed. These environment variables are described
- * in the CUDA programming guide under the "CUDA environment variables" section.
- * - On ARM, managed memory is not available on discrete GPUs with Drive PX-2.
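A sketch of a globally attached managed allocation that is touched first from the host (assuming a current context on a device reporting ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY; the cast through uintptr_t and the omitted error checking are editorial simplifications):

\code
    CUdeviceptr managed = 0;
    cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL);
    float *p = (float *)(uintptr_t)managed;  /* the same address is valid on the CPU */
    p[0] = 1.0f;                             /* populate from the host */
    /* ... launch kernels that read and write through the same pointer ... */
    cuMemFree(managed);
\endcode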
- * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, - * ::cudaMallocManaged - */ -CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given a PCI bus ID string. - * - * \param dev - Returned device handle - * - * \param pciBusId - String in one of the following forms: - * [domain]:[bus]:[device].[function] - * [domain]:[bus]:[device] - * [bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetPCIBusId, - * ::cudaDeviceGetByPCIBusId - */ -CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); - -/** - * \brief Returns a PCI Bus Id string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p pciBusId. \p len specifies the maximum length of the - * string that may be returned. - * - * \param pciBusId - Returned identifier string for the device in the following format - * [domain]:[bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. - * pciBusId should be large enough to store 13 characters including the NULL-terminator. - * - * \param len - Maximum length of string to store in \p name - * - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetByPCIBusId, - * ::cudaDeviceGetPCIBusId - */ -CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); - -/** - * \brief Gets an interprocess handle for a previously allocated event - * - * Takes as input a previously allocated event. This event must have been - * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING - * flags set. 
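A sketch of the PCI bus ID round trip provided by ::cuDeviceGetPCIBusId and ::cuDeviceGetByPCIBusId above (error checking omitted):

\code
    CUdevice dev = 0, sameDev = 0;
    char busId[16];                                        /* >= 13 bytes including the NUL */
    cuDeviceGet(&dev, 0);
    cuDeviceGetPCIBusId(busId, (int)sizeof(busId), dev);   /* e.g. "0000:65:00.0" */
    cuDeviceGetByPCIBusId(&sameDev, busId);                /* resolves back to the same device */
\endcode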
This opaque handle may be copied into other processes and - * opened with ::cuIpcOpenEventHandle to allow efficient hardware - * synchronization between GPU work in different processes. - * - * After the event has been opened in the importing process, - * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and - * ::cuEventQuery may be used in either process. Performing operations - * on the imported event after the exported event has been freed - * with ::cuEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to a user allocated CUipcEventHandle - * in which to return the opaque event handle - * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and - * ::CU_EVENT_DISABLE_TIMING flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetEventHandle - */ -CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); - -/** - * \brief Opens an interprocess event handle for use in the current process - * - * Opens an interprocess event handle exported from another process with - * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like - * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. - * This event must be freed with ::cuEventDestroy. - * - * Performing operations on the imported event after the exported event has - * been freed with ::cuEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param phEvent - Returns the imported event - * \param handle - Interprocess handle to open - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcGetEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcOpenEventHandle - */ -CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); - -/** - * \brief Gets an interprocess memory handle for an existing device memory - * allocation - * - * Takes a pointer to the base of an existing device memory allocation created - * with ::cuMemAlloc and exports it for use in another process. This is a - * lightweight operation and may be called multiple times on an allocation - * without adverse effects. - * - * If a region of memory is freed with ::cuMemFree and a subsequent call - * to ::cuMemAlloc returns memory with the same device address, - * ::cuIpcGetMemHandle will return a unique handle for the - * new memory. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. 
- * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return - * the handle in. - * \param dptr - Base pointer to previously allocated device memory - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetMemHandle - */ -CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); - -/** - * \brief Opens an interprocess memory handle exported from another process - * and returns a device pointer usable in the local process. - * - * Maps memory exported from another process with ::cuIpcGetMemHandle into - * the current device address space. For contexts on different devices - * ::cuIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is - * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. - * ::cuDeviceCanAccessPeer can determine if a mapping is possible. - * - * Contexts that may open ::CUipcMemHandles are restricted in the following way. - * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened - * by one ::CUcontext per ::CUdevice per other process. - * - * If the memory handle has already been opened by the current context, the - * reference count on the handle is incremented by 1 and the existing device pointer - * is returned. - * - * Memory returned from ::cuIpcOpenMemHandle must be freed with - * ::cuIpcCloseMemHandle. - * - * Calling ::cuMemFree on an exported memory region before calling - * ::cuIpcCloseMemHandle in the importing context will result in undefined - * behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pdptr - Returned device pointer - * \param handle - ::CUipcMemHandle to open - * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \note No guarantees are made about the address returned in \p *pdptr. - * In particular, multiple processes may not receive the same address for the same \p handle. - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcCloseMemHandle, - * ::cuCtxEnablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaIpcOpenMemHandle - */ -CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - -/** - * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle - * - * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. - * When the reference count reaches 0, this API unmaps the memory. The original allocation - * in the exporting process as well as imported mappings in other processes - * will be unaffected. - * - * Any resources used to enable peer access will be freed if this is the - * last mapping using them. 
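A sketch of the two sides of a memory-handle exchange using the IPC functions above; how the handle bytes travel between the processes (pipe, socket, shared file, ...) is outside the driver API and only indicated by comments:

\code
    /* Exporting process: */
    CUdeviceptr src = 0;
    CUipcMemHandle handle;
    cuMemAlloc(&src, 1 << 20);
    cuIpcGetMemHandle(&handle, src);
    /* ... send the bytes of handle to the importing process ... */

    /* Importing process (after receiving handle): */
    CUdeviceptr mapped = 0;
    cuIpcOpenMemHandle(&mapped, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    /* ... use mapped ... */
    cuIpcCloseMemHandle(mapped);
\endcode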
- * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cudaIpcCloseMemHandle - */ -CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); - -/** - * \brief Registers an existing host memory range for use by CUDA - * - * Page-locks the memory range specified by \p p and \p bytesize and maps it - * for the device(s) as specified by \p Flags. This memory range also is added - * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher bandwidth - * than pageable memory that has not been registered. Page-locking excessive - * amounts of memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to register staging areas for data exchange between - * host and device. - * - * This function has limited support on Mac OS X. OS 10.7 or higher is required. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address - * space. The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). - * - * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some - * I/O memory space, e.g. the PCI Express resource of a 3rd party device. - * - * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory - * that is considered read-only by the device. On platforms without - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is - * required in order to register memory mapped to the CPU as read-only. Support - * for the use of this flag can be queried from the device attribute - * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with - * a current context associated with a device that does not have this attribute - * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. - * - * All of these flags are orthogonal to one another: a developer may page-lock - * memory that is portable or mapped with no restrictions. - * - * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. - * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. 
- * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
- * match the original host pointer \p p and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
- * will match the original pointer \p p. If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only one of the two pointers and not both.
- *
- * The memory page-locked by this function must be unregistered with
- * ::cuMemHostUnregister().
- *
- * \param p - Host pointer to memory to page-lock
- * \param bytesize - Size in bytes of the address range to page-lock
- * \param Flags - Flags for allocation request
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- * ::CUDA_ERROR_NOT_PERMITTED,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa
- * ::cuMemHostUnregister,
- * ::cuMemHostGetFlags,
- * ::cuMemHostGetDevicePointer,
- * ::cudaHostRegister
- */
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-
-/**
- * \brief Unregisters a memory range that was registered with cuMemHostRegister.
- *
- * Unmaps the memory range whose base address is specified by \p p, and makes
- * it pageable again.
- *
- * The base address must be the same one specified to ::cuMemHostRegister().
- *
- * \param p - Host pointer to memory to unregister
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
- * \notefnerr
- *
- * \sa
- * ::cuMemHostRegister,
- * ::cudaHostUnregister
- */
-CUresult CUDAAPI cuMemHostUnregister(void *p);
-
-/**
- * \brief Copies memory
- *
- * Copies data between two pointers.
- * \p dst and \p src are base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- * Note that this function infers the type of the transfer (host to host, host to
- * device, device to device, or device to host) from the pointer values. This
- * function is only allowed in contexts which support unified addressing.
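Relating to ::cuMemHostRegister and ::cuMemHostUnregister above, a sketch of temporarily pinning an ordinary malloc'ed buffer (assuming a current context; error checking omitted):

\code
    size_t bytes = 1 << 20;
    void *buf = malloc(bytes);                /* ordinary pageable allocation */
    cuMemHostRegister(buf, bytes, CU_MEMHOSTREGISTER_DEVICEMAP);
    /* ... transfers involving buf are now accelerated like pinned memory ... */
    cuMemHostUnregister(buf);
    free(buf);
\endcode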
- * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - -/** - * \brief Copies device memory between two contexts - * - * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeer - */ -CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol - */ -CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
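A sketch of the common ::cuMemcpyHtoD / ::cuMemcpyDtoH round trip (assuming a current context; error checking omitted):

\code
    float host_in[256] = {0}, host_out[256];
    CUdeviceptr dev = 0;
    cuMemAlloc(&dev, sizeof(host_in));
    cuMemcpyHtoD(dev, host_in, sizeof(host_in));    /* host -> device */
    /* ... launch a kernel that works on dev ... */
    cuMemcpyDtoH(host_out, dev, sizeof(host_out));  /* device -> host */
    cuMemFree(dev);
\endcode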
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Array - * - * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting index of the destination data. - * \p srcDevice specifies the base pointer of the source. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Array to Device - * - * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the CUDA - * array elements. \p srcArray and \p srcOffset specify the CUDA array handle - * and the offset in bytes into the array where the copy is to begin. - * \p ByteCount specifies the number of bytes to copy and must be evenly - * divisible by the array element size. 
- * - * \param dstDevice - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the destination - * data. \p pSrc specifies the base address of the source. \p ByteCount specifies - * the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. 
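A sketch of the 1D-array copies above, using ::cuArrayCreate to build a 256-element float array (assuming a current context; error checking omitted):

\code
    CUDA_ARRAY_DESCRIPTOR desc = {0};
    CUarray arr = NULL;
    float data[256] = {0};
    desc.Width = 256;
    desc.Height = 0;                             /* 0 height: a 1D CUDA array */
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    cuArrayCreate(&arr, &desc);
    cuMemcpyHtoA(arr, 0, data, sizeof(data));    /* host -> array */
    cuMemcpyAtoH(data, arr, 0, sizeof(data));    /* array -> host */
    cuArrayDestroy(arr);
\endcode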
- * - * \param dstHost - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Array to Array - * - * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray - * specify the handles of the destination and source CUDA arrays for the copy, - * respectively. \p dstOffset and \p srcOffset specify the destination and - * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of - * bytes to be copied. The size of the elements in the CUDA arrays need not be - * the same format, but the elements must be the same size; and count must be - * evenly divisible by that size. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyArrayToArray - */ -CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. 
- * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
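/*
 * Illustrative sketch: copying a tightly packed host image into a pitched
 * device allocation with cuMemcpy2D. Assumes a current context; `width` and
 * `height` are in 8-bit elements, and all names are placeholders. Zeroing the
 * unused CUDA_MEMCPY2D fields leaves every offset at 0.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_pitched(const unsigned char *host_img,
                               size_t width, size_t height) {
    CUdeviceptr d_img;
    size_t d_pitch;
    CUDA_MEMCPY2D cpy;
    CUresult err;

    /* cuMemAllocPitch returns a pitch that cuMemcpy2D always accepts. */
    err = cuMemAllocPitch(&d_img, &d_pitch, width, height, 4);
    if (err != CUDA_SUCCESS) return err;

    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = host_img;
    cpy.srcPitch      = width;        /* packed rows on the host   */
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_img;
    cpy.dstPitch      = d_pitch;      /* padded rows on the device */
    cpy.WidthInBytes  = width;
    cpy.Height        = height;

    err = cuMemcpy2D(&cpy);
    cuMemFree(d_img);                 /* freed immediately: demo only */
    return err;
}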
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. 
- * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. 
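/*
 * Illustrative sketch: cuMemcpy2DUnaligned takes the same CUDA_MEMCPY2D
 * descriptor as cuMemcpy2D, but also accepts device pitches that did not come
 * from cuMemAllocPitch, at a potential performance cost. The device pointers
 * and pitches below are assumed to exist already.
 */
#include <cuda.h>
#include <string.h>

static CUresult copy_subrect_unaligned(CUdeviceptr dst, size_t dst_pitch,
                                       CUdeviceptr src, size_t src_pitch,
                                       size_t width_bytes, size_t height) {
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.srcDevice     = src;
    cpy.srcPitch      = src_pitch;   /* arbitrary, possibly unaligned pitch */
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = dst;
    cpy.dstPitch      = dst_pitch;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    return cuMemcpy2DUnaligned(&cpy);
}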
The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. 
- * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy3D - */ -CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); - -/** - * \brief Copies memory between contexts - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeer - */ -CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - -/** - * \brief Copies memory asynchronously - * - * Copies data between two pointers. 
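/*
 * Illustrative sketch: copying a packed host volume into linear device memory
 * with cuMemcpy3D. srcPitch/srcHeight describe the host layout and dstPitch/
 * dstHeight the device layout; zero-initializing the struct keeps srcLOD and
 * dstLOD at 0 as required. All parameter names are placeholders.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_volume(CUdeviceptr d_vol, size_t d_pitch,
                              const unsigned char *h_vol,
                              size_t width_bytes, size_t height, size_t depth) {
    CUDA_MEMCPY3D cpy;
    memset(&cpy, 0, sizeof(cpy));        /* offsets and LODs stay 0 */

    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = h_vol;
    cpy.srcPitch      = width_bytes;     /* packed rows    */
    cpy.srcHeight     = height;          /* rows per slice */

    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_vol;
    cpy.dstPitch      = d_pitch;
    cpy.dstHeight     = height;

    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    cpy.Depth         = depth;
    return cuMemcpy3D(&cpy);
}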
- * \p dst and \p src are base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. - * Note that this function infers the type of the transfer (host to host, host to - * device, device to device, or device to host) from the pointer values. This - * function is only allowed in contexts which support unified addressing. - * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies device memory between two contexts asynchronously. - * - * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeerAsync - */ -CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
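/*
 * Illustrative sketch: cuMemcpyAsync infers the copy direction from its two
 * unified-virtual-address pointers, so a host pointer is simply cast to
 * CUdeviceptr. Assumes unified addressing is supported in the current context
 * and that `h_src` is page-locked so the copy can actually overlap; the caller
 * must synchronize `stream` before reusing the buffers.
 */
#include <cuda.h>
#include <stdint.h>

static CUresult async_upload(CUdeviceptr d_dst, const void *h_src,
                             size_t bytes, CUstream stream) {
    /* Under UVA, host and device pointers share one address space. */
    return cuMemcpyAsync(d_dst, (CUdeviceptr)(uintptr_t)h_src, bytes, stream);
}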
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync - */ -CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. - * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
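/*
 * Illustrative sketch: a pinned staging buffer (cuMemAllocHost) combined with
 * cuMemcpyHtoDAsync/cuMemcpyDtoHAsync on one stream. The copies are only
 * guaranteed complete after cuStreamSynchronize. Assumes a current context and
 * an existing device buffer `d_buf` of at least `bytes` bytes.
 */
#include <cuda.h>
#include <string.h>

static CUresult staged_roundtrip(CUdeviceptr d_buf, size_t bytes) {
    void *h_staging = NULL;
    CUstream stream;
    CUresult err;

    err = cuMemAllocHost(&h_staging, bytes);      /* page-locked host memory */
    if (err != CUDA_SUCCESS) return err;
    err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
    if (err != CUDA_SUCCESS) { cuMemFreeHost(h_staging); return err; }

    memset(h_staging, 0xAB, bytes);               /* something to upload */
    err = cuMemcpyHtoDAsync(d_buf, h_staging, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuMemcpyDtoHAsync(h_staging, d_buf, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuStreamSynchronize(stream);        /* h_staging valid after this */

    cuStreamDestroy(stream);
    cuMemFreeHost(h_staging);
    return err;
}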
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the - * destination data. \p srcHost specifies the base address of the source. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyToArrayAsync - */ -CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. 
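/*
 * Illustrative sketch: queueing a device-to-device copy with cuMemcpyDtoDAsync.
 * Both buffers are assumed to be allocated in the current context; the copy is
 * ordered after work already enqueued in `stream`.
 */
#include <cuda.h>

static CUresult snapshot_on_stream(CUdeviceptr d_dst, CUdeviceptr d_src,
                                   size_t bytes, CUstream stream) {
    /* Returns immediately; synchronize the stream before reading d_dst
     * from the host. */
    return cuMemcpyDtoDAsync(d_dst, d_src, bytes, stream);
}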
\p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstHost - Destination pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyFromArrayAsync - */ -CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. - * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. 
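/*
 * Illustrative sketch: asynchronous traffic to and from a 1D CUDA array with
 * cuMemcpyHtoAAsync/cuMemcpyAtoHAsync. For the copies to overlap, `h_in` and
 * `h_out` should be page-locked (for example, from cuMemAllocHost). The array
 * and stream are assumed to exist already; offsets are in bytes.
 */
#include <cuda.h>

static CUresult array_roundtrip_async(CUarray arr, const void *h_in,
                                      void *h_out, size_t bytes,
                                      CUstream stream) {
    CUresult err = cuMemcpyHtoAAsync(arr, 0, h_in, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuMemcpyAtoHAsync(h_out, arr, 0, bytes, stream);
    if (err == CUDA_SUCCESS)
        err = cuStreamSynchronize(stream);   /* h_out is valid after this */
    return err;
}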
::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy2DAsync, - * ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpy2DFromArrayAsync - */ -CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. 
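/*
 * Illustrative sketch: the asynchronous variant of the pitched upload shown
 * for cuMemcpy2D, issued on a stream. The host rows should be page-locked for
 * the transfer to overlap with other work; all names are placeholders.
 */
#include <cuda.h>
#include <string.h>

static CUresult upload_pitched_async(CUdeviceptr d_img, size_t d_pitch,
                                     const unsigned char *h_img,
                                     size_t width_bytes, size_t height,
                                     CUstream stream) {
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = h_img;
    cpy.srcPitch      = width_bytes;
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_img;
    cpy.dstPitch      = d_pitch;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    return cuMemcpy2DAsync(&cpy, stream);
}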
- * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy3DAsync - */ -CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - -/** - * \brief Copies memory between contexts asynchronously. - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. - * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeerAsync - */ -CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. 
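/*
 * Illustrative sketch: streaming a linear device volume into a 3D CUDA array
 * with cuMemcpy3DAsync. When the destination is an array, dstPitch and
 * dstHeight are ignored and only dstArray is consulted. The array is assumed
 * to have been created elsewhere (for example with cuArray3DCreate) with
 * matching extents.
 */
#include <cuda.h>
#include <string.h>

static CUresult device_to_array_async(CUarray dst_arr,
                                      CUdeviceptr d_src, size_t src_pitch,
                                      size_t width_bytes, size_t height,
                                      size_t depth, CUstream stream) {
    CUDA_MEMCPY3D cpy;
    memset(&cpy, 0, sizeof(cpy));          /* offsets and LODs must be 0 */
    cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.srcDevice     = d_src;
    cpy.srcPitch      = src_pitch;
    cpy.srcHeight     = height;
    cpy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cpy.dstArray      = dst_arr;
    cpy.WidthInBytes  = width_bytes;
    cpy.Height        = height;
    cpy.Depth         = depth;
    return cuMemcpy3DAsync(&cpy, stream);
}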
- * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. - * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
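/*
 * Illustrative sketch: blocking memsets on linear device memory. Note that N
 * counts elements, not bytes, and that cuMemsetD16 requires a 2-byte-aligned
 * pointer. The buffer is freed immediately because this is only a demo.
 */
#include <cuda.h>

static CUresult memset_demo(size_t count) {
    CUdeviceptr d_buf;
    CUresult err = cuMemAlloc(&d_buf, count * sizeof(unsigned short));
    if (err != CUDA_SUCCESS) return err;

    err = cuMemsetD8(d_buf, 0x00, count * sizeof(unsigned short)); /* N bytes */
    if (err == CUDA_SUCCESS)
        err = cuMemsetD16(d_buf, 0xFFFF, count);   /* N 16-bit elements */

    cuMemFree(d_buf);
    return err;
}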
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
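/*
 * Illustrative sketch: clearing a pitched 8-bit image with cuMemsetD2D8.
 * Width is in elements (which equal bytes for 8-bit data) and Height in rows;
 * using the pitch returned by cuMemAllocPitch keeps the memset on its fast
 * path. Names are placeholders and error handling is minimal.
 */
#include <cuda.h>

static CUresult clear_image(size_t width, size_t height) {
    CUdeviceptr d_img;
    size_t pitch;
    CUresult err = cuMemAllocPitch(&d_img, &pitch, width, height, 4);
    if (err != CUDA_SUCCESS) return err;

    err = cuMemsetD2D8(d_img, pitch, 0, width, height);

    cuMemFree(d_img);
    return err;
}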
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. 
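/*
 * Illustrative sketch: cuMemsetD2D32 writes a 32-bit pattern, so it can also
 * initialize a pitched float image to a non-zero value by passing the IEEE-754
 * bit pattern (0x3F800000 is 1.0f). Width counts 32-bit elements per row, and
 * both the base pointer and the pitch must be 4-byte aligned. The pitched
 * allocation is assumed to exist already.
 */
#include <cuda.h>

static CUresult fill_ones(CUdeviceptr d_img, size_t pitch_bytes,
                          size_t width_elems, size_t height) {
    return cuMemsetD2D32(d_img, pitch_bytes, 0x3F800000u, width_elems, height);
}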
- * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. - * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
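/*
 * Illustrative sketch: an asynchronous clear queued ahead of later work on the
 * same stream with cuMemsetD8Async. Like the asynchronous copies, it may still
 * be executing when the call returns, so host code must synchronize the stream
 * before depending on the result.
 */
#include <cuda.h>

static CUresult clear_async(CUdeviceptr d_buf, size_t bytes, CUstream stream) {
    return cuMemsetD8Async(d_buf, 0, bytes, stream);
}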
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. 
This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). - * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
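- *
- * A minimal sketch of a pitched fill (assuming a current context and a stream
- * \p hStream created by the caller; error checking omitted):
- * \code
-     CUdeviceptr dptr;
-     size_t pitch;                     // in bytes, chosen by the driver
-     size_t width = 640, height = 480; // width in elements
-     cuMemAllocPitch(&dptr, &pitch, width * sizeof(unsigned int), height, sizeof(unsigned int));
-     cuMemsetD2D32Async(dptr, pitch, 0, width, height, hStream);  // zero the whole 2D region
- * \endcode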
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Creates a 1D or 2D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. - * The ::CUDA_ARRAY_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - CUarray_format Format; - unsigned int NumChannels; - } CUDA_ARRAY_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, and \p Height are the width, and height of the CUDA array (in - * elements); the CUDA array is one-dimensional if height is 0, two-dimensional - * otherwise; - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 1; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - * \endcode - * - * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit - * float16's: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - * \endcode - * - * Description for a \p width x \p height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * \code - 
CUDA_ARRAY_DESCRIPTOR arrayDesc; - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - desc.NumChannels = 2; - desc.Width = width; - desc.Height = height; - * \endcode - * - * \param pHandle - Returned array - * \param pAllocateArray - Array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocArray - */ -CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 1D or 2D CUDA array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. - * - * \param pArrayDescriptor - Returned array descriptor - * \param hArray - Array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Returns the layout properties of a sparse CUDA array - * - * Returns the layout properties of a sparse CUDA array in \p sparseProperties - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. - * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. - * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. 
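- *
- * For reference, a minimal ::cuArrayCreate sketch (assuming a current context;
- * error checking omitted) that creates a 64 x 64 array of floats and reads its
- * descriptor back:
- * \code
-     CUDA_ARRAY_DESCRIPTOR desc;
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 1;
-     desc.Width       = 64;
-     desc.Height      = 64;
-     CUarray array;
-     cuArrayCreate(&array, &desc);
-     CUDA_ARRAY_DESCRIPTOR queried;
-     cuArrayGetDescriptor(&queried, array);   // queried.Width == 64, queried.Height == 64
-     cuArrayDestroy(array);
- * \endcode
- *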
For CUDA arrays obtained - * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties - * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] array - CUDA array to get the sparse properties of - * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); - -/** - * \brief Returns the layout properties of a sparse CUDA mipmapped array - * - * Returns the sparse array layout properties in \p sparseProperties - * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the - * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth - * is less than that of the tile. - * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. - * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. - * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of - * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); - - -/** - * \brief Returns the memory requirements of a CUDA array - * - * Returns the memory requirements of a CUDA array in \p memoryRequirements - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA array. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] array - CUDA array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device); - -/** - * \brief Returns the memory requirements of a CUDA mipmapped array - * - * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements - * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. 
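- *
- * A minimal query sketch for the sparse-property case described above (here
- * \p sparseArray is assumed to be a CUDA array created with the ::CUDA_ARRAY3D_SPARSE flag):
- * \code
-     CUDA_ARRAY_SPARSE_PROPERTIES props;
-     if (cuArrayGetSparseProperties(&props, sparseArray) == CUDA_SUCCESS) {
-         // props.tileExtent holds the tile dimensions and props.miptailSize the mip tail size.
-     }
- * \endcode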
- * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA mipmapped array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA mipmapped - * array. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); - - -/** - * \brief Gets a CUDA array plane from a CUDA array - * - * Returns in \p pPlaneArray a CUDA array that represents a single format plane - * of the CUDA array \p hArray. - * - * If \p planeIdx is greater than the maximum number of planes in this array or if the array does - * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns - * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width - * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * - * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx - * \param hArray - Multiplanar CUDA array - * \param planeIdx - Plane index - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuArrayCreate, - * ::cudaGetArrayPlane - */ -CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); - -/** - * \brief Destroys a CUDA array - * - * Destroys the CUDA array \p hArray. - * - * \param hArray - Array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeArray - */ -CUresult CUDAAPI cuArrayDestroy(CUarray hArray); - -/** - * \brief Creates a 3D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
- * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D array is allocated if only \p Depth extent is zero. - * - A 3D array is allocated if all three extents are non-zero. - * - A 1D layered CUDA array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. - * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array - * to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. 
Note that for brevity's sake, the full name of the device attribute
- * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
- *
- * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
- * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
- * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
- *
- * For each CUDA array type, the extents below are given as
- * {(width range in elements), (height range), (depth range)}. The first entry lists the valid
- * extents that must always be met; the second lists the valid extents with ::CUDA_ARRAY3D_SURFACE_LDST set.
- *
- * 1D:
- *   always:            { (1,TEXTURE1D_WIDTH), 0, 0 }
- *   with SURFACE_LDST: { (1,SURFACE1D_WIDTH), 0, 0 }
- * 2D:
- *   always:            { (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }
- *   with SURFACE_LDST: { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
- * 3D:
- *   always:            { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- *                      OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
- *   with SURFACE_LDST: { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
- * 1D Layered:
- *   always:            { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
- * 2D Layered:
- *   always:            { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
- * Cubemap:
- *   always:            { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
- * Cubemap Layered:
- *   always:            { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
- * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 0; - desc.Depth = 0; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - desc.Depth = 0; - * \endcode - * - * Description for a \p width x \p height x \p depth CUDA array of 64-bit, - * 4x16-bit float16's: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - desc.Depth = depth; - * \endcode - * - * \param pHandle - Returned array - * \param pAllocateArray - 3D array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc3DArray - */ -CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 3D CUDA array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the \p Height - * and/or \p Depth members of the descriptor struct will be set to 0. 
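- *
- * A minimal sketch (assuming a current context; error checking omitted) that
- * creates a 2D layered array with ::cuArray3DCreate and queries it:
- * \code
-     CUDA_ARRAY3D_DESCRIPTOR desc = {0};
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 1;
-     desc.Width       = 256;
-     desc.Height      = 256;
-     desc.Depth       = 8;                    // number of layers, because of the flag below
-     desc.Flags       = CUDA_ARRAY3D_LAYERED;
-     CUarray layered;
-     cuArray3DCreate(&layered, &desc);
-     CUDA_ARRAY3D_DESCRIPTOR queried;
-     cuArray3DGetDescriptor(&queried, layered);
- * \endcode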
- * - * \param pArrayDescriptor - Returned 3D array descriptor - * \param hArray - 3D array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Creates a CUDA mipmapped array - * - * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. - * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is - * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. - * - * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D mipmapped array is allocated if only \p Depth extent is zero. - * - A 3D mipmapped array is allocated if all three extents are non-zero. - * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. 
The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of - * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to - * bind a mipmap level of the CUDA mipmapped array to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA mipmapped arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
For each CUDA array type, the extents below are given as
- * {(width range in elements), (height range), (depth range)}. The first entry lists the valid
- * extents that must always be met; the second lists the valid extents with ::CUDA_ARRAY3D_SURFACE_LDST set.
- *
- * 1D:
- *   always:            { (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }
- *   with SURFACE_LDST: { (1,SURFACE1D_WIDTH), 0, 0 }
- * 2D:
- *   always:            { (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }
- *   with SURFACE_LDST: { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
- * 3D:
- *   always:            { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- *                      OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
- *   with SURFACE_LDST: { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
- * 1D Layered:
- *   always:            { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
- * 2D Layered:
- *   always:            { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
- * Cubemap:
- *   always:            { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
- * Cubemap Layered:
- *   always:            { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
- *   with SURFACE_LDST: { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
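- *
- * A minimal sketch (assuming a current context; error checking omitted) that
- * allocates a fully mipmapped 1024 x 1024 array and fetches its base level:
- * \code
-     CUDA_ARRAY3D_DESCRIPTOR desc = {0};
-     desc.Format      = CU_AD_FORMAT_FLOAT;
-     desc.NumChannels = 4;
-     desc.Width       = 1024;
-     desc.Height      = 1024;
-     desc.Depth       = 0;                        // plain 2D mipmapped array
-     CUmipmappedArray mipmap;
-     cuMipmappedArrayCreate(&mipmap, &desc, 11);  // 11 == 1 + floor(log2(1024))
-     CUarray level0;
-     cuMipmappedArrayGetLevel(&level0, mipmap, 0);
- * \endcode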
- * - * - * \param pHandle - Returned mipmapped array - * \param pMipmappedArrayDesc - mipmapped array descriptor - * \param numMipmapLevels - Number of mipmap levels - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuMipmappedArrayDestroy, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaMallocMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); - -/** - * \brief Gets a mipmap level of a CUDA mipmapped array - * - * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level - * of the CUDA mipmapped array \p hMipmappedArray. - * - * If \p level is greater than the maximum number of levels in this mipmapped array, - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param pLevelArray - Returned mipmap level CUDA array - * \param hMipmappedArray - CUDA mipmapped array - * \param level - Mipmap level - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayDestroy, - * ::cuArrayCreate, - * ::cudaGetMipmappedArrayLevel - */ -CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); - -/** - * \brief Destroys a CUDA mipmapped array - * - * Destroys the CUDA mipmapped array \p hMipmappedArray. - * - * \param hMipmappedArray - Mipmapped array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaFreeMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); - -/** @} */ /* END CUDA_MEM */ - -/** - * \defgroup CUDA_VA Virtual Memory Management - * - * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the virtual memory management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** -* \brief Allocate an address range reservation. -* -* Reserves a virtual address range based on the given parameters, giving -* the starting address of the range in \p ptr. This API requires a system that -* supports UVA. The size and address parameters must be a multiple of the -* host page size and the alignment must be a power of two or zero for default -* alignment. 
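-*
-* A minimal sketch (error checking omitted; the size is assumed to be a multiple
-* of the host page size):
-* \code
-     CUdeviceptr va = 0;
-     size_t size = 64ULL << 20;
-     cuMemAddressReserve(&va, size, 0, 0, 0);  // default alignment, no fixed address
-     // ... map physical allocations into [va, va + size) with cuMemMap ...
-     cuMemAddressFree(va, size);
-* \endcode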
-* -* \param[out] ptr - Resulting pointer to start of virtual address range allocated -* \param[in] size - Size of the reserved virtual address range requested -* \param[in] alignment - Alignment of the reserved virtual address range requested -* \param[in] addr - Fixed starting address range requested -* \param[in] flags - Currently unused, must be zero -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressFree -*/ -CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); - -/** -* \brief Free an address range reservation. -* -* Frees a virtual address range reserved by cuMemAddressReserve. The size -* must match what was given to memAddressReserve and the ptr given must -* match what was returned from memAddressReserve. -* -* \param[in] ptr - Starting address of the virtual address range to free -* \param[in] size - Size of the virtual address region to free -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); - -/** -* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties -* -* This creates a memory allocation on the target device specified through the -* \p prop strcuture. The created allocation will not have any device or host -* mappings. The generic memory \p handle for the allocation can be -* mapped to the address space of calling process via ::cuMemMap. This handle -* cannot be transmitted directly to other processes (see -* ::cuMemExportToShareableHandle). On Windows, the caller must also pass -* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which -* limits or allows access to this handle for a recepient process (see -* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this -* allocation must be a multiple of the the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM -* flag. -* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then -* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays -* and sparse CUDA mipmapped arrays. -* (see ::cuMemMapArrayAsync). -* -* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. -* \param[in] size - Size of the allocation requested -* \param[in] prop - Properties of the allocation to create. -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); - -/** -* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. 
-* -* Frees the memory that was allocated on a device through cuMemCreate. -* -* The memory allocation will be freed when all outstanding mappings to the memory -* are unmapped and when all outstanding references to the handle (including it's -* shareable counterparts) are also released. The generic memory handle can be -* freed when there are still outstanding mappings made with this handle. Each -* time a recepient process imports a shareable handle, it needs to pair it with -* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle -* the behavior is undefined. -* -* \param[in] handle Value of handle which was returned previously by cuMemCreate. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemCreate -*/ -CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); - -/** -* \brief Maps an allocation handle to a reserved virtual address range. -* -* Maps bytes of memory represented by \p handle starting from byte \p offset to -* \p size to address range [\p addr, \p addr + \p size]. This range must be an -* address reservation previously reserved with ::cuMemAddressReserve, and -* \p offset + \p size must be less than the size of the memory allocation. -* Both \p ptr, \p size, and \p offset must be a multiple of the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. -* -* Please note calling ::cuMemMap does not make the address accessible, -* the caller needs to update accessibility of a contiguous mapped VA -* range by calling ::cuMemSetAccess. -* -* Once a recipient process obtains a shareable memory handle -* from ::cuMemImportFromShareableHandle, the process must -* use ::cuMemMap to map the memory into its address ranges before -* setting accessibility with ::cuMemSetAccess. -* -* ::cuMemMap can only create mappings on VA range reservations -* that are not currently mapped. -* -* \param[in] ptr - Address where memory will be mapped. -* \param[in] size - Size of the memory mapping. -* \param[in] offset - Offset into the memory represented by -* - \p handle from which to start mapping -* - Note: currently must be zero. -* \param[in] handle - Handle to a shareable memory -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); - -/** - * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays - * - * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. - * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. 
- * The structure ::CUarrayMapInfo is defined as follow: - \code - typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; - union { - struct { - unsigned int level; - unsigned int layer; - unsigned int offsetX; - unsigned int offsetY; - unsigned int offsetZ; - unsigned int extentWidth; - unsigned int extentHeight; - unsigned int extentDepth; - } sparseLevel; - struct { - unsigned int layer; - unsigned long long offset; - unsigned long long size; - } miptail; - } subresource; - - CUmemOperationType memOperationType; - - CUmemHandleType memHandleType; - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; - unsigned int deviceBitMask; - unsigned int flags; - unsigned int reserved[2]; - } CUarrayMapInfo; - \endcode - * - * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. - * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then - * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. - * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using - * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - - * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. - * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY - * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. - * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been - * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - - * - * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. - * ::CUarraySparseSubresourceType_enum is defined as: - \code - typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 - } CUarraySparseSubresourceType; - \endcode - * - * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a - * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which - * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by - * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL - * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. - * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY - * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. - * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight - * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. - * These offsets and extents must be aligned to the corresponding tile dimension. 
- * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, - * must be zero. - * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, - * must be zero. - * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth - * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. - * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL - * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in - * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. - * Both, mip tail offset and mip tail size must be aligned to the tile size. - * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags - * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. - * Otherwise, must be zero. - * - - * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING - * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored. - * - - * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: - \code - typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 - } CUmemOperationType; - \endcode - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource - * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. - * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, - * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. - * - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation - * is performed. ::CUarrayMapInfo::memHandle must be NULL. - * - * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. - * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match - * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. - * - * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. 
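- *
- * A minimal map-operation sketch. Here \p sparseArray, \p tilePool (an allocation
- * created with ::CU_MEM_CREATE_USAGE_TILE_POOL), \p tileWidth, \p tileHeight,
- * \p deviceOrdinal and \p hStream are assumed to be provided by the caller:
- \code
-     CUarrayMapInfo info = {0};
-     info.resourceType    = CU_RESOURCE_TYPE_ARRAY;
-     info.resource.array  = sparseArray;
-     info.subresourceType = CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL;
-     info.subresource.sparseLevel.level        = 0;
-     info.subresource.sparseLevel.extentWidth  = tileWidth;   // one tile, taken from the
-     info.subresource.sparseLevel.extentHeight = tileHeight;  // array's sparse properties
-     info.subresource.sparseLevel.extentDepth  = 1;           // 2D array
-     info.memOperationType    = CU_MEM_OPERATION_TYPE_MAP;
-     info.memHandleType       = CU_MEM_HANDLE_TYPE_GENERIC;
-     info.memHandle.memHandle = tilePool;
-     info.offset              = 0;                            // offset into the tile pool
-     info.deviceBitMask       = 1 << deviceOrdinal;           // must match the stream's device
-     cuMemMapArrayAsync(&info, 1, hStream);
- \endcode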
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \param[in] mapInfoList - List of ::CUarrayMapInfo - * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList - * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations - * - * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties - */ -CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - -/** -* \brief Unmap the backing memory of a given address range. -* -* The range must be the entire contiguous address range that was mapped to. In -* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped -* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed -* if there are no existing mappings and there are no unreleased memory handles. -* -* When ::cuMemUnmap returns successfully the address range is converted to an -* address reservation and can be used for a future calls to ::cuMemMap. Any new -* mapping to this virtual address will need to have access granted through -* ::cuMemSetAccess, as all mappings start with no accessibility setup. -* -* \param[in] ptr - Starting address for the virtual address range to unmap -* \param[in] size - Size of the virtual address range to unmap -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemCreate, ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); - -/** -* \brief Set the access flags for each location specified in \p desc for the given virtual address range -* -* Given the virtual address range via \p ptr and \p size, and the locations -* in the array given by \p desc and \p count, set the access flags for the -* target locations. The range must be a fully mapped address range -* containing all allocations created by ::cuMemMap / ::cuMemCreate. 
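-*
-* A minimal end-to-end sketch of the virtual memory workflow (device ordinal 0
-* assumed; error checking omitted):
-* \code
-     CUmemAllocationProp prop = {0};
-     prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
-     prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-     prop.location.id   = 0;                                 // device ordinal
-     size_t gran = 0;
-     cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
-     size_t size = 2 * gran;                                 // multiple of the granularity
-     CUmemGenericAllocationHandle handle;
-     cuMemCreate(&handle, size, &prop, 0);
-     CUdeviceptr va;
-     cuMemAddressReserve(&va, size, 0, 0, 0);
-     cuMemMap(va, size, 0, handle, 0);
-     CUmemAccessDesc access = {0};
-     access.location = prop.location;
-     access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-     cuMemSetAccess(va, size, &access, 1);                   // range is now usable on device 0
-     // ... use the memory ...
-     cuMemUnmap(va, size);
-     cuMemRelease(handle);
-     cuMemAddressFree(va, size);
-* \endcode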
-* -* \param[in] ptr - Starting address for the virtual address range -* \param[in] size - Length of the virtual address range -* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the -* - mapping for each location specified -* \param[in] count - Number of ::CUmemAccessDesc in \p desc -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap -*/ -CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); - -/** -* \brief Get the access \p flags set for the given \p location and \p ptr -* -* \param[out] flags - Flags set for this location -* \param[in] location - Location in which to check the flags for -* \param[in] ptr - Address in which to check the access flags for -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemSetAccess -*/ -CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); - -/** -* \brief Exports an allocation to a requested shareable handle type -* -* Given a CUDA memory handle, create a shareable memory -* allocation handle that can be used to share the memory with other -* processes. The recipient process can convert the shareable handle back into a -* CUDA memory handle using ::cuMemImportFromShareableHandle and map -* it with ::cuMemMap. The implementation of what this handle is and how it -* can be transferred is defined by the requested handle type in \p handleType -* -* Once all shareable handles are closed and the allocation is released, the allocated -* memory referenced will be released back to the OS and uses of the CUDA handle afterward -* will lead to undefined behavior. -* -* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) -* that support importing memory from the shareable type -* -* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type -* \param[in] handle - CUDA handle for the memory allocation -* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) -* \param[in] flags - Reserved, must be zero -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** -* \brief Imports an allocation from a requested shareable handle type. -* -* If the current process cannot support the memory described by this shareable -* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. -* -* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) -* created on devices under an SLI group may not be supported, and thus this API will -* return CUDA_ERROR_NOT_SUPPORTED. -* There is no guarantee that the contents of \p handle will be the same CUDA memory handle -* for the same given OS shareable handle, or the same underlying allocation. -* -* \param[out] handle - CUDA Memory handle for the memory allocation. 
-* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. -* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease -*/ -CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); - -/** -* \brief Calculates either the minimal or recommended granularity -* -* Calculates either the minimal or recommended granularity -* for a given allocation specification and returns it in granularity. This -* granularity can be used as a multiple for alignment, size, or address mapping. -* -* \param[out] granularity Returned granularity. -* \param[in] prop Property for which to determine the granularity for -* \param[in] option Determines which granularity to return -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemMap -*/ -CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); - -/** -* \brief Retrieve the contents of the property structure defining properties for this handle -* -* \param[out] prop - Pointer to a properties structure which will hold the information about this handle -* \param[in] handle - Handle which to perform the query on -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); - -/** -* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. -* -* The handle is guaranteed to be the same handle value used to map the memory. If the address -* requested is not mapped, the function will fail. The returned handle must be released with -* corresponding number of calls to ::cuMemRelease. -* -* \note The address \p addr, can be any address in a range previously mapped -* by ::cuMemMap, and not necessarily the start address. -* -* \param[out] handle CUDA Memory handle for the backing memory allocation. -* \param[in] addr Memory address to query, that has been mapped previously. -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap -*/ -CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); - -/** @} */ /* END CUDA_VA */ - -/** - * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator - * - * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. - * Functions for controlling the behavior of the underlying allocator. - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream ordered memory allocator exposed by the - * low-level CUDA driver application programming interface. 
- * - * @{ - * - * \section CUDA_MALLOC_ASYNC_overview overview - * - * The asynchronous allocator allows the user to allocate and free in stream order. - * All asynchronous accesses of the allocation must happen between - * the stream executions of the allocation and the free. If the memory is accessed - * outside of the promised stream order, a use before allocation / use after free error - * will cause undefined behavior. - * - * The allocator is free to reallocate the memory as long as it can guarantee - * that compliant memory accesses will not overlap temporally. - * The allocator may refer to internal stream ordering as well as inter-stream dependencies - * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. - * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. - * - * \section CUDA_MALLOC_ASYNC_support Supported Platforms - * - * Whether or not a device supports the integrated stream ordered memory allocator - * may be queried by calling ::cuDeviceGetAttribute() with the device attribute - * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED - */ - -/** - * \brief Frees memory with stream ordered semantics - * - * Inserts a free operation into \p hStream. - * The allocation must not be accessed after stream execution reaches the free. - * After this API returns, accessing the memory from any subsequent work launched on the GPU - * or querying its pointer attributes results in undefined behavior. - * - * \note During stream capture, this function results in the creation of a free node and - * must therefore be passed the address of a graph allocation. - * - * \param dptr - memory to free - * \param hStream - The stream establishing the stream ordering contract. - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED - */ -CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - -/** - * \brief Allocates memory with stream ordered semantics - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the memory pool current to the stream's device. - * - * \note The default memory pool of a device contains device memory from that device. - * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. - * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. The memory pool's properties - * are used to set the node's creation parameters. 
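- *
- * A minimal stream-ordered sketch (assuming a caller-created stream \p hStream
- * on a device that reports ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED):
- * \code
-     CUdeviceptr dptr;
-     cuMemAllocAsync(&dptr, 1 << 20, hStream);    // usable by later work in hStream
-     cuMemsetD8Async(dptr, 0, 1 << 20, hStream);  // ordered after the allocation
-     cuMemFreeAsync(dptr, hStream);               // no accesses may be ordered after the free
- * \endcode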
- * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - -/** - * \brief Tries to release memory back to the OS - * - * Releases memory back to the OS until the pool contains fewer than minBytesToKeep - * reserved bytes, or there is no more memory that the allocator can safely release. - * The allocator cannot release OS allocations that back outstanding asynchronous allocations. - * The OS allocations may happen at different granularity from the user allocations. - * - * \note: Allocations that have not been freed count as outstanding. - * \note: Allocations that have been asynchronously freed but whose completion has - * not been observed on the host (eg. by a synchronize) can count as outstanding. - * - * \param[in] pool - The memory pool to trim - * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, - * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have - * at least minBytesToKeep bytes reserved after the operation. - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); - -/** - * \brief Sets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of backing memory that was - * allocated for the memory pool. 
It is illegal to set this attribute to a non-zero value. - * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of used memory that was - * allocated for the memory pool. - * - * \param[in] pool - The memory pool to modify - * \param[in] attr - The attribute to modify - * \param[in] value - Pointer to the value to assign - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Gets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. - * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application. - * - * \param[in] pool - The memory pool to get attributes of - * \param[in] attr - The attribute to get - * \param[out] value - Retrieved value - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Controls visibility of pools between devices - * - * \param[in] pool - The pool being modified - * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. - * \param[in] count - Number of descriptors in the map array. 
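An illustrative sketch of granting a peer device access to another device's default pool with ::cuMemPoolSetAccess as documented above (the function name is an assumption; both CUdevices are assumed valid and peer access supported, error checking omitted):

#include <cuda.h>

/* Sketch: let `peerDev` read and write allocations from `dev`'s default pool. */
void grant_pool_access(CUdevice dev, CUdevice peerDev)
{
    CUmemoryPool pool;
    cuDeviceGetDefaultMemPool(&pool, dev);

    CUmemAccessDesc desc = {0};
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = peerDev;
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemPoolSetAccess(pool, &desc, 1);

    /* The current accessibility can be read back with cuMemPoolGetAccess. */
    CUmemAccess_flags flags;
    CUmemLocation loc = desc.location;
    cuMemPoolGetAccess(&flags, pool, &loc);
}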
- * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); - -/** - * \brief Returns the accessibility of a pool from a device - * - * Returns the accessibility of the pool's memory from the specified location. - * - * \param[out] flags - the accessibility of the pool from the specified location - * \param[in] memPool - the pool being queried - * \param[in] location - the location accessing the pool - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); - -/** - * \brief Creates a memory pool - * - * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines - * the properties of the pool such as the backing device and IPC capabilities. - * - * By default, the pool's memory will be accessible from the device it is allocated on. - * - * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NOT_SUPPORTED - * - * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, - * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle - */ -CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); - -/** - * \brief Destroys the specified memory pool - * - * If any pointers obtained from this pool haven't been freed or - * the pool has free operations that haven't completed - * when ::cuMemPoolDestroy is invoked, the function will return immediately and the - * resources associated with the pool will be released automatically - * once there are no more outstanding allocations. - * - * Destroying the current mempool of a device sets the default mempool of - * that device as the current mempool for that device. - * - * \note A device's default memory pool cannot be destroyed. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); - -/** - * \brief Allocates memory from a specified pool with stream ordered semantics. - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the specified memory pool. - * - * \note - * - The specified memory pool may be from a device different than that of the specified \p hStream. - * - * - Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. - * - * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. 
The memory pool's properties - * are used to set the node's creation parameters. - * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] pool - The pool to allocate from - * \param[in] hStream - The stream establishing the stream ordering semantic - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, - * ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - -/** - * \brief Exports a memory pool to the requested handle type. - * - * Given an IPC capable mempool, create an OS handle to share the pool with another process. - * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. - * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. - * The implementation of what the shareable handle is and how it can be transferred is defined by the requested - * handle type. - * - * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. - * - * \param[out] handle_out - Returned OS handle - * \param[in] pool - pool to export - * \param[in] handleType - the type of handle to create - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, - * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** - * \brief imports a memory pool from a shared handle. - * - * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. - * - * \note Imported memory pools do not support creating new allocations. - * As such imported memory pools may not be used in cuDeviceSetMemPool - * or ::cuMemAllocFromPoolAsync calls. - * - * \param[out] pool_out - Returned memory pool - * \param[in] handle - OS handle of the pool to open - * \param[in] handleType - The type of handle being imported - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolImportFromShareableHandle( - CUmemoryPool *pool_out, - void *handle, - CUmemAllocationHandleType handleType, - unsigned long long flags); - -/** - * \brief Export data to share a memory pool allocation between processes. - * - * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. - * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. 
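Taken together, the pool sharing entry points above compose roughly as follows. This is a hedged sketch only: the transport of the file descriptor and export data between processes (for example over a Unix-domain socket) is outside the CUDA API and only indicated by comments, the function names are illustrative, and error checking is omitted.

#include <cuda.h>
#include <stdint.h>

/* Exporting process: create an IPC-capable pool, allocate from it, and export
 * both the pool and a specific allocation. */
void exporter(CUdevice dev, CUstream stream)
{
    CUmemPoolProps props = {0};
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = dev;

    CUmemoryPool pool;
    cuMemPoolCreate(&pool, &props);

    int fd;  /* POSIX file descriptor form of the shareable handle */
    cuMemPoolExportToShareableHandle(&fd, pool, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);

    CUdeviceptr ptr;
    cuMemAllocFromPoolAsync(&ptr, 1 << 20, pool, stream);

    CUmemPoolPtrExportData shareData;
    cuMemPoolExportPointer(&shareData, ptr);
    /* ... send `fd` and `shareData` to the importing process ... */
}

/* Importing process: rebuild the pool from the received fd, then map the
 * exported allocation into this process. */
void importer(int fd, CUmemPoolPtrExportData *shareData)
{
    CUmemoryPool pool;
    cuMemPoolImportFromShareableHandle(&pool, (void *)(uintptr_t)fd,
                                       CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);

    CUdeviceptr ptr;
    cuMemPoolImportPointer(&ptr, pool, shareData);
    /* ... use ptr; free it in every importer before the exporter frees it ... */
}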
- * The data is not a handle and may be shared through any IPC mechanism. - * - * \param[out] shareData_out - Returned export data - * \param[in] ptr - pointer to memory being exported - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); - -/** - * \brief Import a memory pool allocation from another process. - * - * Returns in \p ptr_out a pointer to the imported memory. - * The imported memory must not be accessed before the allocation operation completes - * in the exporting process. The imported memory must be freed from all importing processes before - * being freed in the exporting process. The pointer may be freed with cuMemFree - * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed - * on the importing process before the free operation on the exporting process. - * - * \note The cuMemFreeAsync api may be used in the exporting process before - * the cuMemFreeAsync operation completes in its stream as long as the - * cuMemFreeAsync in the exporting process specifies a stream with - * a stream dependency on the importing process's cuMemFreeAsync. - * - * \param[out] ptr_out - pointer to imported memory - * \param[in] pool - pool from which to import - * \param[in] shareData - data specifying the memory to import - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer - */ -CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); - -/** @} */ /* END CUDA_MALLOC_ASYNC */ - -/** - * \defgroup CUDA_UNIFIED Unified Addressing - * - * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the unified addressing functions of the - * low-level CUDA driver application programming interface. - * - * @{ - * - * \section CUDA_UNIFIED_overview Overview - * - * CUDA devices can share a unified address space with the host. - * For these devices there is no distinction between a device - * pointer and a host pointer -- the same pointer value may be - * used to access memory from the host program and from a kernel - * running on the device (with exceptions enumerated below). - * - * \section CUDA_UNIFIED_support Supported Platforms - * - * Whether or not a device supports unified addressing may be - * queried by calling ::cuDeviceGetAttribute() with the device - * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. - * - * Unified addressing is automatically enabled in 64-bit processes - * - * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values - * - * It is possible to look up information about the memory which backs a - * pointer value. For instance, one may want to know if a pointer points - * to host or device memory. As another example, in the case of device - * memory, one may want to know on which CUDA device the memory - * resides. 
These properties may be queried using the function - * ::cuPointerGetAttribute() - * - * Since pointers are unique, it is not necessary to specify information - * about the pointers specified to the various copy functions in the - * CUDA API. The function ::cuMemcpy() may be used to perform a copy - * between two pointers, ignoring whether they point to host or device - * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() - * unnecessary for devices supporting unified addressing). For - * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be - * used to specify that the CUDA driver should infer the location of the - * pointer from its value. - * - * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory - * - * All host memory allocated in all contexts using ::cuMemAllocHost() and - * ::cuMemHostAlloc() is always directly accessible from all contexts on - * all devices that support unified addressing. This is the case regardless - * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and - * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. - * - * The pointer value through which allocated host memory may be accessed - * in kernels on all devices that support unified addressing is the same - * as the pointer value through which that memory is accessed on the host, - * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device - * pointer for these allocations. - * - * Note that this is not the case for memory allocated using the flag - * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. - * - * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory - * - * Upon enabling direct access from a context that supports unified addressing - * to another peer context that supports unified addressing using - * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using - * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible - * by the current context. The device pointer value through - * which any peer memory may be accessed in the current context - * is the same pointer value through which that memory may be - * accessed in the peer context. - * - * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing - * - * Not all memory may be accessed on devices through the same pointer - * value through which they are accessed on the host. These exceptions - * are host memory registered using ::cuMemHostRegister() and host memory - * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these - * exceptions, there exists a distinct host and device address for the - * memory. The device address is guaranteed to not overlap any valid host - * pointer range and is guaranteed to have the same value across all - * contexts that support unified addressing. - * - * This device address may be queried using ::cuMemHostGetDevicePointer() - * when a context using unified addressing is current. Either the host - * or the unified device pointer value may be used to refer to this memory - * through ::cuMemcpy() and similar functions using the - * ::CU_MEMORYTYPE_UNIFIED memory type. - * - */ - -/** - * \brief Returns information about a pointer - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT: - * - * Returns in \p *data the ::CUcontext in which \p ptr was allocated or - * registered. - * The type of \p data must be ::CUcontext *. 
- * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: - * - * Returns in \p *data the physical memory type of the memory that - * \p ptr addresses as a ::CUmemorytype enumerated value. - * The type of \p data must be unsigned int. - * - * If \p ptr addresses device memory then \p *data is set to - * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the - * memory resides is the ::CUdevice of the ::CUcontext returned by the - * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. - * - * If \p ptr addresses host memory then \p *data is set to - * ::CU_MEMORYTYPE_HOST. - * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If the current ::CUcontext does not support unified virtual - * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: - * - * Returns in \p *data the device pointer value through which - * \p ptr may be accessed by kernels running in the current - * ::CUcontext. - * The type of \p data must be CUdeviceptr *. - * - * If there exists no device pointer value through which - * kernels running in the current ::CUcontext may access - * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If there is no current ::CUcontext then - * ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: - * - * Returns in \p *data the host pointer value through which - * \p ptr may be accessed by by the host program. - * The type of \p data must be void **. - * If there exists no host pointer value through which - * the host program may directly access \p ptr then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: - * - * Returns in \p *data two tokens for use with the nv-p2p.h Linux - * kernel interface. \p data must be a struct of type - * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. - * - * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). - * Note that p2pToken and vaSpaceToken are only valid for the - * lifetime of the source allocation. A subsequent allocation at - * the same address may return completely different tokens. - * Querying this attribute has a side effect of setting the attribute - * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that - * \p ptr points to. - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute which when set, ensures that synchronous memory operations - * initiated on the region of memory that \p ptr points to will always synchronize. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: - * - * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. - * \p data must point to an unsigned long long. - * - * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. 
- * Every memory allocation from any of the CUDA memory allocation APIs will - * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs - * from previous freed allocations. IDs are only unique within a single process. - * - * - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: - * - * Returns in \p *data a boolean that indicates whether the pointer points to - * managed memory or not. - * - * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: - * - * Returns in \p *data an integer representing a device ordinal of a device against - * which the memory was allocated or registered. - * - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: - * - * Returns in \p *data a boolean that indicates if this pointer maps to - * an allocation that is suitable for ::cudaIpcGetMemHandle. - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: - * - * Returns in \p *data the starting address for the allocation referenced - * by the device pointer \p ptr. Note that this is not necessarily the - * address of the mapped region, but the address of the mappable address - * range \p ptr references (e.g. from ::cuMemAddressReserve). - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: - * - * Returns in \p *data the size for the allocation referenced by the device - * pointer \p ptr. Note that this is not necessarily the size of the mapped - * region, but the size of the mappable address range \p ptr references - * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped - * region, see ::cuMemGetAddressRange - * - * - ::CU_POINTER_ATTRIBUTE_MAPPED: - * - * Returns in \p *data a boolean that indicates if this pointer is in a - * valid address range that is mapped to a backing allocation. - * - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: - * - * Returns a bitmask of the allowed handle types for an allocation that may - * be passed to ::cuMemExportToShareableHandle. - * - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: - * - * Returns in \p *data the handle to the mempool that the allocation was obtained from. - * - * \par - * - * Note that for most allocations in the unified virtual address space - * the host and device pointer for accessing the allocation will be the - * same. The exceptions to this are - * - user memory registered using ::cuMemHostRegister - * - host memory allocated using ::cuMemHostAlloc with the - * ::CU_MEMHOSTALLOC_WRITECOMBINED flag - * For these types of allocation there will exist separate, disjoint host - * and device addresses for accessing the allocation. In particular - * - The host address will correspond to an invalid unmapped device address - * (which will result in an exception if accessed from the device) - * - The device address will correspond to an invalid unmapped host address - * (which will result in an exception if accessed from the host). - * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host - * and device addresses from either address. 
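A short, illustrative sketch of querying a few of the attributes listed above with ::cuPointerGetAttribute (`ptr` is assumed to come from a CUDA allocation API; error checking omitted):

#include <cuda.h>
#include <stdio.h>

/* Sketch: classify a pointer via individual attribute queries. */
void describe_pointer(CUdeviceptr ptr)
{
    unsigned int memType = 0;            /* CU_MEMORYTYPE_DEVICE or CU_MEMORYTYPE_HOST */
    cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);

    int isManaged = 0;                   /* boolean: managed memory or not */
    cuPointerGetAttribute(&isManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, ptr);

    unsigned long long bufferId = 0;     /* unique within the process */
    cuPointerGetAttribute(&bufferId, CU_POINTER_ATTRIBUTE_BUFFER_ID, ptr);

    printf("type=%u managed=%d id=%llu\n", memType, isManaged, bufferId);
}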
- * - * \param data - Returned pointer attribute value - * \param attribute - Pointer attribute to query - * \param ptr - Pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerSetAttribute, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Prefetches memory to the specified destination device - * - * Prefetches memory to the specified destination device. \p devPtr is the - * base device pointer of the memory to be prefetched and \p dstDevice is the - * destination device. \p count specifies the number of bytes to copy. \p hStream - * is the stream in which the operation is enqueued. The memory range must refer - * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. - * - * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If - * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - * must be non-zero. Additionally, \p hStream must be associated with a device that has a - * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * - * The start address and end address of the memory range will be rounded down and rounded up - * respectively to be aligned to CPU page size before the prefetch operation is enqueued - * in the stream. - * - * If no physical memory has been allocated for this region, then this memory region - * will be populated and mapped on the destination device. If there's insufficient - * memory to prefetch the desired region, the Unified Memory driver may evict pages from other - * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory - * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. - * - * By default, any mappings to the previous location of the migrated pages are removed and - * mappings for the new location are only setup on \p dstDevice. The exact behavior however - * also depends on the settings applied to this memory range via ::cuMemAdvise as described - * below: - * - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, - * then that subset will create a read-only copy of the pages on \p dstDevice. - * - * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory - * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the - * preferred location of any pages in the memory range. - * - * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, - * then mappings to those pages from all the appropriate processors are updated to - * refer to the new location if establishing such a mapping is possible. Otherwise, - * those mappings are cleared. - * - * Note that this API is not required for functionality and only serves to improve performance - * by allowing the application to migrate data to a suitable location before it is accessed. - * Memory accesses to this range are always coherent and are allowed even when the data is - * actively being migrated. 
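An illustrative staging sketch combining ::cuMemAdvise and ::cuMemPrefetchAsync as described above (assumes `dev` reports a non-zero ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and `stream` is valid; the function name is illustrative and error checking is omitted):

#include <cuda.h>

/* Sketch: advise and prefetch a managed range before device access,
 * then bring it back to host memory. */
void stage_managed_range(CUdevice dev, CUstream stream, size_t bytes)
{
    CUdeviceptr managed;
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
    cuMemPrefetchAsync(managed, bytes, dev, stream);            /* migrate toward the GPU */
    /* ... launch work in `stream` that reads/writes `managed` ... */
    cuMemPrefetchAsync(managed, bytes, CU_DEVICE_CPU, stream);  /* back to host memory */
    cuStreamSynchronize(stream);

    cuMemFree(managed);
}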
- * - * Note that this function is asynchronous with respect to the host and all work - * on other devices. - * - * \param devPtr - Pointer to be prefetched - * \param count - Size in bytes - * \param dstDevice - Destination device to prefetch to - * \param hStream - Stream to enqueue prefetch operation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, - * ::cudaMemPrefetchAsync - */ -CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - -/** - * \brief Advise about the usage of a given memory range - * - * Advise the Unified Memory subsystem about the usage pattern for the memory range - * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory - * range will be rounded down and rounded up respectively to be aligned to CPU page size before the - * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged - * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable - * memory provided it represents a valid, host-accessible region of memory and all additional constraints - * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable - * memory range results in an error being returned. - * - * The \p advice parameter can take the following values: - * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - * from and only occasionally written to. Any read accesses from any processor to this region will create a - * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync - * is called on this region, it will create a read-only copy of the data on the destination processor. - * If any processor writes to this region, all copies of the corresponding page will be invalidated - * except for the one where the write occurred. The \p device argument is ignored for this advice. - * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU - * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * Also, if a context is created on a device that does not have the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until - * all such contexts are destroyed. - * If the memory region refers to valid system-allocated pageable memory, then the accessing device must - * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only - * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice - * will not create a read-only copy when that device accesses this memory region. - * - * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the - * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated - * copies of the data will be collapsed into a single copy. 
The location for the collapsed - * copy will be the preferred location if the page has a preferred location and one of the read-duplicated - * copies was resident at that location. Otherwise, the location chosen is arbitrary. - * - * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location - * does not cause data to migrate to that location immediately. Instead, it guides the migration policy - * when a fault occurs on that memory region. If the data is already in its preferred location and the - * faulting processor can establish a mapping without requiring the data to be migrated, then - * data migration will be avoided. On the other hand, if the data is not in its preferred location - * or if a direct mapping cannot be established, then it will be migrated to the processor accessing - * it. It is important to note that setting the preferred location does not prevent data prefetching - * done using ::cuMemPrefetchAsync. - * Having a preferred location can override the page thrash detection and resolution logic in the Unified - * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device - * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But - * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice, unless read accesses from - * \p device will not result in a read-only copy being created on that device as outlined in description for - * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. Note however that this behavior may change in the future. - * - * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - * and changes the preferred location to none. - * - * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then - * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. - * This advice does not cause data migration and has no impact on the location of the data per se. Instead, - * it causes the data to always be mapped in the specified processor's page tables, as long as the - * location of the data permits a mapping to be established. If the data gets migrated for any reason, - * the mappings are updated accordingly. - * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. 
- * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data - * over to the other GPUs is not as important because the accesses are infrequent and the overhead of - * migration may be too high. But preventing faults can still help improve performance, and so having - * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the - * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - * page in host memory. - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice. Additionally, if the - * preferred location of this memory region or any subset of it is also \p device, then the policies - * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. - * - * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to - * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. - * - * \param devPtr - Pointer to memory to set the advice for - * \param count - Size in bytes of the memory range - * \param advice - Advice to be applied for the specified memory range - * \param device - Device to apply the advice for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, - * ::cudaMemAdvise - */ -CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); - -/** - * \brief Query an attribute of a given memory range - * - * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. - * - * The \p attribute parameter can take the following values: - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted - * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given - * memory range have read-duplication enabled, or 0 otherwise. - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. 
The result returned will be a GPU device - * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU - * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID - * if either all the pages don't have the same preferred location or some of the pages don't have a - * preferred location at all. Note that the actual location of the pages in the memory range at the time of - * the query may be different from the preferred location. - * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted - * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned - * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. - * If any device does not have that advice set for the entire memory range, that device will not be included. - * If \p data is larger than the number of devices that have that advice set for that memory range, - * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 - * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be - * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have - * that advice set, then only as many devices will be returned as can fit in the array. There is no - * guarantee on which specific devices will be returned, however. - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location - * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be - * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU - * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not - * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the - * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to - * whether the prefetch operation to that location has completed or even begun. - * - * \param data - A pointers to a memory location where the result - * of each attribute query will be written to. - * \param dataSize - Array containing the size of data - * \param attribute - The attribute to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, - * ::cuMemAdvise, - * ::cudaMemRangeGetAttribute - */ -CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); - -/** - * \brief Query attributes of a given memory range. - * - * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes - * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. 
- * The results of the query will be stored in \p data. - * - * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for - * attribute descriptions and restrictions. - * - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION - * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION - * - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. - * \param dataSizes - Array containing the sizes of each result - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param numAttributes - Number of attributes to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, - * ::cuMemPrefetchAsync, - * ::cudaMemRangeGetAttributes - */ -CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); - -/** - * \brief Set attributes on a previously allocated memory region - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute that can either be set (1) or unset (0). When set, - * the region of memory that \p ptr points to is guaranteed to always synchronize - * memory operations that are synchronous. If there are some previously initiated - * synchronous memory operations that are pending when this attribute is set, the - * function does not return until those memory operations are complete. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. - * - * \param value - Pointer to memory containing the value to be set - * \param attribute - Pointer attribute to set - * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuPointerGetAttribute, - * ::cuPointerGetAttributes, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister - */ -CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Returns information about a pointer. 
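A short usage sketch of the ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS attribute described above (illustrative; `ptr` is assumed to come from a CUDA allocation API):

#include <cuda.h>

/* Sketch: force synchronous memory operations on a region to always synchronize. */
void force_sync_memops(CUdeviceptr ptr)
{
    unsigned int enable = 1;  /* 1 = set, 0 = unset */
    cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
}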
- * - * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE - * - ::CU_POINTER_ATTRIBUTE_MAPPED - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE - * - * \param numAttributes - Number of attributes to query - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. - * \param ptr - Pointer to query - * - * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr - * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values - * and CUDA_SUCCESS is returned. - * - * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA - * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerGetAttribute, - * ::cuPointerSetAttribute, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); - -/** @} */ /* END CUDA_UNIFIED */ - -/** - * \defgroup CUDA_STREAM Stream Management - * - * ___MANBRIEF___ stream management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Create a stream - * - * Creates a stream and returns a handle in \p phStream. The \p Flags argument - * determines behaviors of the stream. - * - * Valid values for \p Flags are: - * - ::CU_STREAM_DEFAULT: Default stream creation flag. - * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created - * stream may run concurrently with work in stream 0 (the NULL stream), and that - * the created stream should perform no implicit synchronization with stream 0. 
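An illustrative sketch of creating and using a non-blocking stream with the flags listed above (`dst` is assumed to be a valid device allocation; error checking omitted):

#include <cuda.h>

/* Sketch: create a non-blocking stream, use it, and destroy it. */
void run_in_private_stream(CUdeviceptr dst, size_t bytes)
{
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);  /* no implicit sync with the NULL stream */
    cuMemsetD8Async(dst, 0xff, bytes, stream);
    cuStreamSynchronize(stream);
    cuStreamDestroy(stream);
}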
- * - * \param phStream - Returned newly created stream - * \param Flags - Parameters for stream creation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); - -/** - * \brief Create a stream with the given priority - * - * Creates a stream with the specified priority and returns a handle in \p phStream. - * This API alters the scheduler priority of work in the stream. Work in a higher - * priority stream may preempt work already executing in a low priority stream. - * - * \p priority follows a convention where lower numbers represent higher priorities. - * '0' represents default priority. The range of meaningful numerical priorities can - * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is - * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * it will automatically be clamped to the lowest or the highest number in the range. - * - * \param phStream - Returned newly created stream - * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of - * valid flags - * \param priority - Stream priority. Lower numbers represent higher priorities. - * See ::cuCtxGetStreamPriorityRange for more information about - * meaningful stream priorities that can be passed. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \note Stream priorities are supported only on GPUs - * with compute capability 3.5 or higher. - * - * \note In the current implementation, only compute kernels launched in - * priority streams are affected by the stream's priority. Stream priorities have - * no effect on host-to-device and device-to-host memory operations. - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreateWithPriority - */ -CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); - - -/** - * \brief Query the priority of a given stream - * - * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the priority in \p priority. Note that if the stream was created with a - * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * this function returns the clamped priority. - * See ::cuStreamCreateWithPriority for details about priority clamping. 
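A sketch of the priority workflow described above: query the meaningful range, create a stream at the highest priority, and read the value back (assumes a current context; error checking omitted):

#include <cuda.h>

/* Sketch: highest-priority stream creation. Lower numbers mean higher priority. */
CUstream create_high_priority_stream(void)
{
    int least, greatest;
    cuCtxGetStreamPriorityRange(&least, &greatest);

    CUstream stream;
    cuStreamCreateWithPriority(&stream, CU_STREAM_NON_BLOCKING, greatest);

    int actual;
    cuStreamGetPriority(stream, &actual);  /* reads back the (possibly clamped) value */
    return stream;
}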
- * - * \param hStream - Handle to the stream to be queried - * \param priority - Pointer to a signed integer in which the stream's priority is returned - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamCreateWithPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cudaStreamGetPriority - */ -CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - -/** - * \brief Query the flags of a given stream - * - * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the flags in \p flags. - * - * \param hStream - Handle to the stream to be queried - * \param flags - Pointer to an unsigned integer in which the stream's flags are returned - * The value returned in \p flags is a logical 'OR' of all flags that - * were used while creating this stream. See ::cuStreamCreate for the list - * of valid flags - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cudaStreamGetFlags - */ -CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - -/** - * \brief Query the context associated with a stream - * - * Returns the CUDA context that the stream is associated with. - * - * The stream handle \p hStream can refer to any of the following: - *
- * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
- *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
- *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
- *   The returned context is the context that was active in the calling thread when the
- *   stream was created. Passing an invalid handle will result in undefined behavior.
- * - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
- *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
- *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
- *   Specifying any of the special handles will return the context current to the
- *   calling thread. If no context is current to the calling thread,
- *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
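An illustrative sketch covering both kinds of handles listed above (assumes a current context; error checking omitted):

#include <cuda.h>

/* Sketch: query the context behind an explicitly created stream and behind
 * the legacy default stream. */
void inspect_stream_contexts(void)
{
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_DEFAULT);

    CUcontext ctxOfStream, ctxOfLegacy;
    cuStreamGetCtx(stream, &ctxOfStream);            /* context active when the stream was created */
    cuStreamGetCtx(CU_STREAM_LEGACY, &ctxOfLegacy);  /* context current to the calling thread */

    cuStreamDestroy(stream);
}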
- * - * \param hStream - Handle to the stream to be queried - * \param pctx - Returned context associated with the stream - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - -/** - * \brief Make a compute stream wait on an event - * - * Makes all future work submitted to \p hStream wait for all work captured in - * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. - * The synchronization will be performed efficiently on the device when applicable. - * \p hEvent may be from a different context or device than \p hStream. - * - * flags include: - * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. - * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture. - * - * \param hStream - Stream to wait - * \param hEvent - Event to wait on (may not be NULL) - * \param Flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuEventRecord, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cuStreamDestroy, - * ::cudaStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - -/** - * \brief Add a callback to a compute stream - * - * \note This function is slated for eventual deprecation and removal. If - * you do not require the callback to execute in case of a device error, - * consider using ::cuLaunchHostFunc. Additionally, this function is not - * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike - * ::cuLaunchHostFunc. - * - * Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each - * cuStreamAddCallback call, the callback will be executed exactly once. - * The callback will block later work in the stream until it is finished. - * - * The callback may be passed ::CUDA_SUCCESS or an error code. In the event - * of a device error, all subsequently executed callbacks will receive an - * appropriate ::CUresult. - * - * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API - * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any - * synchronization that may depend on outstanding device work or other callbacks - * that are not mandated to run earlier. Callbacks without a mandated order - * (in independent streams) execute in undefined order and may be serialized. - * - * For the purposes of Unified Memory, callback execution makes a number of - * guarantees: - *
- *
- *  - The callback stream is considered idle for the duration of the
- *    callback. Thus, for example, a callback may always use memory attached
- *    to the callback stream.
- *
- *  - The start of execution of a callback has the same effect as
- *    synchronizing an event recorded in the same stream immediately prior to
- *    the callback. It thus synchronizes streams which have been "joined"
- *    prior to the callback.
- *
- *  - Adding device work to any stream does not have the effect of making
- *    the stream active until all preceding host functions and stream callbacks
- *    have executed. Thus, for example, a callback might use global attached
- *    memory even if work has been added to another stream, if the work has
- *    been ordered behind the callback with an event.
- *
- *  - Completion of a callback does not cause a stream to become
- *    active except as described above. The callback stream will remain idle
- *    if no device work follows the callback, and will remain idle across
- *    consecutive callbacks without device work in between. Thus, for example,
- *    stream synchronization can be done by signaling from a callback at the
- *    end of the stream.
- *
- * - * \param hStream - Stream to add callback to - * \param callback - The function to call once preceding stream operations are complete - * \param userData - User specified data to be passed to the callback function - * \param flags - Reserved for future use, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuStreamLaunchHostFunc, - * ::cudaStreamAddCallback - */ -CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - -/** - * \brief Begins graph capture on a stream - * - * Begin graph capture on \p hStream. When a stream is in capture mode, all operations - * pushed into the stream will not be executed, but will instead be captured into - * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated - * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which - * it was initiated, and it may only be initiated if the stream is not already in capture - * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id - * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. - * - * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be - * called on this stream from the same thread. - * - * \param hStream - Stream in which to initiate capture - * \param mode - Controls the interaction of this capture sequence with other API - * calls that are potentially unsafe. For more details see - * ::cuThreadExchangeStreamCaptureMode. - * - * \note Kernels captured using this API must not use texture and surface references. - * Reading or writing through any texture or surface reference is undefined - * behavior. This restriction does not apply to texture and surface objects. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamIsCapturing, - * ::cuStreamEndCapture, - * ::cuThreadExchangeStreamCaptureMode - */ -CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); - -/** - * \brief Swaps the stream capture interaction mode for a thread - * - * Sets the calling thread's stream capture interaction mode to the value contained - * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To - * facilitate deterministic behavior across function or module boundaries, callers - * are encouraged to use this API in a push-pop fashion: \code - CUstreamCaptureMode mode = desiredMode; - cuThreadExchangeStreamCaptureMode(&mode); - ... - cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode - * \endcode - * - * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call - * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is - * not enqueued asynchronously to a stream, and is not observed by stream capture. - * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture - * depended on the allocation being replayed whenever the graph is launched, the - * captured graph would be invalid. 
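[Editor's note] For orientation, a minimal sketch of the ::cuStreamAddCallback usage described above; the names onStreamDone and enqueueNotification are illustrative and not part of the header, error handling is omitted, and the header itself recommends ::cuLaunchHostFunc for new code.

#include <cuda.h>
#include <stdio.h>

/* Runs on a driver thread once all previously enqueued work in hStream has
 * completed. Callbacks must not call back into the CUDA API. */
static void CUDA_CB onStreamDone(CUstream hStream, CUresult status, void *userData) {
    printf("stream finished, status=%d, tag=%s\n", (int)status, (const char *)userData);
}

static void enqueueNotification(CUstream hStream) {
    /* flags is reserved and must be 0; userData is passed through unchanged. */
    cuStreamAddCallback(hStream, onStreamDone, (void *)"batch-42", 0);
}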
- * - * Therefore, stream capture places restrictions on API calls that can be made within - * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This - * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. - * - * A thread's mode is one of the following: - * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has - * an ongoing capture sequence that was not initiated with - * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread - * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, - * this thread is prohibited from potentially unsafe API calls. - * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture - * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited - * from potentially unsafe API calls. Concurrent capture sequences in other threads - * are ignored. - * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially - * unsafe API calls. Note that the thread is still prohibited from API calls which - * necessarily conflict with stream capture, for example, attempting ::cuEventQuery - * on an event that was last recorded inside a capture sequence. - * - * \param mode - Pointer to mode value to swap with the current mode - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamBeginCapture - */ -CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); - -/** - * \brief Ends capture on a stream, returning the captured graph - * - * End capture on \p hStream, returning the captured graph via \p phGraph. - * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. - * If capture was invalidated, due to a violation of the rules of stream capture, then - * a NULL graph will be returned. - * - * If the \p mode argument to ::cuStreamBeginCapture was not - * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as - * ::cuStreamBeginCapture. - * - * \param hStream - Stream to query - * \param phGraph - The captured graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - -/** - * \brief Returns a stream's capture status - * - * Return the capture status of \p hStream via \p captureStatus. After a successful - * call, \p *captureStatus will contain one of the following: - * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. - * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. - * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error - * has invalidated the capture sequence. The capture sequence must be terminated - * with ::cuStreamEndCapture on the stream where it was initiated in order to - * continue using \p hStream. - * - * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while - * a blocking stream in the same context is capturing, it will return - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified - * after the call. The blocking stream capture is not invalidated. 
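[Editor's note] A rough usage sketch of the begin/end capture pair documented above, assuming a non-default stream and an already-loaded CUfunction; error checking omitted and captureOneLaunch is an illustrative name.

#include <cuda.h>

/* Record one kernel launch into a graph instead of executing it. */
static CUgraph captureOneLaunch(CUstream hStream, CUfunction f) {
    CUgraph graph = NULL;
    cuStreamBeginCapture(hStream, CU_STREAM_CAPTURE_MODE_GLOBAL);
    /* While capture is active, launches are recorded into the graph, not run. */
    cuLaunchKernel(f, /*grid*/ 1, 1, 1, /*block*/ 32, 1, 1,
                   /*sharedMemBytes*/ 0, hStream, /*kernelParams*/ NULL, /*extra*/ NULL);
    /* Must be ended on the stream that began the capture (and, for non-relaxed
     * modes, from the same thread); graph is NULL if capture was invalidated. */
    cuStreamEndCapture(hStream, &graph);
    return graph;
}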
- * - * When a blocking stream is capturing, the legacy stream is in an - * unusable state until the blocking stream capture is terminated. The legacy - * stream is not supported for stream capture, but attempted use would have an - * implicit dependency on the capturing stream(s). - * - * \param hStream - Stream to query - * \param captureStatus - Returns the stream's capture status - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamEndCapture - */ -CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - -/** - * \brief Query capture status of a stream - * - * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Query the capture status of a stream and and get an id for - * the capture sequence, which is unique over the lifetime of the process. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. - * - * A valid id is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo_v2, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - -/** - * \brief Query a stream's capture state (11.3+) - * - * Query stream state related to stream capture. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. - * - * Valid data (other than capture status) is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the - * previous version in 12.0. Developers requiring compatibility across minor versions to - * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback - * path. - * - * \param hStream - The stream to query - * \param captureStatus_out - Location to return the capture status of the stream; required - * \param id_out - Optional location to return an id for the capture sequence, which is - * unique over the lifetime of the process - * \param graph_out - Optional location to return the graph being captured into. All - * operations other than destroy and node removal are permitted on the graph - * while the capture sequence is in progress. This API does not transfer - * ownership of the graph, which is transferred or destroyed at - * ::cuStreamEndCapture. Note that the graph handle may be invalidated before - * end of capture for certain errors. Nodes that are or become - * unreachable from the original stream at ::cuStreamEndCapture due to direct - * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. - * \param dependencies_out - Optional location to store a pointer to an array of nodes. 
- * The next node to be captured in the stream will depend on this set of nodes, - * absent operations such as event wait which modify this set. The array pointer - * is valid until the next API call which operates on the stream or until end of - * capture. The node handles may be copied out and are valid until they or the - * graph is destroyed. The driver-owned array may also be passed directly to - * APIs that operate on the graph (not the stream) without copying. - * \param numDependencies_out - Optional location to store the size of the array - * returned in dependencies_out. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing, - * ::cuStreamUpdateCaptureDependencies - */ -CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, - cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - -/** - * \brief Update the set of dependencies in a capturing stream (11.3+) - * - * Modifies the dependency set of a capturing stream. The dependency set is the set - * of nodes that the next captured node in the stream will depend on. - * - * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and - * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to - * the API is added to the existing set or replaces it. A flags value of 0 defaults - * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. - * - * Nodes that are removed from the dependency set via this API do not result in - * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at - * ::cuStreamEndCapture. - * - * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. - * - * This API is new in CUDA 11.3. Developers requiring compatibility across minor - * versions to CUDA 11.0 should not use this API or provide a fallback. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_ILLEGAL_STATE - * - * \sa - * ::cuStreamBeginCapture, - * ::cuStreamGetCaptureInfo, - * ::cuStreamGetCaptureInfo_v2 - */ -CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); - -/** - * \brief Attach memory to a stream asynchronously - * - * Enqueues an operation in \p hStream to specify stream association of - * \p length bytes of memory starting from \p dptr. This function is a - * stream-ordered operation, meaning that it is dependent on, and will - * only take effect when, previous work in stream has completed. Any - * previous association is automatically replaced. - * - * \p dptr must point to one of the following types of memories: - * - managed memory declared using the __managed__ keyword or allocated with - * ::cuMemAllocManaged. - * - a valid host-accessible region of system-allocated pageable memory. This - * type of memory may only be specified if the device associated with the - * stream reports a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. - * - * For managed allocations, \p length must be either zero or the entire - * allocation's size. Both indicate that the entire allocation's stream - * association is being changed. Currently, it is not possible to change stream - * association for a portion of a managed allocation. 
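[Editor's note] A small sketch of the 11.3+ capture query described above; isActivelyCapturing is an illustrative helper, and the outputs other than the status are only meaningful when the call succeeds and the status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE.

#include <cuda.h>
#include <stddef.h>

/* Returns 1 if hStream is actively capturing, 0 otherwise. */
static int isActivelyCapturing(CUstream hStream) {
    CUstreamCaptureStatus status;
    cuuint64_t id;
    CUgraph graph;
    const CUgraphNode *deps;
    size_t numDeps;
    if (cuStreamGetCaptureInfo_v2(hStream, &status, &id, &graph,
                                  &deps, &numDeps) != CUDA_SUCCESS)
        return 0;
    return status == CU_STREAM_CAPTURE_STATUS_ACTIVE;
}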
- * - * For pageable host allocations, \p length must be non-zero. - * - * The stream association is specified using \p flags which must be - * one of ::CUmemAttach_flags. - * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed - * by any stream on any device. - * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee - * that it won't access the memory on the device from any stream on a device that - * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with - * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, - * the program makes a guarantee that it will only access the memory on the device - * from \p hStream. It is illegal to attach singly to the NULL stream, because the - * NULL stream is a virtual global stream and not a specific stream. An error will - * be returned in this case. - * - * When memory is associated with a single stream, the Unified Memory system will - * allow CPU access to this memory region so long as all operations in \p hStream - * have completed, regardless of whether other streams are active. In effect, - * this constrains exclusive ownership of the managed memory region by - * an active GPU to per-stream activity instead of whole-GPU activity. - * - * Accessing memory on the device from streams that are not associated with - * it will produce undefined results. No error checking is performed by the - * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. - * - * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync - * via events, synchronization or other means to ensure legal access to memory - * at all times. Data visibility and coherency will be changed appropriately - * for all kernels which follow a stream-association change. - * - * If \p hStream is destroyed while data is associated with it, the association is - * removed and the association reverts to the default visibility of the allocation - * as specified at ::cuMemAllocManaged. For __managed__ variables, the default - * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an - * asynchronous operation, and as a result, the change to default association won't - * happen until all work in the stream has completed. - * - * \param hStream - Stream in which to enqueue the attach operation - * \param dptr - Pointer to memory (must be a pointer to managed memory or - * to a valid host-accessible region of system-allocated - * pageable memory) - * \param length - Length of memory - * \param flags - Must be one of ::CUmemAttach_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cudaStreamAttachMemAsync - */ -CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - -/** - * \brief Determine status of a compute stream - * - * Returns ::CUDA_SUCCESS if all operations in the stream specified by - * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. 
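[Editor's note] A minimal sketch of the managed-memory attach described above; allocForStream is an illustrative name and errors are ignored for brevity.

#include <cuda.h>
#include <stddef.h>

/* Allocate managed memory and restrict its stream association to hStream so
 * the CPU may touch it whenever that one stream is idle. */
static CUdeviceptr allocForStream(CUstream hStream, size_t bytes) {
    CUdeviceptr dptr = 0;
    cuMemAllocManaged(&dptr, bytes, CU_MEM_ATTACH_GLOBAL);
    /* length 0 means the entire allocation; this replaces any prior association. */
    cuStreamAttachMemAsync(hStream, dptr, 0, CU_MEM_ATTACH_SINGLE);
    return dptr;
}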
- * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuStreamSynchronize(). - * - * \param hStream - Stream to query status of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamQuery - */ -CUresult CUDAAPI cuStreamQuery(CUstream hStream); - -/** - * \brief Wait until a stream's tasks are completed - * - * Waits until the device has completed all operations in the stream specified - * by \p hStream. If the context was created with the - * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the - * stream is finished with all of its tasks. - * - * \param hStream - Stream to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamDestroy, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamAddCallback, - * ::cudaStreamSynchronize - */ -CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - -/** - * \brief Destroys a stream - * - * Destroys the stream specified by \p hStream. - * - * In case the device is still doing work in the stream \p hStream - * when ::cuStreamDestroy() is called, the function will return immediately - * and the resources associated with \p hStream will be released automatically - * once the device has completed all work in \p hStream. - * - * \param hStream - Stream to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamDestroy - */ -CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - -/** - * \brief Copies attributes from source stream to destination stream. - * - * Copies attributes from source stream \p src to destination stream \p dst. - * Both streams must have the same context. - * - * \param[out] dst Destination stream - * \param[in] src Source stream - * For list of attributes see ::CUstreamAttrID - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); - -/** - * \brief Queries stream attribute. - * - * Queries attribute \p attr from \p hStream and stores it in corresponding - * member of \p value_out. - * - * \param[in] hStream - * \param[in] attr - * \param[out] value_out - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, - CUstreamAttrValue *value_out); - -/** - * \brief Sets stream attribute. - * - * Sets attribute \p attr on \p hStream from corresponding attribute of - * \p value. The updated attribute will be applied to subsequent work - * submitted to the stream. It will not affect previously submitted work. 
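[Editor's note] A tiny illustrative helper combining the ::cuStreamQuery / ::cuStreamSynchronize pair above: poll first, block only if the stream is still busy. drainStream is not part of the header.

#include <cuda.h>

static void drainStream(CUstream hStream) {
    /* cuStreamQuery never blocks; CUDA_ERROR_NOT_READY just means "still busy". */
    if (cuStreamQuery(hStream) == CUDA_ERROR_NOT_READY) {
        cuStreamSynchronize(hStream);  /* block the calling thread until done */
    }
}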
- * - * \param[out] hStream - * \param[in] attr - * \param[in] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, - const CUstreamAttrValue *value); - -/** @} */ /* END CUDA_STREAM */ - - -/** - * \defgroup CUDA_EVENT Event Management - * - * ___MANBRIEF___ event management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the event management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates an event - * - * Creates an event *phEvent for the current context with the flags specified via - * \p Flags. Valid flags include: - * - ::CU_EVENT_DEFAULT: Default event creation flag. - * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking - * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on - * an event created with this flag will block until the event has actually - * been recorded. - * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need - * to record timing data. Events created with this flag specified and - * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best - * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). - * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an - * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must - * be specified along with ::CU_EVENT_DISABLE_TIMING. - * - * \param phEvent - Returns newly created event - * \param Flags - Event creation flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventCreate, - * ::cudaEventCreateWithFlags - */ -CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecord() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. 
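[Editor's note] A sketch of the event-based cross-stream ordering that ::cuEventCreate, ::cuEventRecord and ::cuStreamWaitEvent enable; joinStreams is an illustrative name and error handling is omitted.

#include <cuda.h>

/* Make streamB wait for everything currently enqueued in streamA, without
 * blocking the host. Timing is disabled since the event is only for ordering. */
static void joinStreams(CUstream streamA, CUstream streamB) {
    CUevent ev;
    cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
    cuEventRecord(ev, streamA);          /* snapshot streamA's current work */
    cuStreamWaitEvent(streamB, ev, 0);   /* 0 == CU_EVENT_WAIT_DEFAULT */
    cuEventDestroy(ev);                  /* released once the event completes */
}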
- * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventRecord, - * ::cuEventRecordWithFlags - */ -CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecordWithFlags() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. - * - * flags include: - * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. - * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture. - * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * \param flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cuEventRecord, - * ::cudaEventRecord - */ -CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - -/** - * \brief Queries an event's status - * - * Queries the status of all work currently captured by \p hEvent. See - * ::cuEventRecord() for details on what is captured by an event. - * - * Returns ::CUDA_SUCCESS if all captured work has been completed, or - * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. - * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuEventSynchronize(). - * - * \param hEvent - Event to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventQuery - */ -CUresult CUDAAPI cuEventQuery(CUevent hEvent); - -/** - * \brief Waits for an event to complete - * - * Waits until the completion of all work currently captured in \p hEvent. 
- * See ::cuEventRecord() for details on what is captured by an event. - * - * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC - * flag will cause the calling CPU thread to block until the event has - * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has - * not been set, then the CPU thread will busy-wait until the event has - * been completed by the device. - * - * \param hEvent - Event to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventSynchronize - */ -CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); - -/** - * \brief Destroys an event - * - * Destroys the event specified by \p hEvent. - * - * An event may be destroyed before it is complete (i.e., while - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the - * call does not block on completion of the event, and any associated - * resources will automatically be released asynchronously at completion. - * - * \param hEvent - Event to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventElapsedTime, - * ::cudaEventDestroy - */ -CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - -/** - * \brief Computes the elapsed time between two events - * - * Computes the elapsed time between two events (in milliseconds with a - * resolution of around 0.5 microseconds). - * - * If either event was last recorded in a non-NULL stream, the resulting time - * may be greater than expected (even if both used the same stream handle). This - * happens because the ::cuEventRecord() operation takes place asynchronously - * and there is no guarantee that the measured latency is actually just between - * the two events. Any number of other different stream operations could execute - * in between the two measured events, thus altering the timing in a significant - * way. - * - * If ::cuEventRecord() has not been called on either event then - * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called - * on both events but one or both of them has not yet been completed (that is, - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the - * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with - * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return - * ::CUDA_ERROR_INVALID_HANDLE. 
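[Editor's note] A sketch of GPU-side timing with ::cuEventElapsedTime as described above; timeRegion and enqueueWork are illustrative, both events must keep timing enabled, and errors are ignored for brevity.

#include <cuda.h>
#include <stdio.h>

/* Measure how long the work enqueued between the two records takes on the GPU. */
static void timeRegion(CUstream hStream, void (*enqueueWork)(CUstream)) {
    CUevent start, stop;
    float ms = 0.0f;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);
    cuEventRecord(start, hStream);
    enqueueWork(hStream);
    cuEventRecord(stop, hStream);
    cuEventSynchronize(stop);            /* both events must have completed */
    cuEventElapsedTime(&ms, start, stop);
    printf("region took %.3f ms\n", ms);
    cuEventDestroy(start);
    cuEventDestroy(stop);
}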
- * - * \param pMilliseconds - Time between \p hStart and \p hEnd in ms - * \param hStart - Starting event - * \param hEnd - Ending event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cudaEventElapsedTime - */ -CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); - -/** @} */ /* END CUDA_EVENT */ - -/** - * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability - * - * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the external resource interoperability functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - - /** - * \brief Imports an external memory object - * - * Imports an externally allocated memory object and returns - * a handle to that in \p extMem_out. - * - * The properties of the handle being imported must be described in - * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure - * is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - CUexternalMemoryHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void *nvSciBufObject; - } handle; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type - * of handle being imported. ::CUexternalMemoryHandleType is - * defined as: - * - * \code - typedef enum CUexternalMemoryHandleType_enum { - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 - } CUexternalMemoryHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a memory object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a memory object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a memory object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * memory object are destroyed. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Heap object. This handle holds a reference to the underlying - * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Heap object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Resource object. This handle holds a reference to the - * underlying object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared NT handle that is returned by - * IDXGIResource1::CreateSharedHandle when referring to a - * ID3D11Resource object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D11Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared KMT handle that is returned by - * IDXGIResource::GetSharedHandle when referring to a - * ID3D11Resource object and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL - * and reference a valid NvSciBuf object. 
- * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the - * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync - * as appropriate barriers to maintain coherence between CUDA and the other drivers. - * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * for memory synchronization. - * - * - * The size of the memory object must be specified in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. - * - * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the - * resource is a dedicated resource. The definition of what a - * dedicated resource is outside the scope of this extension. - * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type - * is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * - * \param extMem_out - Returned handle to an external memory object - * \param memHandleDesc - Memory import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the - * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges - * as well as appropriate Vulkan pipeline barriers to maintain coherence between - * CPU and GPU. For more information on these APIs, please refer to "Synchronization - * and Cache Control" chapter from Vulkan specification. - * - * \sa ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); - -/** - * \brief Maps a buffer onto an imported memory object - * - * Maps a buffer onto an imported memory object and returns a device - * pointer in \p devPtr. - * - * The properties of the buffer being mapped must be described in - * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - unsigned long long offset; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in - * the memory object where the buffer's base address is. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. - * - * The offset and size have to be suitably aligned to match the - * requirements of the external API. Mapping two buffers whose ranges - * overlap may or may not result in the same virtual address being - * returned for the overlapped portion. In such cases, the application - * must ensure that all accesses to that region from the GPU are - * volatile. Otherwise writes made via one address are not guaranteed - * to be visible via the other address, even if they're issued by the - * same thread. It is recommended that applications map the combined - * range instead of mapping separate buffers and then apply the - * appropriate offsets to the returned pointer to derive the - * individual buffers. - * - * The returned pointer \p devPtr must be freed using ::cuMemFree. 
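[Editor's note] A sketch of the import-then-map flow for the OPAQUE_FD case described above; importAndMap is an illustrative name, the fd and size are assumed to come from the exporting API, and error handling is omitted.

#include <cuda.h>
#include <string.h>

/* Import a memory object exported as a POSIX fd and map its whole range as a
 * device pointer. Ownership of fd passes to the driver on successful import. */
static CUdeviceptr importAndMap(int fd, unsigned long long size, CUexternalMemory *extMemOut) {
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
    CUdeviceptr dptr = 0;

    memset(&memDesc, 0, sizeof(memDesc));
    memDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    memDesc.handle.fd = fd;
    memDesc.size = size;
    cuImportExternalMemory(extMemOut, &memDesc);

    memset(&bufDesc, 0, sizeof(bufDesc));
    bufDesc.offset = 0;        /* map from the start ... */
    bufDesc.size = size;       /* ... through the whole object; flags stay 0 */
    cuExternalMemoryGetMappedBuffer(&dptr, *extMemOut, &bufDesc);
    return dptr;               /* free with cuMemFree, then cuDestroyExternalMemory */
}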
- * - * \param devPtr - Returned device pointer to buffer - * \param extMem - Handle to external memory object - * \param bufferDesc - Buffer descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); - -/** - * \brief Maps a CUDA mipmapped array onto an external memory object - * - * Maps a CUDA mipmapped array onto an external object and returns a - * handle to it in \p mipmap. - * - * The properties of the CUDA mipmapped array being mapped must be - * described in \p mipmapDesc. The structure - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - unsigned long long offset; - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - unsigned int numLevels; - } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the - * offset in the memory object where the base level of the mipmap - * chain is. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes - * the format, dimensions and type of the base level of the mipmap - * chain. For further details on these parameters, please refer to the - * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped - * array is bound as a color target in the graphics API, then the flag - * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies - * the total number of levels in the mipmap chain. - * - * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. - * - * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. - * - * \param mipmap - Returned CUDA mipmapped array - * \param extMem - Handle to external memory object - * \param mipmapDesc - CUDA array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer - */ -CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); - -/** - * \brief Destroys an external memory object. - * - * Destroys the specified external memory object. Any existing buffers - * and CUDA mipmapped arrays mapped onto this object must no longer be - * used and must be explicitly freed using ::cuMemFree and - * ::cuMipmappedArrayDestroy respectively. 
- * - * \param extMem - External memory object to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); - -/** - * \brief Imports an external semaphore - * - * Imports an externally allocated synchronization object and returns - * a handle to that in \p extSem_out. - * - * The properties of the handle being imported must be described in - * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - CUexternalSemaphoreHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void* NvSciSyncObj; - } handle; - unsigned int flags; - } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of - * handle being imported. ::CUexternalSemaphoreHandleType is defined - * as: - * - * \code - typedef enum CUexternalSemaphoreHandleType_enum { - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 - } CUexternalSemaphoreHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. 
This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * synchronization object are destroyed. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Fence object. This handle holds a reference to the underlying - * object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D12Fence object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that is returned by - * ID3D11Fence::CreateSharedHandle. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D11Fence object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj - * represents a valid NvSciSyncObj. - * - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that - * is returned by IDXGIResource1::CreateSharedHandle when referring to - * a IDXGIKeyedMutex object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid IDXGIKeyedMutex object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared KMT handle that - * is returned by IDXGIResource::GetSharedHandle when referring to - * a IDXGIKeyedMutex object and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. 
Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * \param extSem_out - Returned handle to an external semaphore - * \param semHandleDesc - Semaphore import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); - -/** - * \brief Signals a set of external semaphore objects - * - * Enqueues a signal operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of signaling a semaphore depends on the type of - * the object. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then signaling the semaphore will set it to the signaled state. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then the semaphore will be set to the value specified in - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence - * to a value that can be used by subsequent waiters of the same NvSciSync object - * to order operations with those currently submitted in \p stream. Such an update - * will overwrite previous contents of - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, - * signaling such an external semaphore object causes appropriate memory synchronization - * operations to be performed over all external memory objects that are imported as - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses - * made by other importers of the same set of NvSciBuf memory object(s) are coherent. - * These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. - * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return - * CUDA_ERROR_NOT_SUPPORTED. 
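[Editor's note] A sketch of signaling and waiting on an imported semaphore across two streams, assuming an opaque (binary) semaphore type so the zeroed parameter structs suffice; handOff is an illustrative name and errors are ignored.

#include <cuda.h>
#include <string.h>

/* Signal an imported binary semaphore from streamA and make streamB wait for
 * it. For fence/timeline types, params::fence::value would be set instead of
 * being left zero. */
static void handOff(CUexternalSemaphore sem, CUstream streamA, CUstream streamB) {
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wait;
    memset(&sig, 0, sizeof(sig));
    memset(&wait, 0, sizeof(wait));
    cuSignalExternalSemaphoresAsync(&sem, &sig, 1, streamA);
    cuWaitExternalSemaphoresAsync(&sem, &wait, 1, streamB);
}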
- * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be released with the key specified in - * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. - * - * \param extSemArray - Set of external semaphores to be signaled - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to signal - * \param stream - Stream to enqueue the signal operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Waits on a set of external semaphore objects - * - * Enqueues a wait operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of waiting on a semaphore depends on the type - * of the object. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then waiting on the semaphore will wait until the semaphore reaches - * the signaled state. The semaphore will then be reset to the - * unsignaled state. Therefore for every signal operation, there can - * only be one wait operation. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then waiting on the semaphore will wait until the value of the - * semaphore is greater than or equal to - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * then, waiting on the semaphore will wait until the - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the - * signaler of the NvSciSyncObj that was associated with this semaphore object. - * By default, waiting on such an external semaphore object causes appropriate - * memory synchronization operations to be performed over all external memory objects - * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that - * any subsequent accesses made by other importers of the same set of NvSciBuf memory - * object(s) are coherent. These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. 
- * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return - * CUDA_ERROR_NOT_SUPPORTED. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be acquired when it is released with the key - * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key - * or until the timeout specified by - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs - * has lapsed. The timeout interval can either be a finite value - * specified in milliseconds or an infinite value. In case an infinite - * value is specified the timeout never elapses. The windows INFINITE - * macro must be used to specify infinite timeout. - * - * \param extSemArray - External semaphores to be waited on - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to wait on - * \param stream - Stream to enqueue the wait operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_TIMEOUT - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync - */ -CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Destroys an external semaphore - * - * Destroys an external semaphore object and releases any references - * to the underlying resource. Any outstanding signals or waits must - * have completed before the semaphore is destroyed. - * - * \param extSem - External semaphore to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); - -/** @} */ /* END CUDA_EXTRES_INTEROP */ - -/** - * \defgroup CUDA_MEMOP Stream memory operations - * - * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream memory operations of the low-level CUDA - * driver application programming interface. - * - * The whole set of operations is disabled by default. Users are required - * to explicitly enable them, e.g. on Linux by passing the kernel module - * parameter shown below: - * modprobe nvidia NVreg_EnableStreamMemOPs=1 - * There is currently no way to enable these operations on other operating - * systems. - * - * Users can programmatically query whether the device supports these - * operations with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. 
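As a concrete illustration of the query mechanism just described, a minimal sketch that reads the two attributes mentioned so far (device ordinal 0 and a prior ::cuInit call are assumptions):

\code
#include <cuda.h>
#include <stdio.h>

/* Sketch: report whether stream memory operations, and the NOR wait
 * condition, are usable on device 0. */
static void query_memop_support(void)
{
    CUdevice dev;
    int memOps = 0, norWait = 0;
    cuDeviceGet(&dev, 0);
    cuDeviceGetAttribute(&memOps, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
    cuDeviceGetAttribute(&norWait, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR, dev);
    printf("stream mem ops: %d, CU_STREAM_WAIT_VALUE_NOR: %d\n", memOps, norWait);
}
\endcode

The same pattern applies to the 64-bit and flush-related attributes listed next.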
- * - * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() - * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and - * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and - * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform - * hardware features and can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. - * - * Note that all memory pointers passed as parameters to these operations - * are device pointers. Where necessary a device pointer should be - * obtained, for example with ::cuMemHostGetDevicePointer(). - * - * None of the operations accepts pointers to managed memory buffers - * (::cuMemAllocManaged). - * - * @{ - */ - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue64, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue32, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Batch operations to synchronize the stream via memory operations - * - * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). - * Batching operations may avoid some performance overhead in both the API call - * and the device execution versus adding them to the stream in separate API - * calls. The operations are enqueued in the order they appear in the array. - * - * See ::CUstreamBatchMemOpType for the full set of supported operations, and - * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), - * and ::cuStreamWriteValue64() for details of specific operations. - * - * Basic support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. 
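For concreteness, a minimal sketch of a two-operation batch (wait on one 32-bit word, then write another). The device pointers are assumed to have been obtained beforehand, for example via ::cuMemHostGetDevicePointer(), and the field names assume the ::CUstreamBatchMemOpParams union exposes waitValue/writeValue members as in recent cuda.h versions:

\code
#include <cuda.h>
#include <string.h>

/* Sketch: in one batch, wait until *flagAddr >= 1, then write 1 to *doneAddr. */
static CUresult enqueue_wait_then_write(CUstream stream,
                                        CUdeviceptr flagAddr,
                                        CUdeviceptr doneAddr)
{
    CUstreamBatchMemOpParams ops[2];
    memset(ops, 0, sizeof(ops));

    ops[0].operation         = CU_STREAM_MEM_OP_WAIT_VALUE_32;
    ops[0].waitValue.address = flagAddr;
    ops[0].waitValue.value   = 1;
    ops[0].waitValue.flags   = CU_STREAM_WAIT_VALUE_GEQ;  /* default comparison */

    ops[1].operation          = CU_STREAM_MEM_OP_WRITE_VALUE_32;
    ops[1].writeValue.address = doneAddr;
    ops[1].writeValue.value   = 1;

    return cuStreamBatchMemOp(stream, 2, ops, 0 /* flags: reserved, must be 0 */);
}
\endcode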
See related APIs for details - * on querying support for specific operations. - * - * \param stream The stream to enqueue the operations in. - * \param count The number of operations in the array. Must be less than 256. - * \param paramArray The types and parameters of the individual operations. - * \param flags Reserved for future expansion; must be 0. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuMemHostRegister - */ -CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - -/** @} */ /* END CUDA_MEMOP */ - -/** - * \defgroup CUDA_EXEC Execution Control - * - * ___MANBRIEF___ execution control functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the execution control functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns information about a function - * - * Returns in \p *pi the integer value of the attribute \p attrib on the kernel - * given by \p hfunc. The supported attributes are: - * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads - * per block, beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the function is - * currently loaded. - * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory per block required by this function. - * This does not include dynamically-allocated shared memory requested by - * the user at runtime. - * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated - * constant memory required by this function. - * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory - * used by each thread of this function. - * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread - * of this function. - * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for - * which the function was compiled. This value is the major PTX version * 10 - * + the minor PTX version, so a PTX version 1.3 function would return the - * value 13. Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for - * which the function was compiled. This value is the major binary - * version * 10 + the minor binary version, so a binary version 1.3 function - * would return the value 13. Note that this will return a value of 10 for - * legacy cubins that do not have a properly-encoded binary architecture - * version. - * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has - * been compiled with user specified option "-Xptxas --dlcm=ca" set . - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of - * dynamically-allocated shared memory. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 - * cache split ratio in percent of total shared memory. 
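A minimal sketch of querying a few of the attributes listed above for an already-loaded kernel handle (the attribute selection is arbitrary and error checking is omitted):

\code
#include <cuda.h>
#include <stdio.h>

/* Sketch: print a few limits/costs of a loaded kernel. */
static void print_func_limits(CUfunction hfunc)
{
    int maxThreads = 0, numRegs = 0, staticSmem = 0;
    cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
    cuFuncGetAttribute(&numRegs,    CU_FUNC_ATTRIBUTE_NUM_REGS,              hfunc);
    cuFuncGetAttribute(&staticSmem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,     hfunc);
    printf("max threads/block=%d, regs/thread=%d, static smem=%d bytes\n",
           maxThreads, numRegs, staticSmem);
}
\endcode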
- * - * \param pi - Returned attribute value - * \param attrib - Attribute requested - * \param hfunc - Function to query attribute of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); - -/** - * \brief Sets information about a function - * - * This call sets the value of a specified attribute \p attrib on the kernel given - * by \p hfunc to an integer value specified by \p val - * This function returns CUDA_SUCCESS if the new value of the attribute could be - * successfully set. If the set fails, this call will return an error. - * Not all attributes can have values set. Attempting to set a value on a read-only - * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) - * - * Supported attributes for the cuFuncSetAttribute call are: - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of - * dynamically-allocated shared memory. The value should contain the requested - * maximum size of dynamically-allocated shared memory. The sum of this value and - * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the - * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. - * The maximal size of requestable dynamic shared memory may differ by GPU - * architecture. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 - * cache and shared memory use the same hardware resources, this sets the shared memory - * carveout preference, in percent of the total shared memory. - * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR - * This is only a hint, and the driver can choose a different ratio if required to execute the function. - * - * \param hfunc - Function to query attribute of - * \param attrib - Attribute requested - * \param value - The value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); - -/** - * \brief Sets the preferred cache configuration for a device function - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the device function \p hfunc. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute \p hfunc. Any context-wide preference - * set via ::cuCtxSetCacheConfig() will be overridden by this per-function - * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In - * that case, the current context-wide setting will be used. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. 
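Returning to the attribute setter above, a minimal sketch of the usual opt-in pattern for large dynamic shared memory (the 64 KiB request is an arbitrary example; error checking is omitted):

\code
#include <cuda.h>

/* Sketch: request 64 KiB of dynamic shared memory for `hfunc`, clamped to the
 * device opt-in limit CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. */
static CUresult enable_large_dynamic_smem(CUfunction hfunc, CUdevice dev)
{
    int optin = 0;
    int request = 64 * 1024;
    cuDeviceGetAttribute(&optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
    if (request > optin)
        request = optin;
    return cuFuncSetAttribute(hfunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, request);
}
\endcode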
- * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param hfunc - Kernel to configure cache for - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetCacheConfig - */ -CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); - -/** - * \brief Sets the shared memory configuration for a device function. - * - * On devices with configurable shared memory banks, this function will - * force all subsequent launches of the specified device function to have - * the given shared memory bank size configuration. On any given launch of the - * function, the shared memory configuration of the device will be temporarily - * changed if needed to suit the function's preferred configuration. Changes in - * shared memory configuration between subsequent launches of functions, - * may introduce a device side synchronization point. - * - * Any per-function setting of shared memory bank size set via - * ::cuFuncSetSharedMemConfig will override the context wide setting set with - * ::cuCtxSetSharedMemConfig. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. - * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory - * configuration when launching this function. - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes when launching this function. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes when launching this function. - * - * \param hfunc - kernel to be given a shared memory config - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxSetSharedMemConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetSharedMemConfig - */ -CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); - -/** - * \brief Returns a module handle - * - * Returns in \p *hmod the handle of the module that function \p hfunc - * is located in. 
The lifetime of the module corresponds to the lifetime of - * the context it was loaded in or until the module is explicitly unloaded. - * - * The CUDA runtime manages its own modules loaded into the primary context. - * If the handle returned by this API refers to a module loaded by the CUDA runtime, - * calling ::cuModuleUnload() on that module will result in undefined behavior. - * - * \param hmod - Returned module handle - * \param hfunc - Function to retrieve module for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - */ -CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); - -/** - * \brief Launches a CUDA function - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p f can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. - * - * 2) Kernel parameters can also be packaged by the application into - * a single buffer that is passed in via the \p extra parameter. - * This places the burden on the application of knowing each kernel - * parameter's size and alignment/padding within the buffer. Here is - * an example of using the \p extra parameter in this manner: - * \code - size_t argBufferSize; - char argBuffer[256]; - - // populate argBuffer and argBufferSize - - void *config[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, - CU_LAUNCH_PARAM_END - }; - status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); - * \endcode - * - * The \p extra parameter exists to allow ::cuLaunchKernel to take - * additional less commonly used arguments. \p extra specifies a list of - * names of extra settings and their corresponding values. Each extra - * setting name is immediately followed by the corresponding value. The - * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer containing all - * the kernel parameters for launching kernel \p f; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t containing the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel - * parameters are specified with both \p kernelParams and \p extra - * (i.e. both \p kernelParams and \p extra are non-NULL). 
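The buffer-packing \p extra path is illustrated above; for comparison, a minimal sketch of the more common \p kernelParams path (the kernel signature `k(float *out, int n)`, the block size, and the stream are assumptions):

\code
#include <cuda.h>

/* Sketch: launch k(float *out, int n) over n elements via kernelParams.
 * Each params[] entry points at the storage holding one argument. */
static CUresult launch_simple(CUfunction k, CUdeviceptr out, int n, CUstream stream)
{
    void *params[] = { &out, &n };
    unsigned int block = 256;
    unsigned int grid  = ((unsigned int)n + block - 1) / block;
    return cuLaunchKernel(k, grid, 1, 1,   /* grid dimensions  */
                          block, 1, 1,     /* block dimensions */
                          0,               /* dynamic shared memory bytes */
                          stream, params,
                          NULL);           /* no extra */
}
\endcode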
- * - * Calling ::cuLaunchKernel() invalidates the persistent function state - * set through the following deprecated APIs: - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), - * ::cuParamSetv(). - * - * Note that to use ::cuLaunchKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * \param extra - Extra options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cudaLaunchKernel - */ -CUresult CUDAAPI cuLaunchKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams, - void **extra); - - - - - - - - -/** - * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * The device on which this kernel is invoked must have a non-zero value for - * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. - * - * The total number of blocks launched cannot exceed the maximum number of blocks per - * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. - * - * The kernel cannot make use of CUDA dynamic parallelism. - * - * Kernel parameters must be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. 
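A minimal sketch of honoring the occupancy bound described above when sizing the grid (block size 256 and a single-pointer kernel signature are assumptions; error checking is omitted):

\code
#include <cuda.h>

/* Sketch: cap the cooperative grid at blocksPerSM * SM count, then launch. */
static CUresult launch_cooperative(CUfunction f, CUdevice dev,
                                   CUdeviceptr data, CUstream stream)
{
    int blocksPerSM = 0, smCount = 0;
    int block = 256;
    cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, f, block, 0 /* dyn smem */);
    cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    void *params[] = { &data };
    return cuLaunchCooperativeKernel(f,
                                     (unsigned int)(blocksPerSM * smCount), 1, 1,
                                     (unsigned int)block, 1, 1,
                                     0, stream, params);
}
\endcode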
- * - * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API - * - * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous - * block shape, shared size and parameter info associated with \p f - * is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernelMultiDevice, - * ::cudaLaunchCooperativeKernel - */ -CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams); - -/** - * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute - * - * \deprecated This function is deprecated as of CUDA 11.3. - * - * Invokes kernels as specified in the \p launchParamsList array where each element - * of the array specifies all the parameters required to perform a single kernel launch. - * These kernels can cooperate and synchronize as they execute. The size of the array is - * specified by \p numDevices. - * - * No two kernels can be launched on the same device. All the devices targeted by this - * multi-device launch must be identical. All devices must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. - * - * All kernels launched must be identical with respect to the compiled code. Note that - * any __device__, __constant__ or __managed__ variables present in the module that owns - * the kernel launched on each device, are independently instantiated on every device. - * It is the application's responsiblity to ensure these variables are initialized and - * used appropriately. - * - * The size of the grids as specified in blocks, the size of the blocks themselves - * and the amount of shared memory used by each thread block must also match across - * all launched kernels. 
- * - * The streams used to launch these kernels must have been created via either ::cuStreamCreate - * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD - * cannot be used. - * - * The total number of blocks launched per kernel cannot exceed the maximum number of blocks - * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the - * total number of blocks launched per device has to match across all devices, the maximum - * number of blocks that can be launched per device will be limited by the device with the - * least number of multiprocessors. - * - * The kernels cannot make use of CUDA dynamic parallelism. - * - * The ::CUDA_LAUNCH_PARAMS structure is defined as: - * \code - typedef struct CUDA_LAUNCH_PARAMS_st - { - CUfunction function; - unsigned int gridDimX; - unsigned int gridDimY; - unsigned int gridDimZ; - unsigned int blockDimX; - unsigned int blockDimY; - unsigned int blockDimZ; - unsigned int sharedMemBytes; - CUstream hStream; - void **kernelParams; - } CUDA_LAUNCH_PARAMS; - * \endcode - * where: - * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must - * be identical with respect to the compiled code. - * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. - * This must match across all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot - * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated - * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. - * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If - * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams - * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through - * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual - * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes - * do not need to be specified as that information is retrieved directly from the kernel's image. - * - * By default, the kernel won't begin execution on any GPU until all prior work in all the specified - * streams has completed. This behavior can be overridden by specifying the flag - * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. 
When this flag is specified, each kernel - * will only wait for prior work in the stream corresponding to that GPU to complete before it begins - * execution. - * - * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin - * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying - * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, - * any subsequent work pushed in any of the specified streams will only wait for the kernel launched - * on the GPU corresponding to that stream to complete before it begins execution. - * - * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API when called individually for each - * element in \p launchParamsList. - * - * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous - * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function - * in \p launchParamsList is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param launchParamsList - List of launch parameters, one per device - * \param numDevices - Size of the \p launchParamsList array - * \param flags - Flags to control launch behavior - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernel, - * ::cudaLaunchCooperativeKernelMultiDevice - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); - -/** - * \brief Enqueues a host function call in a stream - * - * Enqueues a host function to run in a stream. The function will be called - * after currently enqueued work and will block work added after it. - * - * The host function must not make any CUDA API calls. Attempting to use a - * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. - * The host function must not perform any synchronization that may depend on - * outstanding CUDA work not mandated to run earlier. Host functions without a - * mandated order (such as in independent streams) execute in undefined order - * and may be serialized. - * - * For the purposes of Unified Memory, execution makes a number of guarantees: - *
- * - The stream is considered idle for the duration of the function's
- *   execution. Thus, for example, the function may always use memory attached
- *   to the stream it was enqueued in.
- * - The start of execution of the function has the same effect as
- *   synchronizing an event recorded in the same stream immediately prior to
- *   the function. It thus synchronizes streams which have been "joined"
- *   prior to the function.
- * - Adding device work to any stream does not have the effect of making
- *   the stream active until all preceding host functions and stream callbacks
- *   have executed. Thus, for
- *   example, a function might use global attached memory even if work has
- *   been added to another stream, if the work has been ordered behind the
- *   function call with an event.
- * - Completion of the function does not cause a stream to become
- *   active except as described above. The stream will remain idle
- *   if no device work follows the function, and will remain idle across
- *   consecutive host functions or stream callbacks without device work in
- *   between. Thus, for example,
- *   stream synchronization can be done by signaling from a host function at the
- *   end of the stream.
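A minimal sketch of the signaling idiom from the last guarantee: a host function that flips a plain flag the application can poll, with no CUDA calls made inside the callback (the flag type and the polling strategy are assumptions):

\code
#include <cuda.h>

/* Sketch: mark completion of all prior work in `stream` from the host.
 * The callback must not call back into the CUDA API. */
static void CUDA_CB mark_done(void *userData)
{
    *(volatile int *)userData = 1;
}

static CUresult enqueue_done_marker(CUstream stream, volatile int *doneFlag)
{
    *doneFlag = 0;
    return cuLaunchHostFunc(stream, mark_done, (void *)doneFlag);
}
\endcode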
- * - * Note that, in contrast to ::cuStreamAddCallback, the function will not be - * called in the event of an error in the CUDA context. - * - * \param hStream - Stream to enqueue function call in - * \param fn - The function to call once preceding stream operations are complete - * \param userData - User-specified data to be passed to the function - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuStreamAddCallback - */ -CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - -/** @} */ /* END CUDA_EXEC */ - -/** - * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] - * - * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated execution control functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the block-dimensions for the function - * - * \deprecated - * - * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are - * created when the kernel given by \p hfunc is launched. - * - * \param hfunc - Kernel to specify dimensions of - * \param x - X dimension - * \param y - Y dimension - * \param z - Z dimension - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetSharedSize, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); - -/** - * \brief Sets the dynamic shared-memory size for the function - * - * \deprecated - * - * Sets through \p bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by \p hfunc is launched. - * - * \param hfunc - Kernel to specify dynamic shared-memory size for - * \param bytes - Dynamic shared-memory size per thread in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); - -/** - * \brief Sets the parameter size for the function - * - * \deprecated - * - * Sets through \p numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to \p hfunc. 
- * - * \param hfunc - Kernel to set parameter size for - * \param numbytes - Size of parameter list in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); - -/** - * \brief Adds an integer parameter to the function's argument list - * - * \deprecated - * - * Sets an integer parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. - * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); - -/** - * \brief Adds a floating-point parameter to the function's argument list - * - * \deprecated - * - * Sets a floating-point parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. - * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); - -/** - * \brief Adds arbitrary data to the function's argument list - * - * \deprecated - * - * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr - * into the parameter space of the kernel corresponding to \p hfunc. \p offset - * is a byte offset. 
- * - * \param hfunc - Kernel to add data to - * \param offset - Offset to add data to argument list - * \param ptr - Pointer to arbitrary data - * \param numbytes - Size of data to copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block - * contains the number of threads specified by a previous call to - * ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. 
- * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * - * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), - * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by - * growing the per-thread stack as needed per launch and not shrinking it afterwards. - * - * \note_null_stream - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); - - -/** - * \brief Adds a texture-reference to the function's argument list - * - * \deprecated - * - * Makes the CUDA array or linear memory bound to the texture reference - * \p hTexRef available to a device program as a texture. In this version of - * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and - * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. 
- * - * \param hfunc - Kernel to add texture-reference to - * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) - * \param hTexRef - Texture-reference to add to argument list - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); -/** @} */ /* END CUDA_EXEC_DEPRECATED */ - -/** - * \defgroup CUDA_GRAPH Graph Management - * - * ___MANBRIEF___ graph management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graph management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates a graph - * - * Creates an empty graph, which is returned via \p phGraph. - * - * \param phGraph - Returns newly created graph - * \param flags - Graph creation flags, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphInstantiate, - * ::cuGraphDestroy, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); - -/** - * \brief Creates a kernel execution node and adds it to a graph - * - * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The CUDA_KERNEL_NODE_PARAMS structure is defined as: - * - * \code - * typedef struct CUDA_KERNEL_NODE_PARAMS_st { - * CUfunction func; - * unsigned int gridDimX; - * unsigned int gridDimY; - * unsigned int gridDimZ; - * unsigned int blockDimX; - * unsigned int blockDimY; - * unsigned int blockDimZ; - * unsigned int sharedMemBytes; - * void **kernelParams; - * void **extra; - * } CUDA_KERNEL_NODE_PARAMS; - * \endcode - * - * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x - * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains - * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p func can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N - * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, - * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual - * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need - * to be specified as that information is retrieved directly from the kernel's image. 
- * - * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single - * buffer that is passed in via \p extra. This places the burden on the application of knowing each - * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists - * to allow this function to take additional less commonly used arguments. \p extra specifies - * a list of names of extra settings and their corresponding values. Each extra setting name is - * immediately followed by the corresponding value. The list must be terminated with either NULL or - * CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer - * containing all the kernel parameters for launching kernel - * \p func; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t - * containing the size of the buffer specified with - * ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both - * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). - * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. - * - * The \p kernelParams or \p extra array, as well as the argument values it points to, - * are copied during this call. - * - * \note Kernels launched using graphs must not use texture and surface references. Reading or - * writing through any texture or surface reference is undefined behavior. - * This restriction does not apply to texture and surface objects. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the GPU execution node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuLaunchCooperativeKernel, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a kernel node's parameters - * - * Returns the parameters of kernel node \p hNode in \p nodeParams. - * The \p kernelParams or \p extra array returned in \p nodeParams, - * as well as the argument values it points to, are owned by the node. - * This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphKernelNodeSetParams to update the - * parameters of this node. - * - * The params will contain either \p kernelParams or \p extra, - * according to which of these was most recently set on the node. 
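Pulling the kernel-node pieces together, a minimal sketch that creates an empty graph and adds one root kernel node (the kernel signature `k(float *out, int n)` and the launch geometry are assumptions; graph instantiation and launch are omitted):

\code
#include <cuda.h>
#include <string.h>

/* Sketch: empty graph plus a single kernel node with no dependencies. */
static CUresult build_kernel_graph(CUfunction k, CUdeviceptr out, int n,
                                   CUgraph *graphOut, CUgraphNode *nodeOut)
{
    CUresult err = cuGraphCreate(graphOut, 0);
    if (err != CUDA_SUCCESS)
        return err;

    void *params[] = { &out, &n };   /* copied by cuGraphAddKernelNode */
    CUDA_KERNEL_NODE_PARAMS np;
    memset(&np, 0, sizeof(np));
    np.func = k;
    np.gridDimX = 64;   np.gridDimY = 1;  np.gridDimZ = 1;
    np.blockDimX = 256; np.blockDimY = 1; np.blockDimZ = 1;
    np.sharedMemBytes = 0;
    np.kernelParams = params;
    np.extra = NULL;

    return cuGraphAddKernelNode(nodeOut, *graphOut, NULL /* no deps */, 0, &np);
}
\endcode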
- * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams - */ -CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a kernel node's parameters - * - * Sets the parameters of kernel node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeGetParams - */ -CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a memcpy node and adds it to a graph - * - * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will perform the memcpy described by \p copyParams. - * See ::cuMemcpy3D() for a description of the structure and its restrictions. - * - * Memcpy nodes have some additional restrictions with regards to managed memory, if the - * system contains at least one device which has a zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer - * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed - * for those operand(s). The managed memory will be treated as residing on either the - * host or the device, depending on which memory type is specified. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param copyParams - Parameters for the memory copy - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Returns a memcpy node's parameters - * - * Returns the parameters of memcpy node \p hNode in \p nodeParams. 
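A minimal sketch of ::cuGraphAddMemcpyNode, assuming the usual ::CUDA_MEMCPY3D members (srcMemoryType, srcDevice, dstMemoryType, dstDevice, WidthInBytes, Height, Depth) declared earlier in this header; the graph, context, and device pointers are placeholders. A 1-D copy is expressed as a 3-D copy with height and depth of 1.

\code
#include <cuda.h>
#include <string.h>

/* Add a 1-D device-to-device copy of `nbytes` from `src` to `dst` as a graph node.
 * `ctx` is the context the copy should run in. */
static CUresult addCopyNode(CUgraph graph, CUcontext ctx,
                            CUdeviceptr dst, CUdeviceptr src, size_t nbytes,
                            CUgraphNode *nodeOut)
{
    CUDA_MEMCPY3D copy;
    memset(&copy, 0, sizeof(copy));                /* unused fields must be zero */
    copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.srcDevice     = src;
    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.dstDevice     = dst;
    copy.WidthInBytes  = nbytes;                   /* 1-D copy: height and depth are 1 */
    copy.Height        = 1;
    copy.Depth         = 1;

    return cuGraphAddMemcpyNode(nodeOut, graph, NULL, 0, &copy, ctx);
}
\endcode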
- * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Sets a memcpy node's parameters - * - * Sets the parameters of memcpy node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeGetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Creates a memset node and adds it to a graph - * - * Creates a new memset node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The element size must be 1, 2, or 4 bytes. - * When the graph is launched, the node will perform the memset described by \p memsetParams. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param memsetParams - Parameters for the memory set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode - */ -CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Returns a memset node's parameters - * - * Returns the parameters of memset node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a memset node's parameters - * - * Sets the parameters of memset node \p hNode to \p nodeParams. 
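A minimal sketch of ::cuGraphAddMemsetNode, assuming the usual ::CUDA_MEMSET_NODE_PARAMS members (dst, pitch, value, elementSize, width, height); the graph, context, and destination pointer are placeholders.

\code
#include <cuda.h>

/* Add a node that zero-fills `count` 32-bit words starting at `dst`. */
static CUresult addZeroFillNode(CUgraph graph, CUcontext ctx,
                                CUdeviceptr dst, size_t count,
                                CUgraphNode *nodeOut)
{
    CUDA_MEMSET_NODE_PARAMS p = {0};
    p.dst         = dst;
    p.value       = 0;          /* value written to every element */
    p.elementSize = 4;          /* must be 1, 2, or 4 bytes */
    p.width       = count;      /* elements per row */
    p.height      = 1;          /* single row, so pitch is ignored */
    p.pitch       = 0;

    return cuGraphAddMemsetNode(nodeOut, graph, NULL, 0, &p, ctx);
}
\endcode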
- * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeGetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a host execution node and adds it to a graph - * - * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will invoke the specified CPU function. - * Host nodes are not supported under MPS with pre-Volta GPUs. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the host node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a host node's parameters - * - * Returns the parameters of host node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams - */ -CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a host node's parameters - * - * Sets the parameters of host node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeGetParams - */ -CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a child graph node and adds it to a graph - * - * Creates a new node which executes an embedded graph, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. 
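A minimal sketch of ::cuGraphAddHostNode, assuming ::CUDA_HOST_NODE_PARAMS carries the usual fn/userData pair and that ::CUhostFn callbacks take a single void* argument; the dependency node is a placeholder.

\code
#include <cuda.h>
#include <stdio.h>

/* Host callback invoked when the node executes; runs on a CPU thread. */
static void CUDA_CB hostStep(void *userData)
{
    printf("graph reached host node: %s\n", (const char *)userData);
}

/* Add a host node that calls hostStep("checkpoint") after node `dep`. */
static CUresult addHostNode(CUgraph graph, CUgraphNode dep, CUgraphNode *nodeOut)
{
    CUDA_HOST_NODE_PARAMS p = {0};
    p.fn       = hostStep;
    p.userData = (void *)"checkpoint";

    return cuGraphAddHostNode(nodeOut, graph, &dep, 1, &p);
}
\endcode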
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * If \p hGraph contains allocation or free nodes, this call will return an error. - * - * The node executes an embedded child graph. The child graph is cloned in this call. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param childGraph - The graph to clone into this node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); - -/** - * \brief Gets a handle to the embedded graph of a child graph node - * - * Gets a handle to the embedded graph in a child graph node. This call - * does not clone the graph. Changes to the graph will be reflected in - * the node, and the node retains ownership of the graph. - * - * Allocation and free nodes cannot be added to the returned graph. - * Attempting to do so will return an error. - * - * \param hNode - Node to get the embedded graph for - * \param phGraph - Location to store a handle to the graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); - -/** - * \brief Creates an empty node and adds it to a graph - * - * Creates a new node which performs no operation, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * An empty node performs no operation during execution, but can be used for - * transitive ordering. For example, a phased execution graph with 2 groups of n - * nodes with a barrier between them can be represented using an empty node and - * 2*n dependency edges, rather than no empty node and n^2 dependency edges. 
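A minimal sketch of the barrier pattern described above, combined with ::cuGraphAddChildGraphNode; the parent graph, the n predecessor nodes, and the child graph are placeholders.

\code
#include <cuda.h>

/* Embed `childGraph` (cloned at this call) after the `n` nodes in `groupA`,
 * using one empty node as a barrier so only 2*n edges are needed instead of
 * one edge from every node in groupA to every node of the next phase. */
static CUresult addPhase(CUgraph graph, const CUgraphNode *groupA, size_t n,
                         CUgraph childGraph, CUgraphNode *childNodeOut)
{
    CUgraphNode barrier;
    CUresult err = cuGraphAddEmptyNode(&barrier, graph, groupA, n);
    if (err != CUDA_SUCCESS) return err;

    /* The child graph node depends only on the barrier node. */
    return cuGraphAddChildGraphNode(childNodeOut, graph, &barrier, 1, childGraph);
}
\endcode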
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); - -/** - * \brief Creates an event record node and adds it to a graph - * - * Creates a new event record node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * Each launch of the graph will record \p event to capture execution of the - * node's dependencies. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event record node - * - * Returns the event of event record node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event record node's event - * - * Sets the event of event record node \p hNode to \p event. 
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an event wait node and adds it to a graph - * - * Creates a new event wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The graph node will wait for all work captured in \p event. See ::cuEventRecord() - * for details on what is captured by an event. \p event may be from a different context - * or device than the launch stream. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event wait node - * - * Returns the event of event wait node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event wait node's event - * - * Sets the event of event wait node \p hNode to \p event. 
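A minimal sketch of ::cuGraphAddEventRecordNode and ::cuGraphAddEventWaitNode used to order two separately launched graphs through one event; all handles are placeholders and the event is assumed to have been created with ::cuEventCreate.

\code
#include <cuda.h>

/* Record `event` after node `producer` in `producerGraph`, and make a root of
 * `consumerGraph` wait on it. The two graphs can then be launched into
 * different streams and still be ordered through the event. */
static CUresult linkGraphsWithEvent(CUgraph producerGraph, CUgraphNode producer,
                                    CUgraph consumerGraph, CUevent event,
                                    CUgraphNode *recordOut, CUgraphNode *waitOut)
{
    CUresult err = cuGraphAddEventRecordNode(recordOut, producerGraph, &producer, 1, event);
    if (err != CUDA_SUCCESS) return err;

    /* No dependencies: the wait node is a root of the consumer graph. */
    return cuGraphAddEventWaitNode(waitOut, consumerGraph, NULL, 0, event);
}
\endcode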
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an external semaphore signal node and adds it to a graph - * - * Creates a new external semaphore signal node and adds it to \p hGraph with \p - * numDependencies dependencies specified via \p dependencies and arguments specified - * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the - * node will be placed at the root of the graph. \p dependencies may not have any - * duplicate entries. A handle to the new node will be returned in \p phGraphNode. - * - * Performs a signal operation on a set of externally allocated semaphore objects - * when the node is launched. The operation(s) will occur after all of the node's - * dependencies have completed. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresSignalNodeGetParams, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore signal node's parameters - * - * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the - * parameters of this node. 
- * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore signal node's parameters - * - * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an external semaphore wait node and adds it to a graph - * - * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * Performs a wait operation on a set of externally allocated semaphore objects - * when the node is launched. The node's dependencies will not be launched until - * the wait operation has completed. 
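A minimal sketch of ::cuGraphAddExternalSemaphoresSignalNode, assuming the extSemArray/paramsArray/numExtSems members named above and the usual params.fence.value field of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; the imported semaphore and the dependency node are placeholders.

\code
#include <cuda.h>
#include <string.h>

/* Signal one previously imported external semaphore (e.g. a timeline
 * semaphore) to `fenceValue` once node `dep` has completed. */
static CUresult addSignalNode(CUgraph graph, CUgraphNode dep,
                              CUexternalSemaphore sem, unsigned long long fenceValue,
                              CUgraphNode *nodeOut)
{
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams;
    memset(&sigParams, 0, sizeof(sigParams));
    sigParams.params.fence.value = fenceValue;

    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS nodeParams;
    memset(&nodeParams, 0, sizeof(nodeParams));
    nodeParams.extSemArray = &sem;                 /* one semaphore ... */
    nodeParams.paramsArray = &sigParams;           /* ... with one set of signal params */
    nodeParams.numExtSems  = 1;

    return cuGraphAddExternalSemaphoresSignalNode(nodeOut, graph, &dep, 1, &nodeParams);
}
\endcode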
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresWaitNodeGetParams, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore wait node's parameters - * - * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the - * parameters of this node. - * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore wait node's parameters - * - * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an allocation node and adds it to a graph - * - * Creates a new allocation node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in - * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. - * - * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, - * the allocation can be accessed by nodes ordered after the allocation node but before the free node. - * These allocations cannot be freed outside the owning graph, and they can only be freed once in the - * owning graph. - * - * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the - * graph which are ordered after the allocation node, but also by stream operations ordered after the - * graph's execution but before the allocation is freed. - * - * Allocations which are not freed in the same graph can be freed by: - * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; - * - launching a graph with a free node for that allocation; or - * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes - * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. - * - * It is not possible to free an allocation in both the owning graph and another graph. If the allocation - * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed - * in another graph, a free node can no longer be added to the owning graph. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. - * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a memory alloc node's parameters - * - * Returns the parameters of a memory alloc node \p hNode in \p params_out. - * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the - * node. 
This memory remains valid until the node is destroyed. The returned - * parameters must not be modified. - * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams - */ -CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); - -/** - * \brief Creates a memory free node and adds it to a graph - * - * Creates a new memory free node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param dptr - Address of memory to free - * - * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: - * - an allocation twice in the same graph. - * - an address that was not returned by an allocation node. - * - an invalid address. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. - * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); - -/** - * \brief Returns a memory free node's parameters - * - * Returns the address of a memory free node \p hNode in \p dptr_out. - * - * \param hNode - Node to get the parameters for - * \param dptr_out - Pointer to return the device address - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams - */ -CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); - -/** - * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. 
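A minimal sketch of ::cuGraphAddMemAllocNode and ::cuGraphAddMemFreeNode, assuming ::CUDA_MEM_ALLOC_NODE_PARAMS exposes the poolProps, bytesize, and dptr members referred to above and that ::CUmemPoolProps takes the usual allocType/location fields; the device and graph handles are placeholders.

\code
#include <cuda.h>
#include <string.h>

/* Add an allocation node for `bytes` of device memory on `device`, followed by
 * a node that frees it again. Nodes ordered between the two may use *dptrOut. */
static CUresult addScratchAllocation(CUgraph graph, CUdevice device, size_t bytes,
                                     CUgraphNode *allocOut, CUgraphNode *freeOut,
                                     CUdeviceptr *dptrOut)
{
    CUDA_MEM_ALLOC_NODE_PARAMS p;
    memset(&p, 0, sizeof(p));
    p.poolProps.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
    p.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    p.poolProps.location.id   = device;
    p.bytesize                = bytes;

    CUresult err = cuGraphAddMemAllocNode(allocOut, graph, NULL, 0, &p);
    if (err != CUDA_SUCCESS) return err;
    *dptrOut = p.dptr;                  /* address is fixed across instantiations and launches */

    /* Freeing in the same graph: the free node must be ordered after all users
     * of the allocation; here it depends directly on the allocation node. */
    return cuGraphAddMemFreeNode(freeOut, graph, allocOut, 1, p.dptr);
}
\endcode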
- * - * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are - * freed back to the operating system. - * - * \param device - The device for which cached memory should be freed. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode, - * ::cuDeviceSetGraphMemAttribute, - * ::cuDeviceGetGraphMemAttribute - */ -CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); - -/** - * \brief Query asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - * \param device - Specifies the scope of the query - * \param attr - attribute to get - * \param value - retrieved value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceSetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Set asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - * \param device - Specifies the scope of the attribute - * \param attr - attribute to set - * \param value - pointer to value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceGetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Clones a graph - * - * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. - * All parameters are copied into the cloned graph. The original graph may be modified - * after this call without affecting the clone. - * - * Child graph nodes in the original graph are recursively copied into the clone. - * - * \param phGraphClone - Returns newly created cloned graph - * \param originalGraph - Graph to clone - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); - -/** - * \brief Finds a cloned version of a node - * - * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode - * in the original graph. - * - * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone.
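A minimal sketch of ::cuDeviceGetGraphMemAttribute and ::cuDeviceGraphMemTrim, assuming the attribute values are reported as cuuint64_t; the device handle is a placeholder.

\code
#include <cuda.h>
#include <stdio.h>

/* Print how much memory the graph allocator currently holds for `device`,
 * then return unused cached blocks to the OS. */
static void printGraphMemUsage(CUdevice device)
{
    cuuint64_t used = 0, reserved = 0;
    cuDeviceGetGraphMemAttribute(device, CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, &used);
    cuDeviceGetGraphMemAttribute(device, CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, &reserved);
    printf("graph mem: %llu bytes used, %llu bytes reserved\n",
           (unsigned long long)used, (unsigned long long)reserved);

    cuDeviceGraphMemTrim(device);
}
\endcode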
- * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to - * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have - * been removed. The cloned node is then returned via \p phClonedNode. - * - * \param phNode - Returns handle to the cloned node - * \param hOriginalNode - Handle to the original node - * \param hClonedGraph - Cloned graph to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); - -/** - * \brief Returns a node's type - * - * Returns the node type of \p hNode in \p type. - * - * \param hNode - Node to query - * \param type - Pointer to return the node type - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); - -/** - * \brief Returns a graph's nodes - * - * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this - * function will return the number of nodes in \p numNodes. Otherwise, - * \p numNodes entries will be filled in. If \p numNodes is higher than the actual - * number of nodes, the remaining entries in \p nodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numNodes. - * - * \param hGraph - Graph to query - * \param nodes - Pointer to return the nodes - * \param numNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); - -/** - * \brief Returns a graph's root nodes - * - * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this - * function will return the number of root nodes in \p numRootNodes. Otherwise, - * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual - * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numRootNodes. 
- * - * \param hGraph - Graph to query - * \param rootNodes - Pointer to return the root nodes - * \param numRootNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); - -/** - * \brief Returns a graph's dependency edges - * - * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding - * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the - * node in \p from[i]. \p from and \p to may both be NULL, in which - * case this function only returns the number of edges in \p numEdges. Otherwise, - * \p numEdges entries will be filled in. If \p numEdges is higher than the actual - * number of edges, the remaining entries in \p from and \p to will be set to NULL, and - * the number of edges actually returned will be written to \p numEdges. - * - * \param hGraph - Graph to get the edges from - * \param from - Location to return edge endpoints - * \param to - Location to return edge endpoints - * \param numEdges - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); - -/** - * \brief Returns a node's dependencies - * - * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this - * function will return the number of dependencies in \p numDependencies. Otherwise, - * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual - * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numDependencies. - * - * \param hNode - Node to query - * \param dependencies - Pointer to return the dependencies - * \param numDependencies - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependentNodes, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); - -/** - * \brief Returns a node's dependent nodes - * - * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which - * case this function will return the number of dependent nodes in \p numDependentNodes. - * Otherwise, \p numDependentNodes entries will be filled in. 
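A minimal sketch of the query-then-fill pattern described above for ::cuGraphGetNodes; the same shape applies to ::cuGraphGetRootNodes, ::cuGraphGetEdges, and the dependency queries. The graph handle is a placeholder.

\code
#include <cuda.h>
#include <stdlib.h>

/* Enumerate the nodes of `graph` with the usual two-call pattern: first query
 * the count with a NULL array, then allocate and fetch the handles. */
static CUresult listNodes(CUgraph graph)
{
    size_t count = 0;
    CUresult err = cuGraphGetNodes(graph, NULL, &count);   /* query count only */
    if (err != CUDA_SUCCESS || count == 0) return err;

    CUgraphNode *nodes = (CUgraphNode *)malloc(count * sizeof(CUgraphNode));
    if (nodes == NULL) return CUDA_ERROR_OUT_OF_MEMORY;

    err = cuGraphGetNodes(graph, nodes, &count);           /* fill `count` entries */
    /* ... inspect nodes[0..count-1], e.g. with cuGraphNodeGetType ... */
    free(nodes);
    return err;
}
\endcode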
If \p numDependentNodes is - * higher than the actual number of dependent nodes, the remaining entries in - * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will - * be returned in \p numDependentNodes. - * - * \param hNode - Node to query - * \param dependentNodes - Pointer to return the dependent nodes - * \param numDependentNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependencies, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); - -/** - * \brief Adds dependency edges to a graph - * - * The number of dependencies to be added is defined by \p numDependencies - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. - * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying an existing dependency will return an error. - * - * \param hGraph - Graph to which dependencies are added - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be added - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphRemoveDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Removes dependency edges from a graph - * - * The number of \p dependencies to be removed is defined by \p numDependencies. - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. - * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying a non-existing dependency will return an error. - * - * Dependencies cannot be removed from graphs which contain allocation or free nodes. - * Any attempt to do so will return an error. - * - * \param hGraph - Graph from which to remove dependencies - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be removed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Remove a node from the graph - * - * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes - * on \p hNode and vice versa. - * - * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. - * Any attempt to do so will return an error. 
- * - * \param hNode - Node to remove - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. - * - * If there are any errors, diagnostic information may be returned in \p errorNode and - * \p logBuffer. This is the primary way to inspect instantiation errors. The output - * will be null terminated unless the diagnostics overflow - * the buffer. In this case, they will be truncated, and the last byte can be - * inspected to determine if truncation occurred. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param phErrorNode - In case of an instantiation error, this may be modified to - * indicate a node contributing to the error - * \param logBuffer - A character buffer to store diagnostic messages - * \param bufferSize - Size of the log buffer in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiateWithFlags, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. - * - * The \p flags parameter controls the behavior of instantiation and subsequent - * graph launches. Valid flags are: - * - * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a - * graph containing memory allocation nodes to automatically free any - * unfreed memory allocations before the graph is relaunched. - * - * If \p hGraph contains any allocation or free nodes, there can be at most one - * executable graph in existence for that graph at a time. - * - * An attempt to instantiate a second executable graph before destroying the first - * with ::cuGraphExecDestroy will result in an error. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); - -/** - * \brief Sets the parameters for a kernel node in the given graphExec - * - * Sets the parameters of a kernel node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. All \p nodeParams - * fields may change, but the following restrictions apply to \p func updates: - * - * - The owning context of the function cannot change. - * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated - * to a function which uses CDP - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - kernel node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for a memcpy node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p copyParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The source and destination memory in \p copyParams must be allocated from the same - * contexts as the original source and destination memory. Both the instantiation-time - * memory operands and the memory operands in \p copyParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or - * either the original or new memory operands are multidimensional. 
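A minimal sketch tying ::cuGraphInstantiate, ::cuGraphLaunch, and ::cuGraphExecKernelNodeSetParams together: instantiate once, launch, patch the kernel node's grid size in the executable graph only, and launch again. The kernel node, its original parameters, and the stream are placeholders.

\code
#include <cuda.h>
#include <stdio.h>

/* Instantiate `graph`, launch it once, then update the kernel node's grid size
 * in the executable graph and launch it again without re-instantiating. */
static CUresult runTwice(CUgraph graph, CUgraphNode kernelNode,
                         CUDA_KERNEL_NODE_PARAMS params, CUstream stream)
{
    char log[256] = {0};
    CUgraphNode errNode = NULL;
    CUgraphExec exec;
    CUresult err = cuGraphInstantiate(&exec, graph, &errNode, log, sizeof(log));
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "instantiation failed: %s\n", log);
        return err;
    }

    err = cuGraphLaunch(exec, stream);
    if (err == CUDA_SUCCESS) {
        params.gridDimX *= 2;                       /* only the executable copy changes */
        err = cuGraphExecKernelNodeSetParams(exec, kernelNode, &params);
        if (err == CUDA_SUCCESS) err = cuGraphLaunch(exec, stream);
    }
    if (err == CUDA_SUCCESS) err = cuStreamSynchronize(stream);

    cuGraphExecDestroy(exec);
    return err;
}
\endcode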
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memcpy node from the graph which was used to instantiate graphExec - * \param copyParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a memset node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p memsetParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The destination memory in \p memsetParams must be allocated from the same - * contexts as the original destination memory. Both the instantiation-time - * memory operand and the memory operand in \p memsetParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or - * either the original or new memory operand are multidimensional. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memset node from the graph which was used to instantiate graphExec - * \param memsetParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a host node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p nodeParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param nodeParams - The updated parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Updates node parameters in the child graph node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained - * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. - * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. - * Changed edges to and from \p hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. \p hNode is also - * not modified by this call. - * - * The topology of \p childGraph, as well as the node insertion order, must match that - * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions - * on what can be updated in an instantiated graph. The update is recursive, so child graph - * nodes contained within the top level child graph will also be updated. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param childGraph - The graph supplying the updated parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); - -/** - * \brief Sets the event for an event record node in the given graphExec - * - * Sets the event of an event record node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
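- *
- * A minimal usage sketch (assuming \p hGraphExec and the event record node \p hNode
- * already exist, and that ::cuEventCreate is available to create the replacement event):
- * \code
- *   CUevent newEvent;
- *   cuEventCreate(&newEvent, CU_EVENT_DEFAULT);
- *   CUresult res = cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, newEvent);
- * \endcode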
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event record node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the event for an event wait node in the given graphExec - * - * Sets the event of an event wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event wait node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the parameters for an external semaphore signal node in the given graphExec - * - * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore signal node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for an external semaphore wait node in the given graphExec - * - * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore wait node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Enables or disables the specified node in the given graphExec - * - * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent - * to empty nodes until they are reenabled. Existing node parameters are not affected by - * disabling/enabling the node. - * - * The node is identified by the corresponding node \p hNode in the non-executable - * graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
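- *
- * A minimal usage sketch (assuming \p hGraphExec and a kernel node \p hNode kept from
- * graph construction):
- * \code
- *   cuGraphNodeSetEnabled(hGraphExec, hNode, 0);   // subsequent launches skip the node
- *   // ... launch the graph one or more times without the node ...
- *   cuGraphNodeSetEnabled(hGraphExec, hNode, 1);   // restore the node's original work
- *
- *   unsigned int isEnabled;
- *   cuGraphNodeGetEnabled(hGraphExec, hNode, &isEnabled);   // isEnabled is now 1
- * \endcode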
- *
- * \note Currently only kernel nodes are supported.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode - Node from the graph from which graphExec was instantiated
- * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeGetEnabled,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
-
-/**
- * \brief Query whether a node in the given graphExec is enabled
- *
- * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
- *
- * The node is identified by the corresponding node \p hNode in the non-executable
- * graph, from which the executable graph was instantiated.
- *
- * \p hNode must not have been removed from the original graph.
- *
- * \note Currently only kernel nodes are supported.
- *
- * \param hGraphExec - The executable graph in which to query the specified node
- * \param hNode - Node from the graph from which graphExec was instantiated
- * \param isEnabled - Location to return the enabled status of the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeSetEnabled,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
-
-/**
- * \brief Uploads an executable graph in a stream
- *
- * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
- * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
- * previous work in \p hStream and any previous launches of \p hGraphExec.
- * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
- *
- * \param hGraphExec - Executable graph to upload
- * \param hStream - Stream in which to upload the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Launches an executable graph in a stream
- *
- * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
- * at a time. Each launch is ordered behind both any previous work in \p hStream
- * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
- * instantiated multiple times into multiple executable graphs.
- *
- * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
- * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
- * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
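- *
- * A minimal usage sketch (assuming \p hGraphExec was instantiated earlier and \p hStream
- * is an existing stream; ::cuStreamSynchronize is assumed available for the final wait):
- * \code
- *   cuGraphUpload(hGraphExec, hStream);   // optional: pay the upload cost up front
- *   cuGraphLaunch(hGraphExec, hStream);   // run the whole graph as a single unit of work
- *   cuStreamSynchronize(hStream);         // wait for this launch to complete
- * \endcode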
- *
- * \param hGraphExec - Executable graph to launch
- * \param hStream - Stream in which to launch the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Destroys an executable graph
- *
- * Destroys the executable graph specified by \p hGraphExec, as well
- * as all of its executable nodes. If the executable graph is
- * in-flight, it will not be terminated, but rather freed
- * asynchronously on completion.
- *
- * \param hGraphExec - Executable graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
-
-/**
- * \brief Destroys a graph
- *
- * Destroys the graph specified by \p hGraph, as well as all of its nodes.
- *
- * \param hGraph - Graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
-
-/**
- * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
- *
- * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
- * node parameters in a topologically identical graph specified by \p hGraph.
- *
- * Limitations:
- *
- * - Kernel nodes:
- *   - The owning context of the function cannot change.
- *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
- *     to a function which uses CDP.
- *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
- * - Memset and memcpy nodes:
- *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
- *   - The source/destination memory must be allocated from the same contexts as the original
- *     source/destination memory.
- *   - Only 1D memsets can be changed.
- * - Additional memcpy node restrictions:
- *   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
- *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
- * - External semaphore wait nodes and record nodes:
- *   - Changing the number of semaphores is not supported.
- *
- * Note: The API may add further restrictions in future releases. The return code should always be checked.
- *
- * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under
- * the following conditions:
- *
- * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
- *   the pairless node from \p hGraph.
- * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
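- *
- * A typical check-and-rebuild pattern is sketched below (the individual
- * \p updateResult_out codes are listed afterwards; error handling is elided and
- * \p hGraphExec is assumed to come from an earlier instantiation of \p hGraph):
- * \code
- *   CUgraphNode errorNode = NULL;
- *   CUgraphExecUpdateResult updateResult;
- *   CUresult res = cuGraphExecUpdate(hGraphExec, hGraph, &errorNode, &updateResult);
- *   if (res == CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE) {
- *       // The update was rejected; updateResult and errorNode describe why.
- *       // Destroy the stale executable graph and instantiate hGraph again.
- *       cuGraphExecDestroy(hGraphExec);
- *   }
- * \endcode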
- *
- * cuGraphExecUpdate sets \p updateResult_out to:
- * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
- *   \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
- *   way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
- * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
- *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
- *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
- *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
- *
- * If \p updateResult_out isn't set in one of the situations described above, the update check passes
- * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens
- * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
- * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
- *
- * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns
- * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included
- * changes which violated constraints specific to instantiated graph update.
- *
- * \param hGraphExec The instantiated graph to be updated
- * \param hGraph The graph containing the updated parameters
- * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
- * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- */
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
-
-/**
- * \brief Copies attributes from source node to destination node.
- *
- * Copies attributes from source node \p src to destination node \p dst.
- * Both nodes must have the same context.
- *
- * \param[out] dst Destination node
- * \param[in] src Source node
- * For a list of attributes see ::CUkernelNodeAttrID
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
-
-/**
- * \brief Queries node attribute.
- *
- * Queries attribute \p attr from node \p hNode and stores it in corresponding
- * member of \p value_out.
- *
- * \param[in] hNode
- * \param[in] attr
- * \param[out] value_out
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                                               CUkernelNodeAttrValue *value_out);
-
-/**
- * \brief Sets node attribute.
- * - * Sets attribute \p attr on node \p hNode from corresponding attribute of - * \p value. - * - * \param[out] hNode - * \param[in] attr - * \param[out] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, - const CUkernelNodeAttrValue *value); - -/** - * \brief Write a DOT file describing graph structure - * - * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. - * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. - * \p flags can be specified to write more detailed information about each node type such as - * parameter values, kernel attributes, node and function handles. - * - * \param hGraph - The graph to create a DOT file from - * \param path - The path to write the DOT file to - * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OPERATING_SYSTEM - */ -CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); - -/** - * \brief Create a user object - * - * Create a user object with the specified destructor callback and initial reference count. The - * initial references are owned by the caller. - * - * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they - * are executed by a shared internal thread. Another thread may be signaled to perform such - * actions, if it does not block forward progress of tasks scheduled through CUDA. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object_out - Location to return the user object handle - * \param ptr - The pointer to pass to the destroy function - * \param destroy - Callback to free the user object when it is no longer in use - * \param initialRefcount - The initial refcount to create the object with, typically 1. The - * initial references are owned by the calling thread. - * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, - * which is the only defined flag. This indicates that the destroy - * callback cannot be waited on by any CUDA API. Users requiring - * synchronization of the callback should signal its completion - * manually. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, - unsigned int initialRefcount, unsigned int flags); - -/** - * \brief Retain a reference to a user object - * - * Retains new references to a user object. The new references are owned by the caller. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to retain - * \param count - The number of references to retain, typically 1. Must be nonzero - * and not larger than INT_MAX. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); - -/** - * \brief Release a reference to a user object - * - * Releases user object references owned by the caller. The object's destructor is invoked if - * the reference count reaches zero. - * - * It is undefined behavior to release references not owned by the caller, or to use a user - * object handle after all references are released. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to release - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); - -/** - * \brief Retain a reference to a user object from a graph - * - * Creates or moves user object references that will be owned by a CUDA graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph to associate the reference with - * \param object - The user object to retain a reference for - * \param count - The number of references to add to the graph, typically 1. Must be - * nonzero and not larger than INT_MAX. - * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references - * from the calling thread, rather than create new references. Pass 0 - * to create new references. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); - -/** - * \brief Release a user object reference from a graph - * - * Releases user object references owned by a graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph that will release the reference - * \param object - The user object to release a reference for - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); - -/** @} */ /* END CUDA_GRAPH */ - -/** - * \defgroup CUDA_OCCUPANCY Occupancy - * - * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the occupancy calculation functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns occupancy of a function - * - * Returns in \p *numBlocks the number of the maximum active blocks per - * streaming multiprocessor. 
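- *
- * A minimal usage sketch (assuming \p func is a ::CUfunction handle obtained elsewhere,
- * e.g. from a loaded module):
- * \code
- *   int numBlocks = 0;
- *   int blockSize = 256;      // block size the kernel is intended to launch with
- *   size_t dynamicSMem = 0;   // no per-block dynamic shared memory in this example
- *   cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, func, blockSize, dynamicSMem);
- *   // numBlocks now holds the maximum resident blocks per SM for this configuration.
- * \endcode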
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
-
-/**
- * \brief Returns occupancy of a function
- *
- * Returns in \p *numBlocks the number of the maximum active blocks per
- * streaming multiprocessor.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platforms where global caching affects
- *   occupancy. On such platforms, if caching is enabled, but
- *   per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching
- *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
- *   the occupancy calculator return 0 in such cases. More information
- *   about this feature can be found in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param flags - Requested behavior for the occupancy calculator
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * Returns in \p *blockSize a reasonable block size that can achieve
- * the maximum occupancy (or, the maximum number of active warps with
- * the fewest blocks per multiprocessor), and in \p *minGridSize the
- * minimum grid size to achieve the maximum occupancy.
- *
- * If \p blockSizeLimit is 0, the configurator will use the maximum
- * block size permitted by the device / function instead.
- *
- * If per-block dynamic shared memory allocation is not needed, the
- * user should leave both \p blockSizeToDynamicSMemSize and \p
- * dynamicSMemSize as 0.
- *
- * If per-block dynamic shared memory allocation is needed, then if
- * the dynamic shared memory size is constant regardless of block
- * size, the size should be passed through \p dynamicSMemSize, and \p
- * blockSizeToDynamicSMemSize should be NULL.
- *
- * Otherwise, if the per-block dynamic shared memory size varies with
- * different block sizes, the user needs to provide a unary function
- * through \p blockSizeToDynamicSMemSize that computes the dynamic
- * shared memory needed by \p func for any given block size. \p
- * dynamicSMemSize is ignored. An example signature is:
- *
- * \code
- * // Take block size, returns dynamic shared memory needed
- * size_t blockToSmem(int blockSize);
- * \endcode
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
- * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
- * \param func - Kernel for which launch configuration is calculated
- * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
- * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit - The maximum block size \p func is designed to handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxPotentialBlockSize
- */
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
- * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
- * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
- * parameter.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxPotentialBlockSize;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platforms where global caching affects
- *   occupancy. On such platforms, the launch configuration that
- *   produces maximal occupancy might not support global
- *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
- *   guarantees that the produced launch configuration is compatible
- *   with global caching at a potential cost of occupancy. More information
- *   about this feature can be found in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
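- *
- * A minimal usage sketch (assuming \p func is a ::CUfunction handle and no per-block
- * dynamic shared memory is required):
- * \code
- *   int minGridSize = 0, blockSize = 0;
- *   cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, func,
- *                                             NULL,   // blockSizeToDynamicSMemSize
- *                                             0,      // dynamicSMemSize
- *                                             0,      // blockSizeLimit: no limit
- *                                             CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE);
- * \endcode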
- * - * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy - * \param blockSize - Returned maximum block size that can achieve the maximum occupancy - * \param func - Kernel for which launch configuration is calculated - * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size - * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes - * \param blockSizeLimit - The maximum block size \p func is designed to handle - * \param flags - Options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxPotentialBlockSizeWithFlags - */ -CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); - -/** - * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM - * - * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. - * - * \param dynamicSmemSize - Returned maximum dynamic shared memory - * \param func - Kernel function for which occupancy is calculated - * \param numBlocks - Number of blocks to fit on SM - * \param blockSize - Size of the blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - */ -CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); - -/** @} */ /* END CUDA_OCCUPANCY */ - -/** - * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated texture reference management functions of the - * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated texture reference management - * functions of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Binds an array as a texture reference - * - * \deprecated - * - * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to - * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is - * unbound. 
- * - * \param hTexRef - Texture reference to bind - * \param hArray - Array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Binds a mipmapped array to a texture reference - * - * \deprecated - * - * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. - * Any previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. - * Any CUDA array previously bound to \p hTexRef is unbound. - * - * \param hTexRef - Texture reference to bind - * \param hMipmappedArray - Mipmapped array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); - -/** - * \brief Binds an address as a texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Since the hardware enforces an alignment requirement on texture base - * addresses, ::cuTexRefSetAddress() passes back a byte offset in - * \p *ByteOffset that must be applied to texture fetches in order to read from - * the desired memory. This offset must be divided by the texel size and - * passed to kernels that read from the texture so they can be applied to the - * ::tex1Dfetch() function. - * - * If the device memory pointer was returned from ::cuMemAlloc(), the offset - * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. - * - * The total number of elements (or texels) in the linear address range - * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. - * The number of elements is computed as (\p bytes / bytesPerElement), - * where bytesPerElement is determined from the data format and number of - * components set using ::cuTexRefSetFormat(). 
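- *
- * A minimal usage sketch (assuming \p hTexRef was obtained elsewhere, \p dptr points to
- * linear device memory holding \p numTexels floats, and ::CU_AD_FORMAT_FLOAT is the
- * single-component float format):
- * \code
- *   size_t byteOffset = 0;
- *   cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 1);
- *   cuTexRefSetAddress(&byteOffset, hTexRef, dptr, numTexels * sizeof(float));
- *   // byteOffset, divided by the texel size, must be added to the coordinates
- *   // passed to tex1Dfetch() when reading from this texture.
- * \endcode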
- * - * \param ByteOffset - Returned byte offset - * \param hTexRef - Texture reference to bind - * \param dptr - Device pointer to bind - * \param bytes - Size of memory to bind in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); - -/** - * \brief Binds an address as a 2D texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Using a ::tex2D() function inside a kernel requires a call to either - * ::cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear - * memory. - * - * Function calls to ::cuTexRefSetFormat() cannot follow calls to - * ::cuTexRefSetAddress2D() for the same texture reference. - * - * It is required that \p dptr be aligned to the appropriate hardware-specific - * texture alignment. You can query this value using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. - * This value can be queried using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Width and Height, which are specified in elements (or texels), cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * \p Pitch, which is specified in bytes, cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * \param hTexRef - Texture reference to bind - * \param desc - Descriptor of CUDA array - * \param dptr - Device pointer to bind - * \param Pitch - Line pitch in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture2D - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - -/** - * \brief Sets the format for a texture reference - * - * \deprecated - * - * Specifies the format of the data to be read by the texture reference - * \p hTexRef. 
\p fmt and \p NumPackedComponents are exactly analogous to the - * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components per - * array element. - * - * \param hTexRef - Texture reference - * \param fmt - Format to set - * \param NumPackedComponents - Number of components per array element - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaCreateChannelDesc, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); - -/** - * \brief Sets the addressing mode for a texture reference - * - * \deprecated - * - * Specifies the addressing mode \p am for the given dimension \p dim of the - * texture reference \p hTexRef. If \p dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from the - * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined - * as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. - * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only - * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. - * - * \param hTexRef - Texture reference - * \param dim - Dimension - * \param am - Addressing mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); - -/** - * \brief Sets the filtering mode for a texture reference - * - * \deprecated - * - * Specifies the filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. 
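- *
- * A minimal configuration sketch (assuming \p hTexRef is bound to a CUDA array rather
- * than linear memory, since these calls have no effect for linear memory):
- * \code
- *   cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);   // first coordinate
- *   cuTexRefSetAddressMode(hTexRef, 1, CU_TR_ADDRESS_MODE_CLAMP);   // second coordinate
- *   cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
- * \endcode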
- * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Specifies the mipmap filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. - * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap level bias for a texture reference - * - * \deprecated - * - * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when - * reading memory through the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. - * - * \param hTexRef - Texture reference - * \param bias - Mipmap level bias - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); - -/** - * \brief Sets the mipmap min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp - * respectively, to be used when reading memory through the texture reference - * \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
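- *
- * A minimal configuration sketch for a texture reference bound to a mipmapped array
- * (assuming the array has \p numLevels levels; these calls have no effect otherwise):
- * \code
- *   cuTexRefSetMipmapFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
- *   cuTexRefSetMipmapLevelBias(hTexRef, 0.0f);
- *   cuTexRefSetMipmapLevelClamp(hTexRef, 0.0f, (float)(numLevels - 1));
- * \endcode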
- * - * \param hTexRef - Texture reference - * \param minMipmapLevelClamp - Mipmap min level clamp - * \param maxMipmapLevelClamp - Mipmap max level clamp - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); - -/** - * \brief Sets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through - * the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. - * - * \param hTexRef - Texture reference - * \param maxAniso - Maximum anisotropy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); - -/** - * \brief Sets the border color for a texture reference - * - * \deprecated - * - * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference - * \p hTexRef. The color value supports only float type and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * Note that the color values can be set only when the Address mode is set to - * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. - * Applications using integer border color values have to "reinterpret_cast" their values to float. - * - * \param hTexRef - Texture reference - * \param pBorderColor - RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); - -/** - * \brief Sets the flags for a texture reference - * - * \deprecated - * - * Specifies optional flags via \p Flags to specify the behavior of data - * returned through the texture reference \p hTexRef. The valid flags are: - * - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. 
Note that texture with 32-bit integer format - * would not be promoted, regardless of whether or not this - * flag is specified; - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the - * default behavior of having the texture coordinates range - * from [0, Dim) where Dim is the width or height of the CUDA - * array. Instead, the texture coordinates [0, 1.0) reference - * the entire breadth of the array dimension; - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. - * - * \param hTexRef - Texture reference - * \param Flags - Optional flags to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); - -/** - * \brief Gets the address associated with a texture reference - * - * \deprecated - * - * Returns in \p *pdptr the base address bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. - * - * \param pdptr - Returned device address - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); - -/** - * \brief Gets the array bound to a texture reference - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. 
- *
- * \param phArray - Returned array
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the mipmapped array bound to a texture reference
- *
- * \deprecated
- *
- * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
- * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- * is not bound to any CUDA mipmapped array.
- *
- * \param phMipmappedArray - Returned mipmapped array
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the addressing mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pam the addressing mode corresponding to the
- * dimension \p dim of the texture reference \p hTexRef. Currently, the only
- * valid values for \p dim are 0 and 1.
- *
- * \param pam - Returned addressing mode
- * \param hTexRef - Texture reference
- * \param dim - Dimension
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
-
-/**
- * \brief Gets the filter-mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pfm the filtering mode of the texture reference
- * \p hTexRef.
- * - * \param pfm - Returned filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the format used by a texture reference - * - * \deprecated - * - * Returns in \p *pFormat and \p *pNumChannels the format and number - * of components of the CUDA array bound to the texture reference \p hTexRef. - * If \p pFormat or \p pNumChannels is NULL, it will be ignored. - * - * \param pFormat - Returned format - * \param pNumChannels - Returned number of components - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); - -/** - * \brief Gets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Returns the mipmap filtering mode in \p pfm that's used when reading memory through - * the texture reference \p hTexRef. - * - * \param pfm - Returned mipmap filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the mipmap level bias for a texture reference - * - * \deprecated - * - * Returns the mipmap level bias in \p pBias that's added to the specified mipmap - * level when reading memory through the texture reference \p hTexRef. 
- * - * \param pbias - Returned mipmap level bias - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); - -/** - * \brief Gets the min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp - * that's used when reading memory through the texture reference \p hTexRef. - * - * \param pminMipmapLevelClamp - Returned mipmap min level clamp - * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); - -/** - * \brief Gets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through - * the texture reference \p hTexRef. - * - * \param pmaxAniso - Returned maximum anisotropy - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); - -/** - * \brief Gets the border color used by a texture reference - * - * \deprecated - * - * Returns in \p pBorderColor, values of the RGBA color used by - * the texture reference \p hTexRef. 
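/*
 * Editor-added sketch (not from the header): dumping the sampling state of a
 * deprecated texture reference with the getter functions documented above
 * (filter mode, format, mipmap filter/bias/clamps, maximum anisotropy).
 * Each getter simply mirrors the corresponding cuTexRefSet* call.
 */
#include <cuda.h>
#include <stdio.h>

static void dump_texref_sampling_state(CUtexref texRef)
{
    CUfilter_mode filter, mipFilter;
    CUarray_format format;
    int channels = 0, maxAniso = 0;
    float bias = 0.f, minClamp = 0.f, maxClamp = 0.f;

    if (cuTexRefGetFilterMode(&filter, texRef) == CUDA_SUCCESS)
        printf("filter mode:        %d\n", (int)filter);
    if (cuTexRefGetFormat(&format, &channels, texRef) == CUDA_SUCCESS)
        printf("format / channels:  %d / %d\n", (int)format, channels);
    if (cuTexRefGetMipmapFilterMode(&mipFilter, texRef) == CUDA_SUCCESS)
        printf("mipmap filter mode: %d\n", (int)mipFilter);
    if (cuTexRefGetMipmapLevelBias(&bias, texRef) == CUDA_SUCCESS)
        printf("mipmap level bias:  %f\n", bias);
    if (cuTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, texRef) == CUDA_SUCCESS)
        printf("mipmap clamps:      [%f, %f]\n", minClamp, maxClamp);
    if (cuTexRefGetMaxAnisotropy(&maxAniso, texRef) == CUDA_SUCCESS)
        printf("max anisotropy:     %d\n", maxAniso);
}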
- * The color value is of type float and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * \param hTexRef - Texture reference - * \param pBorderColor - Returned Type and Value of RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); - -/** - * \brief Gets the flags used by a texture reference - * - * \deprecated - * - * Returns in \p *pFlags the flags of the texture reference \p hTexRef. - * - * \param pFlags - Returned flags - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); - -/** - * \brief Creates a texture reference - * - * \deprecated - * - * Creates a texture reference and returns its handle in \p *pTexRef. Once - * created, the application must call ::cuTexRefSetArray() or - * ::cuTexRefSetAddress() to associate the reference with allocated memory. - * Other texture reference functions are used to specify the format and - * interpretation (addressing, filtering, etc.) to be used when the memory is - * read through this texture reference. - * - * \param pTexRef - Returned texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefDestroy - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); - -/** - * \brief Destroys a texture reference - * - * \deprecated - * - * Destroys the texture reference specified by \p hTexRef. - * - * \param hTexRef - Texture reference to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefCreate - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); - -/** @} */ /* END CUDA_TEXREF_DEPRECATED */ - - -/** - * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] - * - * ___MANBRIEF___ surface reference management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface reference management functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the CUDA array for a surface reference. - * - * \deprecated - * - * Sets the CUDA array \p hArray to be read and written by the surface reference - * \p hSurfRef. Any previous CUDA array state associated with the surface - * reference is superseded by this function. \p Flags must be set to 0. 
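/*
 * Editor-added sketch (not from the header): the create -> configure ->
 * destroy flow for a standalone texture reference, per the cuTexRefCreate /
 * cuTexRefDestroy documentation above. The CUDA array passed in is assumed
 * to exist already; cuTexRefSetArray, cuTexRefSetFilterMode and
 * cuTexRefSetFlags belong to the same deprecated family documented earlier
 * in this header.
 */
#include <cuda.h>

static CUresult sample_through_temp_texref(CUarray hArray)
{
    CUtexref texRef;
    CUresult err = cuTexRefCreate(&texRef);
    if (err != CUDA_SUCCESS) return err;

    err = cuTexRefSetArray(texRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
    if (err == CUDA_SUCCESS)
        err = cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
    if (err == CUDA_SUCCESS)
        err = cuTexRefSetFlags(texRef, CU_TRSF_NORMALIZED_COORDINATES);

    /* ... launch kernels that read through texRef here ... */

    cuTexRefDestroy(texRef);   /* always release the reference */
    return err;
}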
- * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. - * Any CUDA array previously bound to \p hSurfRef is unbound. - - * \param hSurfRef - Surface reference handle - * \param hArray - CUDA array handle - * \param Flags - set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuModuleGetSurfRef, - * ::cuSurfRefGetArray, - * ::cudaBindSurfaceToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Passes back the CUDA array bound to a surface reference. - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the surface reference - * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference - * is not bound to any CUDA array. - - * \param phArray - Surface reference handle - * \param hSurfRef - Surface reference handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); - -/** @} */ /* END CUDA_SURFREF_DEPRECATED */ - -/** - * \defgroup CUDA_TEXOBJECT Texture Object Management - * - * ___MANBRIEF___ texture object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the texture object management functions of the - * low-level CUDA driver application programming interface. The texture - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a texture object - * - * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes - * the data to texture from. \p pTexDesc describes how the data should be sampled. - * \p pResViewDesc is an optional argument that specifies an alternate format for - * the data described by \p pResDesc, and also describes the subresource region - * to restrict access to when texturing. \p pResViewDesc can only be specified if - * the type of resource is a CUDA array or a CUDA mipmapped array. - * - * Texture objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a texture object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * The ::CUDA_RESOURCE_DESC structure is defined as: - * \code - typedef struct CUDA_RESOURCE_DESC_st - { - CUresourcetype resType; - - union { - struct { - CUarray hArray; - } array; - struct { - CUmipmappedArray hMipmappedArray; - } mipmap; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t sizeInBytes; - } linear; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t width; - size_t height; - size_t pitchInBytes; - } pitch2D; - } res; - - unsigned int flags; - } CUDA_RESOURCE_DESC; - - * \endcode - * where: - * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. 
- * CUresourceType is defined as: - * \code - typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, - CU_RESOURCE_TYPE_LINEAR = 0x02, - CU_RESOURCE_TYPE_PITCH2D = 0x03 - } CUresourcetype; - * \endcode - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray - * must be set to a valid CUDA mipmapped array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes - * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width - * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * - ::flags must be set to zero. - * - * - * The ::CUDA_TEXTURE_DESC struct is defined as - * \code - typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; - CUfilter_mode filterMode; - unsigned int flags; - unsigned int maxAnisotropy; - CUfilter_mode mipmapFilterMode; - float mipmapLevelBias; - float minMipmapLevelClamp; - float maxMipmapLevelClamp; - } CUDA_TEXTURE_DESC; - * \endcode - * where - * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES - * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. - * - * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. 
CUfilter_mode is defined as: - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. - * - * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. Note that texture with 32-bit integer format would not be - * promoted, regardless of whether or not this flag is specified. - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior - * of having the texture coordinates range from [0, Dim) where Dim is the - * width or height of the CUDA array. Instead, the texture coordinates - * [0, 1.0) reference the entire breadth of the array dimension; Note that - * for CUDA mipmapped arrays, this flag has to be set. - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. - * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. - * This flag can only be specified if the underlying resource is a CUDA array - * or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP. - * When seamless cube map filtering is enabled, texture address modes specified - * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode - * is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP - * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is - * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed - * when sampling along the cube face borders. - * - * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be - * clamped to the range [1,16]. - * - * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. - * - * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. - * - * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. - * - * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. - * - * - * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as - * \code - typedef struct CUDA_RESOURCE_VIEW_DESC_st - { - CUresourceViewFormat format; - size_t width; - size_t height; - size_t depth; - unsigned int firstMipmapLevel; - unsigned int lastMipmapLevel; - unsigned int firstLayer; - unsigned int lastLayer; - } CUDA_RESOURCE_VIEW_DESC; - * \endcode - * where: - * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should - * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block - * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. - * with 2 or 4 channels, depending on the block compressed format. 
For ex., BC1 and BC4 require the underlying CUDA array to have - * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base - * format but with 4 channels. - * - * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the - * original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. - * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp - * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, - * then the actual minimum mipmap level clamp will be 3.2. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value - * has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. - * For non-layered resources, this value has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, - * this value has to be zero. - * - * - * \param pTexObject - Texture object to create - * \param pResDesc - Resource descriptor - * \param pTexDesc - Texture descriptor - * \param pResViewDesc - Resource view descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectDestroy, - * ::cudaCreateTextureObject - */ -CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); - -/** - * \brief Destroys a texture object - * - * Destroys the texture object specified by \p texObject. - * - * \param texObject - Texture object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaDestroyTextureObject - */ -CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); - -/** - * \brief Returns a texture object's resource descriptor - * - * Returns the resource descriptor for the texture object specified by \p texObject. 
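/*
 * Editor-added sketch (not from the header): creating a texture object over
 * linear device memory, following the CUDA_RESOURCE_DESC / CUDA_TEXTURE_DESC
 * layout spelled out above. The element format and count are arbitrary
 * illustration values; a resource view can only be specified for array
 * resources, so NULL is passed here.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_float_texobj(CUdeviceptr dptr, size_t numFloats,
                                  CUtexObject *texOut)
{
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr = dptr;                /* must meet texture alignment */
    resDesc.res.linear.format = CU_AD_FORMAT_FLOAT;
    resDesc.res.linear.numChannels = 1;
    resDesc.res.linear.sizeInBytes = numFloats * sizeof(float);

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.filterMode = CU_TR_FILTER_MODE_POINT;    /* ignored for LINEAR resources */
    texDesc.flags = 0;                               /* default: unnormalized coordinates */

    return cuTexObjectCreate(texOut, &resDesc, &texDesc, /*pResViewDesc=*/NULL);
}

/* Later, once no pending kernel uses it: cuTexObjectDestroy(tex); */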
- * - * \param pResDesc - Resource descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceDesc, - */ -CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's texture descriptor - * - * Returns the texture descriptor for the texture object specified by \p texObject. - * - * \param pTexDesc - Texture descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectTextureDesc - */ -CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's resource view descriptor - * - * Returns the resource view descriptor for the texture object specified by \p texObject. - * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param pResViewDesc - Resource view descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceViewDesc - */ -CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); - -/** @} */ /* END CUDA_TEXOBJECT */ - -/** - * \defgroup CUDA_SURFOBJECT Surface Object Management - * - * ___MANBRIEF___ surface object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface object management functions of the - * low-level CUDA driver application programming interface. The surface - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a surface object - * - * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes - * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be - * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. - * - * Surface objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a surface object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * \param pSurfObject - Surface object to create - * \param pResDesc - Resource descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectDestroy, - * ::cudaCreateSurfaceObject - */ -CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); - -/** - * \brief Destroys a surface object - * - * Destroys the surface object specified by \p surfObject. 
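/*
 * Editor-added sketch (not from the header): creating a surface object,
 * matching the cuSurfObjectCreate requirements above (array resource, array
 * created with CUDA_ARRAY3D_SURFACE_LDST, flags = 0). The 1024x1024
 * single-channel float format is an arbitrary illustration choice.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_surface(CUarray *arrOut, CUsurfObject *surfOut)
{
    CUDA_ARRAY3D_DESCRIPTOR ad;
    memset(&ad, 0, sizeof(ad));
    ad.Width = 1024;
    ad.Height = 1024;
    ad.Depth = 0;                           /* 2D array */
    ad.Format = CU_AD_FORMAT_FLOAT;
    ad.NumChannels = 1;
    ad.Flags = CUDA_ARRAY3D_SURFACE_LDST;   /* required for surface load/store */

    CUresult err = cuArray3DCreate(arrOut, &ad);
    if (err != CUDA_SUCCESS) return err;

    CUDA_RESOURCE_DESC rd;
    memset(&rd, 0, sizeof(rd));
    rd.resType = CU_RESOURCE_TYPE_ARRAY;
    rd.res.array.hArray = *arrOut;
    rd.flags = 0;                           /* must be zero */

    err = cuSurfObjectCreate(surfOut, &rd);
    if (err != CUDA_SUCCESS) cuArrayDestroy(*arrOut);
    return err;
}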
- * - * \param surfObject - Surface object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaDestroySurfaceObject - */ -CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); - -/** - * \brief Returns a surface object's resource descriptor - * - * Returns the resource descriptor for the surface object specified by \p surfObject. - * - * \param pResDesc - Resource descriptor - * \param surfObject - Surface object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaGetSurfaceObjectResourceDesc - */ -CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); - -/** @} */ /* END CUDA_SURFOBJECT */ - -/** - * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access - * - * ___MANBRIEF___ direct peer context memory access functions of the low-level - * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the direct peer context memory access functions - * of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Queries if a device may directly access a peer device's memory. - * - * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of - * directly accessing memory from contexts on \p peerDev and 0 otherwise. - * If direct access of \p peerDev from \p dev is possible, then access may be - * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). - * - * \param canAccessPeer - Returned access capability - * \param dev - Device from which allocations on \p peerDev are to - * be directly accessed. - * \param peerDev - Device on which the allocations to be directly accessed - * by \p dev reside. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceCanAccessPeer - */ -CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); - -/** - * \brief Enables direct access to memory allocations in a peer context. - * - * If both the current context and \p peerContext are on devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same - * major compute capability, then on success all allocations from \p peerContext will - * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional - * details. - * - * Note that access granted by this call is unidirectional and that in order to access - * memory from the current context in \p peerContext, a separate symmetric call - * to ::cuCtxEnablePeerAccess() is required. - * - * Note that there are both device-wide and system-wide limitations per system - * configuration, as noted in the CUDA Programming Guide under the section - * "Peer-to-Peer Memory Access". - * - * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates - * that the ::CUdevice of the current context cannot directly access memory - * from the ::CUdevice of \p peerContext. 
- * - * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of - * \p peerContext from the current context has already been enabled. - * - * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible - * because hardware resources required for peer access have been exhausted. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext - * is not a valid context, or if the current context is \p peerContext. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. - * - * \param peerContext - Peer context to enable direct access to from the current context - * \param Flags - Reserved for future use and must be set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceEnablePeerAccess - */ -CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); - -/** - * \brief Disables direct access to memory allocations in a peer context and - * unregisters any registered allocations. - * - Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has - * not yet been enabled from \p peerContext to the current context. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if - * \p peerContext is not a valid context. - * - * \param peerContext - Peer context to disable direct access to - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxEnablePeerAccess, - * ::cudaDeviceDisablePeerAccess - */ -CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); - -/** - * \brief Queries attributes of the link between two devices. - * - * Returns in \p *value the value of the requested attribute \p attrib of the - * link between \p srcDevice and \p dstDevice. The supported attributes are: - * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the - * performance of the link between two devices. - * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. - * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over - * the link are supported. - * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can - * be accessed over the link. - * - * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid - * or if they represent the same device. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is - * a null pointer. - * - * \param value - Returned value of the requested attribute - * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. - * \param srcDevice - The source device of the target link. - * \param dstDevice - The destination device of the target link. 
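/*
 * Editor-added sketch (not from the header): the usual peer-access dance
 * between two devices, combining cuDeviceCanAccessPeer, cuDeviceGetP2PAttribute
 * and cuCtxEnablePeerAccess as documented above. Access is unidirectional;
 * enabling it in the other direction needs a second symmetric call from the
 * other context. The two contexts are assumed to already exist for devA/devB.
 */
#include <cuda.h>
#include <stdio.h>

static CUresult enable_p2p(CUdevice devA, CUdevice devB,
                           CUcontext ctxA, CUcontext ctxB)
{
    int canAccess = 0;
    CUresult err = cuDeviceCanAccessPeer(&canAccess, devA, devB);
    if (err != CUDA_SUCCESS) return err;
    if (!canAccess) return CUDA_ERROR_PEER_ACCESS_UNSUPPORTED;

    int rank = 0;
    if (cuDeviceGetP2PAttribute(&rank, CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK,
                                devA, devB) == CUDA_SUCCESS)
        printf("link performance rank: %d\n", rank);

    /* From ctxA, grant access to ctxB's allocations (Flags must be 0). */
    err = cuCtxSetCurrent(ctxA);
    if (err == CUDA_SUCCESS) err = cuCtxEnablePeerAccess(ctxB, 0);
    if (err == CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) err = CUDA_SUCCESS;
    return err;
}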
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaDeviceGetP2PAttribute - */ -CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); - -/** @} */ /* END CUDA_PEER_ACCESS */ - -/** - * \defgroup CUDA_GRAPHICS Graphics Interoperability - * - * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graphics interoperability functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Unregisters a graphics resource for access by CUDA - * - * Unregisters the graphics resource \p resource so it is not accessible by - * CUDA unless registered again. - * - * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is - * returned. - * - * \param resource - Resource to unregister - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuGraphicsD3D9RegisterResource, - * ::cuGraphicsD3D10RegisterResource, - * ::cuGraphicsD3D11RegisterResource, - * ::cuGraphicsGLRegisterBuffer, - * ::cuGraphicsGLRegisterImage, - * ::cudaGraphicsUnregisterResource - */ -CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); - -/** - * \brief Get an array through which to access a subresource of a mapped graphics resource. - * - * Returns in \p *pArray an array through which the subresource of the mapped - * graphics resource \p resource which corresponds to array index \p arrayIndex - * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may - * change every time that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via an array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p arrayIndex is not a valid array index for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p mipLevel is not a valid mipmap level for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * - * \param pArray - Returned array through which a subresource of \p resource may be accessed - * \param resource - Mapped resource to access - * \param arrayIndex - Array index for array textures or cubemap face - * index as defined by ::CUarray_cubemap_face for - * cubemap textures for the subresource to access - * \param mipLevel - Mipmap level for the subresource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsSubResourceGetMappedArray - */ -CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); - -/** - * \brief Get a mipmapped array through which to access a mapped graphics resource. 
- * - * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics - * resource \p resource. The value set in \p *pMipmappedArray may change every time - * that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via a mipmapped array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * - * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsResourceGetMappedMipmappedArray - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); - -/** - * \brief Get a device pointer through which to access a mapped graphics resource. - * - * Returns in \p *pDevPtr a pointer through which the mapped graphics resource - * \p resource may be accessed. - * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. - * The value set in \p pPointer may change every time that \p resource is mapped. - * - * If \p resource is not a buffer then it cannot be accessed via a pointer and - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * * - * \param pDevPtr - Returned pointer through which \p resource may be accessed - * \param pSize - Returned size of the buffer accessible starting at \p *pPointer - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cudaGraphicsResourceGetMappedPointer - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); - -/** - * \brief Set usage flags for mapping a graphics resource - * - * Set \p flags for mapping the graphics resource \p resource. - * - * Changes to \p flags will take effect the next time \p resource is mapped. - * The \p flags argument may be any of the following: - - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this - * resource will be used. It is therefore assumed that this resource will be - * read from and written to by CUDA kernels. This is the default value. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which - * access this resource will not write to this resource. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels - * which access this resource will not read from this resource and will - * write over the entire contents of the resource, so none of the data - * previously stored in the resource will be preserved. - * - * If \p resource is presently mapped for access by CUDA then - * ::CUDA_ERROR_ALREADY_MAPPED is returned. 
- * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param resource - Registered resource to set flags for - * \param flags - Parameters for resource mapping - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsResourceSetMapFlags - */ -CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - -/** - * \brief Map graphics resources for access by CUDA - * - * Maps the \p count graphics resources in \p resources for access by CUDA. - * - * The resources in \p resources may be accessed by CUDA until they - * are unmapped. The graphics API from which \p resources were registered - * should not access any resources while they are mapped by CUDA. If an - * application does so, the results are undefined. - * - * This function provides the synchronization guarantee that any graphics calls - * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA - * work issued in \p stream begins. - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. - * - * \param count - Number of resources to map - * \param resources - Resources to map for CUDA usage - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cuGraphicsUnmapResources, - * ::cudaGraphicsMapResources - */ -CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** - * \brief Unmap graphics resources. - * - * Unmaps the \p count graphics resources in \p resources. - * - * Once unmapped, the resources in \p resources may not be accessed by CUDA - * until they are mapped again. - * - * This function provides the synchronization guarantee that any CUDA work issued - * in \p stream before ::cuGraphicsUnmapResources() will complete before any - * subsequently issued graphics work begins. - * - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. 
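/*
 * Editor-added sketch (not from the header): the map -> access -> unmap
 * pattern for a graphics resource, per cuGraphicsMapResources and friends
 * above. The resource is assumed to be a buffer already registered through
 * one of the API-specific cuGraphics*RegisterResource entry points (GL/D3D),
 * which are documented elsewhere.
 */
#include <cuda.h>
#include <stdio.h>

static CUresult read_interop_buffer(CUgraphicsResource res, CUstream stream)
{
    /* Hint that CUDA will only read from the resource (set before mapping). */
    CUresult err = cuGraphicsResourceSetMapFlags(res, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (err != CUDA_SUCCESS) return err;

    err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS) return err;

    CUdeviceptr devPtr;
    size_t size = 0;
    err = cuGraphicsResourceGetMappedPointer(&devPtr, &size, res);
    if (err == CUDA_SUCCESS) {
        printf("mapped %zu bytes at 0x%llx\n", size, (unsigned long long)devPtr);
        /* ... launch kernels on `stream` that read devPtr here ... */
    }

    /* The pointer is invalid after this; the graphics API may use the buffer again. */
    cuGraphicsUnmapResources(1, &res, stream);
    return err;
}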
- * - * \param count - Number of resources to unmap - * \param resources - Resources to unmap - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsUnmapResources - */ -CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** @} */ /* END CUDA_GRAPHICS */ - -/** - * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access - * - * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the driver entry point access functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the requested driver API function pointer - * - * Returns in \p **pfn the address of the CUDA driver function for the requested - * CUDA version and flags. - * - * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 - * should be specified as 11020. For a requested driver symbol, if the specified - * CUDA version is greater than or equal to the CUDA version in which the driver symbol - * was introduced, this API will return the function pointer to the corresponding - * versioned function. - * - * The pointer returned by the API should be cast to a function pointer matching the - * requested driver function's definition in the API header file. The function pointer - * typedef can be picked up from the corresponding typedefs header file. For example, - * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. - * - * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not - * supported on the platform, no ABI compatible driver function exists for the specified - * \p cudaVersion or if the driver symbol is invalid. - * - * The requested flags can be: - * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to - * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with - * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM - * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. - * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols - * that match the requested driver symbol name except the corresponding per-thread versions. - * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all - * driver symbols that match the requested driver symbol name including the per-thread - * versions. If a per-thread version is not found, the API will return the legacy version - * of the driver function. - * - * \param symbol - The base name of the driver API function to look for. As an example, - * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and - * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. - * \param pfn - Location to return the function pointer to the requested driver function - * \param cudaVersion - The CUDA version to look for the requested driver symbol - * \param flags - Flags to specify search options. 
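/*
 * Editor-added sketch (not from the header): fetching a versioned driver
 * entry point with cuGetProcAddress as described above. The function-pointer
 * typedef below is hand-written for illustration; cudaTypedefs.h ships the
 * official typedefs. 11020 encodes CUDA 11.2 (1000 * major + 10 * minor).
 */
#include <cuda.h>
#include <stdio.h>

typedef CUresult (CUDAAPI *MemAllocFn)(CUdeviceptr *dptr, size_t bytesize);

static CUresult alloc_via_proc_address(CUdeviceptr *dptr, size_t bytes)
{
    void *fn = NULL;
    CUresult err = cuGetProcAddress("cuMemAlloc", &fn, 11020,
                                    CU_GET_PROC_ADDRESS_DEFAULT);
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemAlloc not found for CUDA 11.2 (%d)\n", (int)err);
        return err;   /* e.g. CUDA_ERROR_NOT_FOUND */
    }
    return ((MemAllocFn)fn)(dptr, bytes);   /* resolves to the _v2 variant */
}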
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_NOT_FOUND - * \note_version_mixing - * - * \sa - * ::cudaGetDriverEntryPoint - */ -CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); - -/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ - -CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); - -/** - * CUDA API versioning support - */ -#if defined(__CUDA_API_VERSION_INTERNAL) - #undef cuMemHostRegister - #undef cuGraphicsResourceSetMapFlags - #undef cuLinkCreate - #undef cuLinkAddData - #undef cuLinkAddFile - #undef cuDeviceTotalMem - #undef cuCtxCreate - #undef cuModuleGetGlobal - #undef cuMemGetInfo - #undef cuMemAlloc - #undef cuMemAllocPitch - #undef cuMemFree - #undef cuMemGetAddressRange - #undef cuMemAllocHost - #undef cuMemHostGetDevicePointer - #undef cuMemcpyHtoD - #undef cuMemcpyDtoH - #undef cuMemcpyDtoD - #undef cuMemcpyDtoA - #undef cuMemcpyAtoD - #undef cuMemcpyHtoA - #undef cuMemcpyAtoH - #undef cuMemcpyAtoA - #undef cuMemcpyHtoAAsync - #undef cuMemcpyAtoHAsync - #undef cuMemcpy2D - #undef cuMemcpy2DUnaligned - #undef cuMemcpy3D - #undef cuMemcpyHtoDAsync - #undef cuMemcpyDtoHAsync - #undef cuMemcpyDtoDAsync - #undef cuMemcpy2DAsync - #undef cuMemcpy3DAsync - #undef cuMemsetD8 - #undef cuMemsetD16 - #undef cuMemsetD32 - #undef cuMemsetD2D8 - #undef cuMemsetD2D16 - #undef cuMemsetD2D32 - #undef cuArrayCreate - #undef cuArrayGetDescriptor - #undef cuArray3DCreate - #undef cuArray3DGetDescriptor - #undef cuTexRefSetAddress - #undef cuTexRefSetAddress2D - #undef cuTexRefGetAddress - #undef cuGraphicsResourceGetMappedPointer - #undef cuCtxDestroy - #undef cuCtxPopCurrent - #undef cuCtxPushCurrent - #undef cuStreamDestroy - #undef cuEventDestroy - #undef cuMemcpy - #undef cuMemcpyAsync - #undef cuMemcpyPeer - #undef cuMemcpyPeerAsync - #undef cuMemcpy3DPeer - #undef cuMemcpy3DPeerAsync - #undef cuMemsetD8Async - #undef cuMemsetD16Async - #undef cuMemsetD32Async - #undef cuMemsetD2D8Async - #undef cuMemsetD2D16Async - #undef cuMemsetD2D32Async - #undef cuStreamGetPriority - #undef cuStreamGetFlags - #undef cuStreamGetCtx - #undef cuStreamWaitEvent - #undef cuStreamAddCallback - #undef cuStreamAttachMemAsync - #undef cuStreamQuery - #undef cuStreamSynchronize - #undef cuEventRecord - #undef cuEventRecordWithFlags - #undef cuLaunchKernel - - - - #undef cuLaunchHostFunc - #undef cuGraphicsMapResources - #undef cuGraphicsUnmapResources - #undef cuStreamWriteValue32 - #undef cuStreamWaitValue32 - #undef cuStreamWriteValue64 - #undef cuStreamWaitValue64 - #undef cuStreamBatchMemOp - #undef cuMemPrefetchAsync - #undef cuLaunchCooperativeKernel - #undef cuSignalExternalSemaphoresAsync - #undef cuWaitExternalSemaphoresAsync - #undef cuStreamBeginCapture - #undef cuStreamEndCapture - #undef cuStreamIsCapturing - #undef cuStreamGetCaptureInfo - #undef cuStreamGetCaptureInfo_v2 - #undef cuGraphUpload - #undef cuGraphLaunch - #undef cuDevicePrimaryCtxRelease - #undef cuDevicePrimaryCtxReset - #undef cuDevicePrimaryCtxSetFlags - #undef cuIpcOpenMemHandle - #undef cuStreamCopyAttributes - #undef cuStreamSetAttribute - #undef cuStreamGetAttribute - #undef cuGraphInstantiate - #undef cuMemMapArrayAsync - #undef cuMemFreeAsync - #undef cuMemAllocAsync - #undef cuMemAllocFromPoolAsync - #undef cuStreamUpdateCaptureDependencies - - CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); - CUresult CUDAAPI 
cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - - typedef unsigned int CUdeviceptr_v1; - - typedef struct CUDA_MEMCPY2D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - - unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ - unsigned int Height; /**< Height of 2D memory copy */ - } CUDA_MEMCPY2D_v1; - - typedef struct CUDA_MEMCPY3D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - unsigned int srcZ; /**< Source Z */ - unsigned int srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - unsigned int dstZ; /**< Destination Z */ - unsigned int dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ - unsigned int Height; /**< Height of 3D memory copy */ - unsigned int Depth; /**< Depth of 3D memory copy */ - } CUDA_MEMCPY3D_v1; - - typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of array */ - unsigned int Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - 
} CUDA_ARRAY_DESCRIPTOR_v1; - - typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of 3D array */ - unsigned int Height; /**< Height of 3D array */ - unsigned int Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ - } CUDA_ARRAY3D_DESCRIPTOR_v1; - - CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); - CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); - CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); - CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); - CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); - CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); - CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); - CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); - CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); - CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); - 
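/*
 * Editor-added sketch (not from the header): how a 2D copy descriptor is
 * filled in field by field. This uses the current CUDA_MEMCPY2D / cuMemcpy2D;
 * the CUDA_MEMCPY2D_v1 layout above is the same shape except that offsets,
 * pitches and sizes are 32-bit and the device pointer is CUdeviceptr_v1.
 */
#include <cuda.h>
#include <string.h>

static CUresult copy_rows_to_device(CUdeviceptr dst, size_t dstPitch,
                                    const float *src, size_t width, size_t height)
{
    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));

    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = src;
    cpy.srcPitch      = width * sizeof(float);   /* tightly packed host rows */

    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = dst;
    cpy.dstPitch      = dstPitch;                /* e.g. from cuMemAllocPitch */

    cpy.WidthInBytes  = width * sizeof(float);
    cpy.Height        = height;

    return cuMemcpy2D(&cpy);
}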
CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); - CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); - CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); - CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); - - CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); - CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - - CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - CUresult CUDAAPI 
cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); - CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); - CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); - CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - - CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - - CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - CUresult CUDAAPI cuStreamQuery(CUstream hStream); - CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); - - - - CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuStreamWriteValue32(CUstream 
stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); - CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); - CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); - CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); - CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); - - CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - - CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - - CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); -#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) -static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { - const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); - if ((flags & 
procAddressMask) == 0) { - flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; - } - return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); -} -#define cuGetProcAddress cuGetProcAddress_ptsz -#endif - -#ifdef __cplusplus -} -#endif - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility pop - #endif -#endif - -#undef __CUDA_DEPRECATED - -#endif /* __cuda_cuda_h__ */ diff --git a/python/triton/third_party/cuda/lib/libdevice.10.bc b/python/triton/third_party/cuda/lib/libdevice.10.bc deleted file mode 100755 index b2c75a5026df..000000000000 Binary files a/python/triton/third_party/cuda/lib/libdevice.10.bc and /dev/null differ diff --git a/python/triton/tools/__init__.py b/python/triton/tools/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/triton/tools/aot.py b/python/triton/tools/aot.py deleted file mode 100644 index 4073423bce3f..000000000000 --- a/python/triton/tools/aot.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import sys - -import triton._C.libtriton.triton as libtriton -import triton.compiler.compiler as tc - -if __name__ == '__main__': - - # valid source and target formats - VALID_FORMATS = ['triton-ir', 'triton-gpu-ir', 'llvm-ir', 'ptx', 'amdgcn'] - - # set up the argument parser - # TODO: conditional requirements - parser = argparse.ArgumentParser() - parser.add_argument('src', help="Source file to compile") - parser.add_argument('--target', required=True, - help="Target format, one of: " + ', '.join(VALID_FORMATS)) - parser.add_argument('--sm', type=int, help="Compute capability to compile for") - parser.add_argument('--ptx-version', type=int, help="PTX version to compile for") - parser.add_argument('--gfx', type=str, help="AMDGPU target to compile for") - parser.add_argument('--triple', type=str, help="target triple, for example: amdgcn-amd-amdhsa") - parser.add_argument('--features', type=str, help="target features, for example: +sramecc,-xnack") - parser.add_argument('--num_warps', type=int, help="number of warps to compile ttgir for") - - # parse the args - args = parser.parse_args() - - # TODO: clean-up and re-use triton.compiler primitive functions - # check for validity of format arguments - if args.target not in VALID_FORMATS: - print("Invalid target format: " + args.target) - sys.exit(0) - - # parse source file to MLIR module - context = libtriton.ir.context() - module = libtriton.ir.parse_mlir_module(args.src, context) - module.context = context - - # optimizer triton-ir - module = tc.optimize_ttir(module, arch=args.sm) - if args.target == 'triton-ir': - print(module.str()) - sys.exit(0) - - if not args.num_warps: - args.num_warps = 4 - - # llvm-ir -> amdgcn - if args.target == 'amdgcn': - # auto detect available architecture and features - # if nothing detected, set with default values - arch_details = tc.get_amdgpu_arch_fulldetails() - if not arch_details: - arch_name = "" - arch_triple = "amdgcn-amd-amdhsa" - arch_features = "" - else: - arch_triple, arch_name, arch_features = arch_details - - # stop processing if architecture name is not automatically detected and is not set manually - if not args.gfx and not arch_name: - raise argparse.ArgumentError(None, "Must specify --gfx for AMDGCN compilation") - - # rewrite default and automatically detected values with manually provided data - if args.gfx: - arch_name = args.gfx - if args.triple: - arch_triple = args.triple - if args.features: - arch_features = args.features - - # triton-ir -> triton-gpu-ir - # use 
compute_capability == 80 - module = tc.ttir_to_ttgir(module, num_warps=args.num_warps) # num_stages=3, compute_capability=80) - module = tc.optimize_ttgir(module, num_stages=3, arch=80) - # triton-gpu-ir -> llvm-ir - # use compute_capability == 80 - module = tc.ttgir_to_llir(module, extern_libs=None, arch=80) - # llvm-ir -> amdgcn asm, hsaco binary - module, hsaco_path = tc.llir_to_amdgcn_and_hsaco(module, arch_name, arch_triple, arch_features) - - print(hsaco_path) - print(module) - sys.exit(0) - - if not args.sm: - raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation") - - # triton-ir -> triton-gpu-ir - module = tc.ttir_to_ttgir(module, num_warps=args.num_warps) - module = tc.optimize_ttgir(module, num_stages=3, arch=args.sm) - if args.target == 'triton-gpu-ir': - print(module.str()) - sys.exit(0) - - # triton-gpu-ir -> llvm-ir - module = tc.ttgir_to_llir(module, extern_libs=None, arch=args.sm) - if args.target == 'llvm-ir': - print(module) - sys.exit(0) - - # llvm-ir -> ptx - if args.target == 'ptx': - if not args.ptx_version: - raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation") - module = tc.llir_to_ptx(module, arch=args.sm, ptx_version=args.ptx_version) - - # llvm-ir -> amdgcn - if args.target == 'amdgcn': - if not args.gfx: - raise argparse.ArgumentError(None, "Must specify --gfx for AMDGCN compilation") - module, hsaco_path = tc.llir_to_amdgcn_and_hsaco(module, args.gfx) - - print(module) diff --git a/python/triton/tools/build_extern.py b/python/triton/tools/build_extern.py deleted file mode 100644 index f19fbd561c07..000000000000 --- a/python/triton/tools/build_extern.py +++ /dev/null @@ -1,398 +0,0 @@ -import argparse -import subprocess -from abc import ABC, abstractmethod -from typing import Dict, List, Optional - - -class Symbol: - _name: str - _op_name: str - _ret_type: str - _arg_names: List[str] - _arg_types: List[str] - - def __init__( - self, - name: str, - op_name: str, - ret_type: str, - arg_names: List[str], - arg_types: List[str], - ) -> None: - ''' - A symbol is a function declaration. 
- :param name: name of the symbol - :param op_name: name of the operation - :param ret_type: return type of the operation - :param arg_names: names of the arguments - :param arg_types: types of the arguments - ''' - self._name = name - self._op_name = op_name - self._ret_type = ret_type - self._arg_names = list(arg_names) - self._arg_types = list(arg_types) - - @property - def name(self) -> str: - return self._name - - @property - def op_name(self) -> str: - return self._op_name - - @property - def ret_type(self) -> str: - return self._ret_type - - @property - def arg_names(self) -> List[str]: - return self._arg_names - - @property - def arg_types(self) -> List[str]: - return self._arg_types - - -def convert_type(type_str) -> Optional[str]: - if type_str == "i32": - return "int32" - elif type_str == "u32": - return "uint32" - elif type_str == "i64": - return "int64" - elif type_str == "u64": - return "uint64" - elif type_str == "float": - return "fp32" - elif type_str == "double": - return "fp64" - else: - # ignore other types, such as pointer types - return None - - -def to_unsigned(type_str) -> str: - if type_str == "int32": - return "uint32" - elif type_str == "int64": - return "uint64" - else: - return type_str - - -class ExternLibrary(ABC): - _name: str - _path: str - _symbols: Dict[str, Symbol] - _format: bool - _grouping: bool - - def __init__( - self, - name: str, - path: str, - format: bool = True, - grouping: bool = True, - ) -> None: - ''' - Abstract class for extern library. - :param name: name of the library - :param path: path of the library - :param format: whether to format the generated stub file - ''' - self._name = name - self._path = path - self._symbols = {} - self._format = format - self._grouping = grouping - - @property - def name(self) -> str: - return self._name - - @property - def path(self) -> str: - return self._path - - @property - def symbols(self) -> Dict[str, Symbol]: - return self._symbols - - @property - def grouping(self) -> bool: - return self._grouping - - @abstractmethod - def parse_symbols(self, input_file) -> None: - pass - - @abstractmethod - def _output_stubs(self) -> str: - pass - - def generate_stub_file(self, output_dir) -> None: - file_str = self._output_stubs() - if file_str is None or len(file_str) == 0: - raise Exception("file_str is empty") - - output_file = f"{output_dir}/{self._name}.py" - with open(output_file, "w") as f: - f.write(file_str) - f.close() - if self._format: - subprocess.Popen(["autopep8", "-a", "-r", "-i", output_file], - stdout=subprocess.PIPE).communicate() - subprocess.Popen(["isort", output_file], stdout=subprocess.PIPE).communicate() - - -class Libdevice(ExternLibrary): - _symbol_groups: Dict[str, List[Symbol]] - - def __init__(self, path) -> None: - ''' - Constructor for Libdevice. 
- :param path: path of the libdevice library - ''' - super().__init__("libdevice", path) - self._symbol_groups = {} - self.is_pure = True - - @staticmethod - def _extract_symbol(line) -> Optional[Symbol]: - # Extract symbols from line in the following format: - # "define [internal] @(,)" - entries = line.split("@") - ret_str = entries[0] - func_str = entries[1] - # Get ret_type, skip internal symbols - ret_strs = ret_str.split() - if ret_strs[1] == "internal": - return None - ret_type = convert_type(ret_strs[1]) - if ret_type is None: - return None - # Get function name - func_strs = func_str.split("(") - func_name = func_strs[0].replace("@", "") - op_name = func_name.replace("__nv_", "") - if 'ieee' in op_name: - return None - # Get arg_types - arg_strs = func_strs[1].split(",") - arg_types = [] - arg_names = [] - for i, arg_str in enumerate(arg_strs): - arg_type = convert_type(arg_str.split()[0]) - if arg_type is None: - return None - arg_name = 'arg' + str(i) - arg_types.append(arg_type) - arg_names.append(arg_name) - if op_name == "sad": - # Special case for sad, where the last argument is an unsigned int - arg_types[-1] = to_unsigned(arg_types[-1]) - elif op_name.startswith("u"): - # LLVM does not differentiate between signed and unsigned integer type. - # We have to convert the types to unsigned - ret_type = to_unsigned(ret_type) - for i, arg_type in enumerate(arg_types): - arg_types[i] = to_unsigned(arg_type) - return Symbol(func_name, op_name, ret_type, arg_names, arg_types) - - def _group_symbols(self) -> None: - symbol_set = {} - for symbol in self._symbols.values(): - op_name = symbol.op_name - symbol_set[op_name] = symbol - - # Group functions together by renaming. - renaming = { - 'llabs': 'abs', 'acosf': 'acos', 'acoshf': 'acosh', - 'dadd_rd': 'add_rd', 'fadd_rd': 'add_rd', 'dadd_rn': 'add_rn', - 'fadd_rn': 'add_rn', 'dadd_ru': 'add_ru', 'fadd_ru': 'add_ru', - 'dadd_rz': 'add_rz', 'fadd_rz': 'add_rz', 'asinf': 'asin', - 'asinhf': 'asinh', 'atanf': 'atan', 'atan2f': 'atan2', - 'atanhf': 'atanh', 'brevll': 'brev', 'cbrtf': 'cbrt', - 'ceilf': 'ceil', 'clzll': 'clz', 'copysignf': 'copysign', - 'cosf': 'cos', 'coshf': 'cosh', 'cospif': 'cospi', - 'cyl_bessel_i0f': 'cyl_bessel_i0', 'cyl_bessel_i1f': 'cyl_bessel_i1', - 'fdiv_rd': 'div_rd', 'ddiv_rd': 'div_rd', 'fdiv_rn': 'div_rn', - 'ddiv_rn': 'div_rn', 'fdiv_ru': 'div_ru', 'ddiv_ru': 'div_ru', - 'fdiv_rz': 'div_rz', 'ddiv_rz': 'div_rz', 'erff': 'erf', - 'erfcf': 'erfc', 'erfcinvf': 'erfcinv', 'erfcxf': 'erfcx', - 'erfinvf': 'erfinv', 'expf': 'exp', 'exp10f': 'exp10', - 'exp2f': 'exp2', 'expm1f': 'expm1', 'fabsf': 'abs', - 'fabs': 'abs', 'fast_fdividef': 'fast_dividef', - 'fdimf': 'fdim', 'ffsll': 'ffs', 'floorf': 'floor', - 'fmaf': 'fma', 'fmaf_rd': 'fma_rd', 'fmaf_rn': 'fma_rn', - 'fmaf_ru': 'fma_ru', 'fmaf_rz': 'fma_rz', 'fmodf': 'fmod', - 'uhadd': 'hadd', 'hypotf': 'hypot', 'ilogbf': 'ilogb', - 'isinff': 'isinf', 'isinfd': 'isinf', 'isnanf': 'isnan', - 'isnand': 'isnan', 'j0f': 'j0', 'j1f': 'j1', 'jnf': 'jn', - 'ldexpf': 'ldexp', 'lgammaf': 'lgamma', 'llrintf': 'llrint', - 'llroundf': 'llround', 'logf': 'log', 'log10f': 'log10', - 'log1pf': 'log1p', 'log2f': 'log2', 'logbf': 'logb', - 'umax': 'max', 'llmax': 'max', 'ullmax': 'max', 'fmaxf': 'max', - 'fmax': 'max', 'umin': 'min', 'llmin': 'min', 'ullmin': 'min', - 'fminf': 'min', 'fmin': 'min', 'dmul_rd': 'mul_rd', 'fmul_rd': 'mul_rd', - 'dmul_rn': 'mul_rn', 'fmul_rn': 'mul_rn', 'dmul_ru': 'mul_ru', - 'fmul_ru': 'mul_ru', 'dmul_rz': 'mul_rz', 'fmul_rz': 'mul_rz', - 'umul24': 
'mul24', 'umulhi': 'mulhi', 'mul64hi': 'mulhi', - 'umul64hi': 'mulhi', 'nearbyintf': 'nearbyint', 'nextafterf': 'nextafter', - 'norm3df': 'norm3d', 'norm4df': 'norm4d', 'normcdff': 'normcdf', - 'normcdfinvf': 'normcdfinv', 'popcll': 'popc', 'powif': 'pow', 'powi': 'pow', - 'powf': 'pow', 'rcbrtf': 'rcbrt', 'frcp_rd': 'rcp_rd', 'drcp_rd': 'rcp_rd', - 'frcp_rn': 'rcp_rn', 'drcp_rn': 'rcp_rn', 'frcp_ru': 'rcp_ru', - 'drcp_ru': 'rcp_ru', 'frcp_rz': 'rcp_rz', 'drcp_rz': 'rcp_rz', - 'remainderf': 'remainder', 'urhadd': 'rhadd', 'rhypotf': 'rhypot', - 'rintf': 'rint', 'rnorm3df': 'rnorm3d', 'rnorm4df': 'rnorm4d', - 'roundf': 'round', 'rsqrtf': 'rsqrt', 'frsqrt_rn': 'rsqrt_rn', - 'usad': 'sad', 'scalbnf': 'scalbn', 'signbitf': 'signbit', - 'signbitd': 'signbit', 'sinf': 'sin', 'sinhf': 'sinh', - 'sinpif': 'sinpi', 'sqrtf': 'sqrt', 'fsqrt_rd': 'sqrt_rd', - 'dsqrt_rd': 'sqrt_rd', 'fsqrt_rn': 'sqrt_rn', 'dsqrt_rn': 'sqrt_rn', - 'fsqrt_ru': 'sqrt_ru', 'dsqrt_ru': 'sqrt_ru', 'fsqrt_rz': 'sqrt_rz', - 'dsqrt_rz': 'sqrt_rz', 'fsub_rd': 'sub_rd', 'dsub_rd': 'sub_rd', - 'fsub_rn': 'sub_rn', 'dsub_rn': 'sub_rn', 'fsub_ru': 'sub_ru', - 'dsub_ru': 'sub_ru', 'fsub_rz': 'sub_rz', 'dsub_rz': 'sub_rz', - 'tanf': 'tan', 'tanhf': 'tanh', 'tgammaf': 'tgamma', 'truncf': 'trunc', - 'y0f': 'y0', 'y1f': 'y1', 'ynf': 'yn' - } - - for symbol in self._symbols.values(): - op_name = symbol.op_name - if op_name in renaming: - op_name = renaming[op_name] - symbol._op_name = op_name - if op_name in self._symbol_groups: - self._symbol_groups[op_name].append(symbol) - else: - self._symbol_groups[op_name] = [symbol] - - def parse_symbols(self, input_file) -> None: - if len(self.symbols) > 0: - return - output = subprocess.check_output(["grep", "define", input_file]).decode().splitlines() - for line in output: - symbol = self._extract_symbol(line) - if symbol is None: - continue - self._symbols[symbol.name] = symbol - - self._group_symbols() - - def _output_stubs(self) -> str: - # Generate python functions in the following format: - # @extern.extern - # def (, _builder=None): - # arg_type_symbol_dict = {[arg_type]: {(symbol, ret_type)}} - # return core.extern_elementwise("libdevice", , , , _builder) - import_str = "from . 
import core\n" - import_str += "import os\n" - import_str += "import functools\n" - - header_str = "" - header_str += "@functools.lru_cache()\n" - header_str += "def libdevice_path():\n" - header_str += " import torch\n" - header_str += " third_party_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), \"..\", \"third_party\")\n" - header_str += " if torch.version.hip is None:\n" - header_str += " default = os.path.join(third_party_dir, \"cuda\", \"lib\", \"libdevice.10.bc\")\n" - header_str += " else:\n" - header_str += " default = ''\n" - header_str += " return os.getenv(\"TRITON_LIBDEVICE_PATH\", default)\n" - func_str = "" - for symbols in self._symbol_groups.values(): - func_str += "@core.extern\n" - func_name_str = f"def {symbols[0].op_name}(" - for arg_name in symbols[0].arg_names: - func_name_str += f"{arg_name}, " - func_name_str += "_builder=None):\n" - - return_str = f"\treturn core.extern_elementwise(\"{self._name}\", libdevice_path(), [" - for arg_name in symbols[0].arg_names: - return_str += f"{arg_name}, " - return_str += "], \n" - - arg_type_symbol_dict_str = "{" - for symbol in symbols: - arg_type_symbol_dict_str += "(" - for arg_type in symbol.arg_types: - arg_type_symbol_dict_str += f'core.dtype("{arg_type}"),' - ret_type = f'core.dtype("{symbol.ret_type}")' - arg_type_symbol_dict_str += "): (\"" + symbol.name + "\", " + ret_type + "),\n" - arg_type_symbol_dict_str += "}" - - return_str += arg_type_symbol_dict_str - return_str += f", is_pure={self.is_pure}" - return_str += ", _builder=_builder)\n" - - func_str += func_name_str + return_str + "\n" - file_str = import_str + header_str + func_str - - return file_str - - -class LLVMDisassembler: - _path: str - _ll_file: str - - def __init__(self, path) -> None: - ''' - Invoke llvm-dis to disassemble the given file. - :param path: path to llvm-dis - ''' - self._path = path - self._ll_file = "/tmp/extern_lib.ll" - - def disasm(self, lib_path: str) -> None: - subprocess.Popen([self._path, lib_path, "-o", self.ll_file], - stdout=subprocess.PIPE).communicate() - - @property - def ll_file(self) -> str: - return self._ll_file - - @property - def path(self) -> str: - return self._path - - -extern_libs = ["libdevice"] - - -def build( - llvm_dis_path: str, - lib_path: str, - lib_name: str, - output_dir: str, -) -> None: - ''' - Interface function to build the library file. 
- :param llvm_dis_path: path to the llvm-dis binary - :param lib_path: path to the external library file - :param lib_name: name of the library - :param output_dir: path to the output directory - ''' - if lib_name == "libdevice": - extern_lib = Libdevice(lib_path) - else: - raise Exception(f"Unknown extern library: {lib_name}") - - llvm_disassembler = LLVMDisassembler(llvm_dis_path) - llvm_disassembler.disasm(lib_path) - - extern_lib.parse_symbols(llvm_disassembler.ll_file) - extern_lib.generate_stub_file(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--llvm-dis", dest="llvm_dis_path", help="Path to llvm-dis", default="llvm-dis") - parser.add_argument("--lib-path", dest="lib_path", help="Path to the extern library") - parser.add_argument("--lib-name", dest="lib_name", help="Name of the extern library") - parser.add_argument("--output", dest="output_dir", help="Output file path", default="/tmp/") - args = parser.parse_args() - - build(args.llvm_dis_path, args.lib_path, args.lib_name, args.output_dir) diff --git a/python/triton/tools/disasm.py b/python/triton/tools/disasm.py deleted file mode 100644 index 24a0787c5c16..000000000000 --- a/python/triton/tools/disasm.py +++ /dev/null @@ -1,122 +0,0 @@ -# MIT License - -# Copyright (c) 2020 Da Yan @ HKUST - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import re -import subprocess - -FLINE_RE = re.compile(r'\s*/\*\w{4}\*/\s*([^;]*;)\s*/\* 0x(\w{16}) \*/\s*') -SLINE_RE = re.compile(r'\s*/\* 0x(\w{16}) \*/\s*') -FNAME_RE = re.compile(r'\s*Function : (\w+)\s*') -BRA_RE = re.compile(r'(.*BRA(?:\.U)? 
)(0x\w+);') - - -def parseCtrl(sline): - enc = int(SLINE_RE.match(sline).group(1), 16) - stall = (enc >> 41) & 0xf - yld = (enc >> 45) & 0x1 - wrtdb = (enc >> 46) & 0x7 - readb = (enc >> 49) & 0x7 - watdb = (enc >> 52) & 0x3f - - yld_str = 'Y' if yld == 0 else '-' - wrtdb_str = '-' if wrtdb == 7 else str(wrtdb) - readb_str = '-' if readb == 7 else str(readb) - watdb_str = '--' if watdb == 0 else f'{watdb:02d}' - return f'{watdb_str}:{readb_str}:{wrtdb_str}:{yld_str}:{stall:x}' - - -def processSassLines(fline, sline, labels): - asm = FLINE_RE.match(fline).group(1) - # Remove tailing space - if asm.endswith(" ;"): - asm = asm[:-2] + ";" - ctrl = parseCtrl(sline) - # BRA target address - if BRA_RE.match(asm) is not None: - target = int(BRA_RE.match(asm).group(2), 16) - if target in labels: - pass - else: - labels[target] = len(labels) - return (f'{ctrl}', f'{asm}') - - -def extract(file_path, fun): - if fun is None: - sass_str = subprocess.check_output(["cuobjdump", "-sass", file_path]) - else: - sass_str = subprocess.check_output(["cuobjdump", "-fun", fun, "-sass", file_path]) - sass_lines = sass_str.splitlines() - line_idx = 0 - while line_idx < len(sass_lines): - line = sass_lines[line_idx].decode() - # format: - # function : - # .headerflags: ... - # /*0000*/ asmstr /*0x...*/ - # /*0x...*/ - - # Looking for new function header (function: ) - while FNAME_RE.match(line) is None: - line_idx += 1 - if line_idx < len(sass_lines): - line = sass_lines[line_idx].decode() - else: - return - - fname = FNAME_RE.match(line).group(1) - ret = '' - ret += f'Function:{fname}\n' - line_idx += 2 # bypass .headerflags - line = sass_lines[line_idx].decode() - # Remapping address to label - labels = {} # address -> label_idx - # store sass asm in buffer and them print them (for labels) - # (ctrl, asm) - asm_buffer = [] - while FLINE_RE.match(line) is not None: - # First line (Offset ASM Encoding) - fline = sass_lines[line_idx].decode() - line_idx += 1 - # Second line (Encoding) - sline = sass_lines[line_idx].decode() - line_idx += 1 - asm_buffer.append(processSassLines(fline, sline, labels)) - # peek the next line - line = sass_lines[line_idx].decode() - # Print sass - # label naming convention: LBB#i - for idx, (ctrl, asm) in enumerate(asm_buffer): - # Print label if this is BRA target - offset = idx * 16 - if offset in labels: - label_name = f'LBB{labels[offset]}' - ret += f'{label_name}:\n' - ret += ctrl + '\t' - # if this is BRA, remap offset to label - if BRA_RE.match(asm): - target = int(BRA_RE.match(asm).group(2), 16) - target_name = f'LBB{labels[target]}' - asm = BRA_RE.sub(rf'\1{target_name};', asm) - ret += asm + '\n' - ret += '\n' - return ret diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py deleted file mode 100644 index 3463ddf1ced1..000000000000 --- a/python/tutorials/01-vector-add.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Vector Addition -=============== - -In this tutorial, you will write a simple vector addition using Triton. - -In doing so, you will learn about: - -* The basic programming model of Triton. - -* The `triton.jit` decorator, which is used to define Triton kernels. - -* The best practices for validating and benchmarking your custom ops against native reference implementations. - -""" - -# %% -# Compute Kernel -# -------------- - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, # *Pointer* to first input vector. - y_ptr, # *Pointer* to second input vector. 
- output_ptr, # *Pointer* to output vector. - n_elements, # Size of the vector. - BLOCK_SIZE: tl.constexpr, # Number of elements each program should process. - # NOTE: `constexpr` so it can be used as a shape value. -): - # There are multiple 'programs' processing different data. We identify which program - # we are here: - pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0. - # This program will process inputs that are offset from the initial data. - # For instance, if you had a vector of length 256 and block_size of 64, the programs - # would each access the elements [0:64, 64:128, 128:192, 192:256]. - # Note that offsets is a list of pointers: - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - # Create a mask to guard memory operations against out-of-bounds accesses. - mask = offsets < n_elements - # Load x and y from DRAM, masking out any extra elements in case the input is not a - # multiple of the block size. - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - # Write x + y back to DRAM. - tl.store(output_ptr + offsets, output, mask=mask) - - -# %% -# Let's also declare a helper function to (1) allocate the `z` tensor -# and (2) enqueue the above kernel with appropriate grid/block sizes: - - -def add(x: torch.Tensor, y: torch.Tensor): - # We need to preallocate the output. - output = torch.empty_like(x) - assert x.is_cuda and y.is_cuda and output.is_cuda - n_elements = output.numel() - # The SPMD launch grid denotes the number of kernel instances that run in parallel. - # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]. - # In this case, we use a 1D grid where the size is the number of blocks: - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - # NOTE: - # - Each torch.tensor object is implicitly converted into a pointer to its first element. - # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel. - # - Don't forget to pass meta-parameters as keywords arguments. - add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still - # running asynchronously at this point. - return output - - -# %% -# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness: - -torch.manual_seed(0) -size = 98432 -x = torch.rand(size, device='cuda') -y = torch.rand(size, device='cuda') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) - -# %% -# Seems like we're good to go! - -# %% -# Benchmark -# --------- -# -# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. -# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops. -# for different problem sizes. - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['size'], # Argument names to use as an x-axis for the plot. - x_vals=[ - 2 ** i for i in range(12, 28, 1) - ], # Different possible values for `x_name`. - x_log=True, # x axis is logarithmic. - line_arg='provider', # Argument name whose value corresponds to a different line in the plot. 
- line_vals=['triton', 'torch'], # Possible values for `line_arg`. - line_names=['Triton', 'Torch'], # Label name for the lines. - styles=[('blue', '-'), ('green', '-')], # Line styles. - ylabel='GB/s', # Label name for the y-axis. - plot_name='vector-add-performance', # Name for the plot. Used also as a file name for saving the plot. - args={}, # Values for function arguments not in `x_names` and `y_name`. - ) -) -def benchmark(size, provider): - x = torch.rand(size, device='cuda', dtype=torch.float32) - y = torch.rand(size, device='cuda', dtype=torch.float32) - quantiles = [0.5, 0.2, 0.8] - if provider == 'torch': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y, quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y), quantiles=quantiles) - gbps = lambda ms: 12 * size / ms * 1e-6 - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -# %% -# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or -# `save_path='/path/to/results/' to save them to disk along with raw CSV data: -benchmark.run(print_data=True, show_plots=True) diff --git a/python/tutorials/02-fused-softmax.py b/python/tutorials/02-fused-softmax.py deleted file mode 100644 index 13383cc1c783..000000000000 --- a/python/tutorials/02-fused-softmax.py +++ /dev/null @@ -1,200 +0,0 @@ -""" -Fused Softmax -============= - -In this tutorial, you will write a fused softmax operation that is significantly faster -than PyTorch's native op for a particular class of matrices: those whose rows can fit in -the GPU's SRAM. - -In doing so, you will learn about: - -* The benefits of kernel fusion for bandwidth-bound operations. - -* Reduction operators in Triton. - -""" - -# %% -# Motivations -# ----------- -# -# Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice. -# Let us consider instead the case of a simple (numerically stabilized) softmax operation: - -import torch - -import triton -import triton.language as tl - - -@torch.jit.script -def naive_softmax(x): - """Compute row-wise softmax of X using native pytorch - - We subtract the maximum element in order to avoid overflows. Softmax is invariant to - this shift. - """ - # read MN elements ; write M elements - x_max = x.max(dim=1)[0] - # read MN + M elements ; write MN elements - z = x - x_max[:, None] - # read MN elements ; write MN elements - numerator = torch.exp(z) - # read MN elements ; write M elements - denominator = numerator.sum(dim=1) - # read MN + M elements ; write MN elements - ret = numerator / denominator[:, None] - # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements - return ret - - -# %% -# When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}` -# requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements. -# This is obviously wasteful; we'd prefer to have a custom "fused" kernel that only reads -# X once and does all the necessary computations on-chip. -# Doing so would require reading and writing back only :math:`MN` bytes, so we could -# expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`). -# The `torch.jit.script` flags aims to perform this kind of "kernel fusion" automatically -# but, as we will see later, it is still far from ideal. 
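The arithmetic behind the ~4x estimate above can be sanity-checked with a short, self-contained sketch. It is not tied to any Triton API; it simply tallies bytes, assuming fp32 elements (4 bytes each) and an illustrative 4096 x 1024 matrix, and the helper name `softmax_dram_traffic` is introduced here purely for illustration:

def softmax_dram_traffic(M: int, N: int, bytes_per_elem: int = 4):
    # Naive softmax: reads 5*M*N + 2*M elements and writes 3*M*N + 2*M elements,
    # i.e. 8*M*N + 4*M elements of DRAM traffic in total.
    naive_bytes = (8 * M * N + 4 * M) * bytes_per_elem
    # Fused kernel: reads each input element once and writes each output element once.
    fused_bytes = 2 * M * N * bytes_per_elem
    return naive_bytes, fused_bytes, naive_bytes / fused_bytes

naive, fused, speedup = softmax_dram_traffic(4096, 1024)
# Prints roughly 4.00, i.e. the (8MN + 4M) / 2MN ratio quoted in the text above.
print(f"naive: {naive} B, fused: {fused} B, expected speed-up ~{speedup:.2f}x")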
- -# %% -# Compute Kernel -# -------------- -# -# Our softmax kernel works as follows: each program loads a row of the input matrix X, -# normalizes it and writes back the result to the output Y. -# -# Note that one important limitation of Triton is that each block must have a -# power-of-two number of elements, so we need to internally "pad" each row and guard the -# memory operations properly if we want to handle any possible input shapes: - - -@triton.jit -def softmax_kernel( - output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, - BLOCK_SIZE: tl.constexpr -): - # The rows of the softmax are independent, so we parallelize across those - row_idx = tl.program_id(0) - # The stride represents how much we need to increase the pointer to advance 1 row - row_start_ptr = input_ptr + row_idx * input_row_stride - # The block size is the next power of two greater than n_cols, so we can fit each - # row in a single block - col_offsets = tl.arange(0, BLOCK_SIZE) - input_ptrs = row_start_ptr + col_offsets - # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols - row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf')) - # Subtract maximum for numerical stability - row_minus_max = row - tl.max(row, axis=0) - # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) - numerator = tl.exp(row_minus_max) - denominator = tl.sum(numerator, axis=0) - softmax_output = numerator / denominator - # Write back output to DRAM - output_row_start_ptr = output_ptr + row_idx * output_row_stride - output_ptrs = output_row_start_ptr + col_offsets - tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols) - - -# %% -# We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor. - - -def softmax(x): - n_rows, n_cols = x.shape - # The block size is the smallest power of two greater than the number of columns in `x` - BLOCK_SIZE = triton.next_power_of_2(n_cols) - # Another trick we can use is to ask the compiler to use more threads per row by - # increasing the number of warps (`num_warps`) over which each row is distributed. - # You will see in the next tutorial how to auto-tune this value in a more natural - # way so you don't have to come up with manual heuristics yourself. - num_warps = 4 - if BLOCK_SIZE >= 2048: - num_warps = 8 - if BLOCK_SIZE >= 4096: - num_warps = 16 - # Allocate output - y = torch.empty_like(x) - # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row o - # f the input matrix - softmax_kernel[(n_rows,)]( - y, - x, - x.stride(0), - y.stride(0), - n_cols, - num_warps=num_warps, - BLOCK_SIZE=BLOCK_SIZE, - ) - return y - - -# %% -# Unit Test -# --------- - -# %% -# We make sure that we test our kernel on a matrix with an irregular number of rows and columns. -# This will allow us to verify that our padding mechanism works. - -torch.manual_seed(0) -x = torch.randn(1823, 781, device='cuda') -y_triton = softmax(x) -y_torch = torch.softmax(x, axis=1) -assert torch.allclose(y_triton, y_torch), (y_triton, y_torch) - -# %% -# As expected, the results are identical. - -# %% -# Benchmark -# --------- -# -# Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows. -# We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above. 
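Before the benchmark code directly below, it is worth seeing how a measured runtime translates into the GB/s figures that get plotted. A minimal sketch, assuming fp32 data and counting one read plus one write of the full matrix (the same accounting the benchmark below uses), might look as follows; the helper name is hypothetical:

def effective_bandwidth_gbps(M: int, N: int, ms: float, bytes_per_elem: int = 4) -> float:
    # Total DRAM traffic of the fused kernel: M*N elements read + M*N elements written.
    total_bytes = 2 * M * N * bytes_per_elem
    # Convert bytes per millisecond to GB/s: 1e-9 GB per byte, 1e-3 s per ms.
    return total_bytes * 1e-9 / (ms * 1e-3)

# Example: a 4096 x 1024 fp32 matrix processed in 0.05 ms corresponds to ~671 GB/s.
print(effective_bandwidth_gbps(4096, 1024, 0.05))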
- - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['N'], # argument names to use as an x-axis for the plot - x_vals=[ - 128 * i for i in range(2, 100) - ], # different possible values for `x_name` - line_arg='provider', # argument name whose value corresponds to a different line in the plot - line_vals=[ - 'triton', - 'torch-native', - 'torch-jit', - ], # possible values for `line_arg`` - line_names=[ - "Triton", - "Torch (native)", - "Torch (jit)", - ], # label name for the lines - styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles - ylabel="GB/s", # label name for the y-axis - plot_name="softmax-performance", # name for the plot. Used also as a file name for saving the plot. - args={'M': 4096}, # values for function arguments not in `x_names` and `y_name` - ) -) -def benchmark(M, N, provider): - x = torch.randn(M, N, device='cuda', dtype=torch.float32) - quantiles = [0.5, 0.2, 0.8] - if provider == 'torch-native': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1), quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x), quantiles=quantiles) - if provider == 'torch-jit': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x), quantiles=quantiles) - gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3) - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -benchmark.run(show_plots=True, print_data=True) - -# %% -# In the above plot, we can see that: -# - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here. -# - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. -# Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape. diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py deleted file mode 100644 index 8bcae2007abd..000000000000 --- a/python/tutorials/03-matrix-multiplication.py +++ /dev/null @@ -1,350 +0,0 @@ -""" -Matrix Multiplication -===================== -In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves -performance on parallel with cuBLAS. - -You will specifically learn about: - -* Block-level matrix multiplications. - -* Multi-dimensional pointer arithmetics. - -* Program re-ordering for improved L2 cache hit rate. - -* Automatic performance tuning. - -""" - -# %% -# Motivations -# ----------- -# -# Matrix multiplications are a key building block of most modern high-performance computing systems. -# They are notoriously hard to optimize, hence their implementation is generally done by -# hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS). -# Unfortunately, these libraries are often proprietary and cannot be easily customized -# to accommodate the needs of modern deep learning workloads (e.g., fused activation functions). -# In this tutorial, you will learn how to implement efficient matrix multiplications by -# yourself with Triton, in a way that is easy to customize and extend. -# -# Roughly speaking, the kernel that we will write will implement the following blocked -# algorithm to multiply a (M, K) by a (K, N) matrix: -# -# .. 
code-block:: python -# -# # Do in parallel -# for m in range(0, M, BLOCK_SIZE_M): -# # Do in parallel -# for n in range(0, N, BLOCK_SIZE_N): -# acc = zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=float32) -# for k in range(0, K, BLOCK_SIZE_K): -# a = A[m : m+BLOCK_SIZE_M, k : k+BLOCK_SIZE_K] -# b = B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N] -# acc += dot(a, b) -# C[m : m+BLOCK_SIZE_M, n : n+BLOCK_SIZE_N] = acc -# -# where each iteration of the doubly-nested for-loop is performed by a dedicated Triton program instance. - -# %% -# Compute Kernel -# -------------- -# -# The above algorithm is, actually, fairly straightforward to implement in Triton. -# The main difficulty comes from the computation of the memory locations at which blocks -# of :code:`A` and :code:`B` must be read in the inner loop. For that, we need -# multi-dimensional pointer arithmetics. -# -# Pointer Arithmetics -# ~~~~~~~~~~~~~~~~~~~ -# -# For a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given b -# y :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`. -# Therefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and -# :code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as: -# -# .. code-block:: python -# -# &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] = a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1); -# &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] = b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1); -# -# Which means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as the following -# code. Also note that we need an extra modulo to handle the case where :code:`M` is not a multiple of -# :code:`BLOCK_SIZE_M` or :code:`N` is not a multiple of :code:`BLOCK_SIZE_N`, in which case we can pad the data with -# some useless values, which will not contribute to the results. For the :code:`K` dimension, we will handle that later -# using masking load semantics. -# -# .. code-block:: python -# -# offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M -# offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N -# offs_k = tl.arange(0, BLOCK_SIZE_K) -# a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak) -# b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn) -# -# And then updated in the inner loop as follows: -# -# .. code-block:: python -# -# a_ptrs += BLOCK_SIZE_K * stride_ak; -# b_ptrs += BLOCK_SIZE_K * stride_bk; -# -# -# L2 Cache Optimizations -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]` -# block of :code:`C`. -# It is important to remember that the order in which these blocks are computed does -# matter, since it affects the L2 cache hit rate of our program. and unfortunately, a -# a simple row-major ordering -# -# .. code-block:: Python -# -# pid = triton.program_id(0); -# grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M; -# grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N; -# pid_m = pid / grid_n; -# pid_n = pid % grid_n; -# -# is just not going to cut it. -# -# One possible solution is to launch blocks in an order that promotes data reuse. -# This can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before -# switching to the next column: -# -# .. 
code-block:: python -# -# # Program ID -# pid = tl.program_id(axis=0) -# # Number of program ids along the M axis -# num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) -# # Number of programs ids along the N axis -# num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) -# # Number of programs in group -# num_pid_in_group = GROUP_SIZE_M * num_pid_n -# # Id of the group this program is in -# group_id = pid // num_pid_in_group -# # Row-id of the first program in the group -# first_pid_m = group_id * GROUP_SIZE_M -# # If `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller -# group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) -# # *Within groups*, programs are ordered in a column-major order -# # Row-id of the program in the *launch grid* -# pid_m = first_pid_m + (pid % group_size_m) -# # Col-id of the program in the *launch grid* -# pid_n = (pid % num_pid_in_group) // group_size_m -# -# For example, in the following matmul where each matrix is 9 blocks by 9 blocks, -# we can see that if we compute the output in row-major ordering, we need to load 90 -# blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped -# ordering, we only need to load 54 blocks. -# -# .. image:: grouped_vs_row_major_ordering.png -# -# In practice, this can improve the performance of our matrix multiplication kernel by -# more than 10\% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100). -# - -# %% -# Final Result -# ------------ - -import torch - -import triton -import triton.language as tl - - -# `triton.jit`'ed functions can be auto-tuned by using the `triton.autotune` decorator, which consumes: -# - A list of `triton.Config` objects that define different configurations of -# meta-parameters (e.g., `BLOCK_SIZE_M`) and compilation options (e.g., `num_warps`) to try -# - An auto-tuning *key* whose change in values will trigger evaluation of all the -# provided configs -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - ], - key=['M', 'N', 'K'], -) -@triton.jit -def matmul_kernel( - # Pointers to matrices - a_ptr, b_ptr, c_ptr, - # Matrix dimensions - M, N, K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). 
- stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - ACTIVATION: tl.constexpr, -): - """Kernel for computing the matmul C = A x B. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - # See above `L2 Cache Optimizations` section for details. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction - # and accumulate - # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers - # See above `Pointer Arithmetics` section for details - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop. - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. - # If it is out of bounds, set it to 0. - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the ptrs to the next K block. - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += BLOCK_SIZE_K * stride_bk - # You can fuse arbitrary activation functions here - # while the accumulator is still in FP32! - if ACTIVATION == "leaky_relu": - accumulator = leaky_relu(accumulator) - c = accumulator.to(tl.float16) - - # ----------------------------------------------------------- - # Write back the block of the output matrix C with masks. - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) - - -# We can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`. 
-@triton.jit -def leaky_relu(x): - x = x + 1 - return tl.where(x >= 0, x, 0.01 * x) - - -# %% -# We can now create a convenience wrapper function that only takes two input tensors, -# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel. - - -def matmul(a, b, activation=""): - # Check constraints. - assert a.shape[1] == b.shape[0], "Incompatible dimensions" - assert a.is_contiguous(), "Matrix A must be contiguous" - assert b.is_contiguous(), "Matrix B must be contiguous" - M, K = a.shape - K, N = b.shape - # Allocates output. - c = torch.empty((M, N), device=a.device, dtype=a.dtype) - # 1D launch kernel where each block gets its own program. - grid = lambda META: ( - triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), - ) - matmul_kernel[grid]( - a, b, c, - M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - ACTIVATION=activation - ) - return c - - -# %% -# Unit Test -# --------- -# -# We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS). - -torch.manual_seed(0) -a = torch.randn((512, 512), device='cuda', dtype=torch.float16) -b = torch.randn((512, 512), device='cuda', dtype=torch.float16) -triton_output = matmul(a, b) -torch_output = torch.matmul(a, b) -print(f"triton_output={triton_output}") -print(f"torch_output={torch_output}") -if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0): - print("✅ Triton and Torch match") -else: - print("❌ Triton and Torch differ") - -# %% -# Benchmark -# --------- -# -# Square Matrix Performance -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, -# but feel free to arrange this script as you wish to benchmark any other matrix shape. - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=['M', 'N', 'K'], # Argument names to use as an x-axis for the plot - x_vals=[ - 128 * i for i in range(2, 33) - ], # Different possible values for `x_name` - line_arg='provider', # Argument name whose value corresponds to a different line in the plot - # Possible values for `line_arg` - line_vals=['cublas', 'triton'], - # Label name for the lines - line_names=["cuBLAS", "Triton"], - # Line styles - styles=[('green', '-'), ('blue', '-')], - ylabel="TFLOPS", # Label name for the y-axis - plot_name="matmul-performance", # Name for the plot, used also as a file name for saving the plot. - args={}, - ) -) -def benchmark(M, N, K, provider): - a = torch.randn((M, K), device='cuda', dtype=torch.float16) - b = torch.randn((K, N), device='cuda', dtype=torch.float16) - quantiles = [0.5, 0.2, 0.8] - if provider == 'cublas': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles) - if provider == 'triton': - ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b), quantiles=quantiles) - perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3) - return perf(ms), perf(max_ms), perf(min_ms) - - -benchmark.run(show_plots=True, print_data=True) diff --git a/python/tutorials/04-low-memory-dropout.py b/python/tutorials/04-low-memory-dropout.py deleted file mode 100644 index 3c4d217e22b0..000000000000 --- a/python/tutorials/04-low-memory-dropout.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Low-Memory Dropout -================== - -In this tutorial, you will write a memory-efficient implementation of dropout whose state -will be composed of a single int32 seed. 
This differs from more traditional implementations of dropout, -whose state is generally composed of a bit mask tensor of the same shape as the input. - -In doing so, you will learn about: - -* The limitations of naive implementations of Dropout with PyTorch. - -* Parallel pseudo-random number generation in Triton. - -""" - -# %% -# Baseline -# -------- -# -# The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance -# of deep neural networks in low-data regime (i.e. regularization). -# -# It takes a vector as input and produces a vector of the same shape as output. Each scalar in the -# output has a probability :math:`p` of being changed to zero and otherwise it is copied from the input. -# This forces the network to perform well even when only :math:`1 - p` scalars from the input are available. -# -# At evaluation time we want to use the full power of the network so we set :math:`p=0`. Naively this would -# increase the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease -# in the output softmax temperature). To prevent this we multiply the output by :math:`\frac{1}{1 - p}`, which -# keeps the norm consistent regardless of the dropout probability. -# -# Let's first take a look at the baseline implementation. - - -import tabulate -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _dropout( - x_ptr, # pointer to the input - x_keep_ptr, # pointer to a mask of 0s and 1s - output_ptr, # pointer to the output - n_elements, # number of elements in the `x` tensor - p, # probability that an element of `x` is changed to zero - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - # Load data - x = tl.load(x_ptr + offsets, mask=mask) - x_keep = tl.load(x_keep_ptr + offsets, mask=mask) - # The line below is the crucial part, described in the paragraph above! - output = tl.where(x_keep, x / (1 - p), 0.0) - # Write-back output - tl.store(output_ptr + offsets, output, mask=mask) - - -def dropout(x, x_keep, p): - output = torch.empty_like(x) - assert x.is_contiguous() - n_elements = x.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024) - return output - - -# Input tensor -x = torch.randn(size=(10,)).cuda() -# Dropout mask -p = 0.5 -x_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda() -# -output = dropout(x, x_keep=x_keep, p=p) -print(tabulate.tabulate([ - ["input"] + x.tolist(), - ["keep mask"] + x_keep.tolist(), - ["output"] + output.tolist() -])) - -# %% -# Seeded dropout -# -------------- -# -# The above implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly -# we need to store the dropout mask for backpropagation. Secondly, dropout state management can get -# very tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in -# https://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation -# that (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management -# of persisting randomness across multiple invocations of the kernel. -# -# Pseudo-random number generation in Triton is simple! 
In this tutorial we will use the -# :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` -# values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides -# other :ref:`random number generation strategies `. -# -# .. note:: -# Triton's implementation of PRNG is based on the Philox algorithm (described in [SALMON2011]_). -# -# Let's put it all together. - - -@triton.jit -def _seeded_dropout( - x_ptr, - output_ptr, - n_elements, - p, - seed, - BLOCK_SIZE: tl.constexpr, -): - # compute memory offsets of elements handled by this instance - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - # load data from x - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - # randomly prune it - random = tl.rand(seed, offsets) - x_keep = random > p - # write-back - output = tl.where(x_keep, x / (1 - p), 0.0) - tl.store(output_ptr + offsets, output, mask=mask) - - -def seeded_dropout(x, p, seed): - output = torch.empty_like(x) - assert x.is_contiguous() - n_elements = x.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024) - return output - - -x = torch.randn(size=(10,)).cuda() -# Compare this to the baseline - dropout mask is never instantiated! -output = seeded_dropout(x, p=0.5, seed=123) -output2 = seeded_dropout(x, p=0.5, seed=123) -output3 = seeded_dropout(x, p=0.5, seed=512) - -print(tabulate.tabulate([ - ["input"] + x.tolist(), - ["output (seed = 123)"] + output.tolist(), - ["output (seed = 123)"] + output2.tolist(), - ["output (seed = 512)"] + output3.tolist() -])) - -# %% -# Et Voilà! We have a Triton kernel that applies the same dropout mask provided the seed is the same! -# If you'd like to explore further applications of pseudorandomness in GPU programming, we encourage you -# to explore the `triton/language/random` folder! - -# %% -# Exercises -# --------- -# -# 1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row. -# 2. Add support for striding. -# 3. (challenge) Implement a kernel for the sparse Johnson-Lindenstrauss transform which generates the projection matrix on the fly each time using a seed. - -# %% -# References -# ---------- -# -# .. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, "Parallel Random Numbers: As Easy as 1, 2, 3", 2011 -# .. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014 diff --git a/python/tutorials/05-layer-norm.py b/python/tutorials/05-layer-norm.py deleted file mode 100644 index 1737e7e36345..000000000000 --- a/python/tutorials/05-layer-norm.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -Layer Normalization -==================== -In this tutorial, you will write a high-performance layer normalization -kernel that runs faster than the PyTorch implementation. - -In doing so, you will learn about: - -* Implementing backward pass in Triton. - -* Implementing parallel reduction in Triton. - -""" - -# %% -# Motivations -# ----------- -# -# The *LayerNorm* operator was first introduced in [BA2016]_ as a way to improve the performance -# of sequential models (e.g., Transformers) or neural networks with small batch size.
-# It takes a vector :math:`x` as input and produces a vector :math:`y` of the same shape as output. -# The normalization is performed by subtracting the mean and dividing by the standard deviation of :math:`x`. -# After the normalization, a learnable linear transformation with weights :math:`w` and biases :math:`b` is applied. -# The forward pass can be expressed as follows: -# -# .. math:: -# y = \frac{ x - \text{E}[x] }{ \sqrt{\text{Var}(x) + \epsilon} } * w + b -# -# where :math:`\epsilon` is a small constant added to the denominator for numerical stability. -# Let’s first take a look at the forward pass implementation. - -import torch - -import triton -import triton.language as tl - -try: - # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it - # should not be added to extras_require in setup.py. - import apex - HAS_APEX = True -except ModuleNotFoundError: - HAS_APEX = False - - -@triton.jit -def _layer_norm_fwd_fused( - X, # pointer to the input - Y, # pointer to the output - W, # pointer to the weights - B, # pointer to the biases - Mean, # pointer to the mean - Rstd, # pointer to the 1/std - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X - eps, # epsilon to avoid division by zero - BLOCK_SIZE: tl.constexpr, -): - # Map the program id to the row of X and Y it should compute. - row = tl.program_id(0) - Y += row * stride - X += row * stride - # Compute mean - mean = 0 - _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - a = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) - _mean += a - mean = tl.sum(_mean, axis=0) / N - # Compute variance - _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) - x = tl.where(cols < N, x - mean, 0.) - _var += x * x - var = tl.sum(_var, axis=0) / N - rstd = 1 / tl.sqrt(var + eps) - # Write mean / rstd - tl.store(Mean + row, mean) - tl.store(Rstd + row, rstd) - # Normalize and apply linear transformation - for off in range(0, N, BLOCK_SIZE): - cols = off + tl.arange(0, BLOCK_SIZE) - mask = cols < N - w = tl.load(W + cols, mask=mask) - b = tl.load(B + cols, mask=mask) - x = tl.load(X + cols, mask=mask, other=0.).to(tl.float32) - x_hat = (x - mean) * rstd - y = x_hat * w + b - # Write output - tl.store(Y + cols, y, mask=mask) - - -# %% -# Backward pass -# ------------- -# -# The backward pass for the layer normalization operator is a bit more involved than the forward pass. -# Let :math:`\hat{x}` be the normalized inputs :math:`\frac{ x - \text{E}[x] }{ \sqrt{\text{Var}(x) + \epsilon} }` before the linear transformation, -# the Vector-Jacobian Products (VJP) :math:`\nabla_{x}` of :math:`x` are given by: -# -# .. math:: -# \nabla_{x} = \frac{1}{\sigma}\Big( \nabla_{y} \odot w - \underbrace{ \big( \frac{1}{N} \hat{x} \cdot (\nabla_{y} \odot w) \big) }_{c_1} \odot \hat{x} - \underbrace{ \frac{1}{N} \nabla_{y} \cdot w }_{c_2} \Big) -# -# where :math:`\odot` denotes the element-wise multiplication, :math:`\cdot` denotes the dot product, and :math:`\sigma` is the standard deviation. -# :math:`c_1` and :math:`c_2` are intermediate constants that improve the readability of the following implementation. -# -# For the weights :math:`w` and biases :math:`b`, the VJPs :math:`\nabla_{w}` and :math:`\nabla_{b}` are more straightforward: -# -# .. 
math:: -# \nabla_{w} = \nabla_{y} \odot \hat{x} \quad \text{and} \quad \nabla_{b} = \nabla_{y} -# -# Since the same weights :math:`w` and biases :math:`b` are used for all rows in the same batch, their gradients need to sum up. -# To perform this step efficiently, we use a parallel reduction strategy: each kernel instance accumulates -# partial :math:`\nabla_{w}` and :math:`\nabla_{b}` across certain rows into one of :math:`\text{GROUP_SIZE_M}` independent buffers. -# These buffers stay in the L2 cache and then are further reduced by another function to compute the actual :math:`\nabla_{w}` and :math:`\nabla_{b}`. -# -# Let the number of input rows :math:`M = 4` and :math:`\text{GROUP_SIZE_M} = 2`, -# here's a diagram of the parallel reduction strategy for :math:`\nabla_{w}` (:math:`\nabla_{b}` is omitted for brevity): -# -# .. image:: parallel_reduction.png -# -# In Stage 1, the rows of X that have the same color share the same buffer and thus a lock is used to ensure that only one kernel instance writes to the buffer at a time. -# In Stage 2, the buffers are further reduced to compute the final :math:`\nabla_{w}` and :math:`\nabla_{b}`. -# In the following implementation, Stage 1 is implemented by the function :code:`_layer_norm_bwd_dx_fused` and Stage 2 is implemented by the function :code:`_layer_norm_bwd_dwdb`. - -@triton.jit -def _layer_norm_bwd_dx_fused( - DX, # pointer to the input gradient - DY, # pointer to the output gradient - DW, # pointer to the partial sum of weights gradient - DB, # pointer to the partial sum of biases gradient - X, # pointer to the input - W, # pointer to the weights - B, # pointer to the biases - Mean, # pointer to the mean - Rstd, # pointer to the 1/std - Lock, # pointer to the lock - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X - eps, # epsilon to avoid division by zero - GROUP_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - # Map the program id to the elements of X, DX, and DY it should compute. - row = tl.program_id(0) - cols = tl.arange(0, BLOCK_SIZE_N) - mask = cols < N - X += row * stride - DY += row * stride - DX += row * stride - # Offset locks and weights/biases gradient pointer for parallel reduction - lock_id = row % GROUP_SIZE_M - Lock += lock_id - Count = Lock + GROUP_SIZE_M - DW = DW + lock_id * N + cols - DB = DB + lock_id * N + cols - # Load data to SRAM - x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) - dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) - w = tl.load(W + cols, mask=mask).to(tl.float32) - mean = tl.load(Mean + row) - rstd = tl.load(Rstd + row) - # Compute dx - xhat = (x - mean) * rstd - wdy = w * dy - xhat = tl.where(mask, xhat, 0.) - wdy = tl.where(mask, wdy, 0.) 
- c1 = tl.sum(xhat * wdy, axis=0) / N - c2 = tl.sum(wdy, axis=0) / N - dx = (wdy - (xhat * c1 + c2)) * rstd - # Write dx - tl.store(DX + cols, dx, mask=mask) - # Accumulate partial sums for dw/db - partial_dw = (dy * xhat).to(w.dtype) - partial_db = (dy).to(w.dtype) - while tl.atomic_cas(Lock, 0, 1) == 1: - pass - count = tl.load(Count) - # First store doesn't accumulate - if count == 0: - tl.atomic_xchg(Count, 1) - else: - partial_dw += tl.load(DW, mask=mask) - partial_db += tl.load(DB, mask=mask) - tl.store(DW, partial_dw, mask=mask) - tl.store(DB, partial_db, mask=mask) - # Release the lock - tl.atomic_xchg(Lock, 0) - - -@triton.jit -def _layer_norm_bwd_dwdb( - DW, # pointer to the partial sum of weights gradient - DB, # pointer to the partial sum of biases gradient - FINAL_DW, # pointer to the weights gradient - FINAL_DB, # pointer to the biases gradient - M, # GROUP_SIZE_M - N, # number of columns - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - # Map the program id to the elements of DW and DB it should compute. - pid = tl.program_id(0) - cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - # Iterate through the rows of DW and DB to sum the partial sums. - for i in range(0, M, BLOCK_SIZE_M): - rows = i + tl.arange(0, BLOCK_SIZE_M) - mask = (rows[:, None] < M) & (cols[None, :] < N) - offs = rows[:, None] * N + cols[None, :] - dw += tl.load(DW + offs, mask=mask, other=0.) - db += tl.load(DB + offs, mask=mask, other=0.) - # Write the final sum to the output. - sum_dw = tl.sum(dw, axis=0) - sum_db = tl.sum(db, axis=0) - tl.store(FINAL_DW + cols, sum_dw, mask=cols < N) - tl.store(FINAL_DB + cols, sum_db, mask=cols < N) - - -# %% -# Benchmark -# --------- -# -# We can now compare the performance of our kernel against that of PyTorch. -# Here we focus on inputs that have Less than 64KB per feature. -# Specifically, one can set :code:`'mode': 'backward'` to benchmark the backward pass. 
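Because :code:`mode` is just an entry in the benchmark's :code:`args` dictionary, timing the forward pass instead only requires swapping that entry. A sketch of such a variant of the configuration defined further below (the plot name here is illustrative):

.. code-block:: python

    import torch
    import triton

    # Hypothetical forward-pass variant of the layer-norm benchmark defined below;
    # only 'mode' in `args` (and the plot name) differ from the backward setup.
    forward_bench = triton.testing.Benchmark(
        x_names=['N'],
        x_vals=[512 * i for i in range(2, 32)],
        line_arg='provider',
        line_vals=['triton', 'torch'],
        line_names=['Triton', 'Torch'],
        styles=[('blue', '-'), ('green', '-')],
        ylabel='GB/s',
        plot_name='layer-norm-forward',
        args={'M': 4096, 'dtype': torch.float16, 'mode': 'forward'},
    )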
- - -class LayerNorm(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, normalized_shape, weight, bias, eps): - # allocate output - y = torch.empty_like(x) - # reshape input data into 2D tensor - x_arg = x.reshape(-1, x.shape[-1]) - M, N = x_arg.shape - mean = torch.empty((M, ), dtype=torch.float32, device='cuda') - rstd = torch.empty((M, ), dtype=torch.float32, device='cuda') - # Less than 64KB per feature: enqueue fused kernel - MAX_FUSED_SIZE = 65536 // x.element_size() - BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) - if N > BLOCK_SIZE: - raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") - # heuristics for number of warps - num_warps = min(max(BLOCK_SIZE // 256, 1), 8) - # enqueue kernel - _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd, - x_arg.stride(0), N, eps, - BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps) - ctx.save_for_backward(x, weight, bias, mean, rstd) - ctx.BLOCK_SIZE = BLOCK_SIZE - ctx.num_warps = num_warps - ctx.eps = eps - return y - - @staticmethod - def backward(ctx, dy): - x, w, b, m, v = ctx.saved_tensors - # heuristics for amount of parallel reduction stream for DW/DB - N = w.shape[0] - GROUP_SIZE_M = 64 - if N <= 8192: GROUP_SIZE_M = 96 - if N <= 4096: GROUP_SIZE_M = 128 - if N <= 1024: GROUP_SIZE_M = 256 - # allocate output - locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda') - _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) - _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) - dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) - db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) - dx = torch.empty_like(dy) - # enqueue kernel using forward pass heuristics - # also compute partial sums for DW and DB - x_arg = x.reshape(-1, x.shape[-1]) - M, N = x_arg.shape - _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks, - x_arg.stride(0), N, ctx.eps, - BLOCK_SIZE_N=ctx.BLOCK_SIZE, - GROUP_SIZE_M=GROUP_SIZE_M, - num_warps=ctx.num_warps) - grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])] - # accumulate partial sums in separate kernel - _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N, - BLOCK_SIZE_M=32, - BLOCK_SIZE_N=128) - return dx, None, dw, db, None - - -layer_norm = LayerNorm.apply - - -def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'): - # create data - x_shape = (M, N) - w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') - dy = .1 * torch.randn_like(x) - x.requires_grad_(True) - # forward pass - y_tri = layer_norm(x, w_shape, weight, bias, eps) - y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype) - # backward pass (triton) - y_tri.backward(dy, retain_graph=True) - dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]] - x.grad, weight.grad, bias.grad = None, None, None - # backward pass (torch) - y_ref.backward(dy, retain_graph=True) - dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]] - # compare - assert torch.allclose(y_tri, y_ref, atol=1e-2, rtol=0) - assert torch.allclose(dx_tri, dx_ref, atol=1e-2, rtol=0) - assert torch.allclose(db_tri, db_ref, atol=1e-2, rtol=0) - assert torch.allclose(dw_tri, dw_ref, atol=1e-2, rtol=0) - - -@triton.testing.perf_report( - triton.testing.Benchmark( - 
x_names=['N'], - x_vals=[512 * i for i in range(2, 32)], - line_arg='provider', - line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []), - line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []), - styles=[('blue', '-'), ('green', '-'), ('orange', '-')], - ylabel='GB/s', - plot_name='layer-norm-backward', - args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'} - ) -) -def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'): - # create data - x_shape = (M, N) - w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') - dy = .1 * torch.randn_like(x) - x.requires_grad_(True) - quantiles = [0.5, 0.2, 0.8] - # utility functions - if provider == 'triton': - y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps) - if provider == 'torch': - y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps) - if provider == 'apex': - apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype) - y_fwd = lambda: apex_layer_norm(x) - # forward pass - if mode == 'forward': - gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6 - ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, quantiles=quantiles, rep=500) - # backward pass - if mode == 'backward': - gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6 - y = y_fwd() - ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), - quantiles=quantiles, grad_to_none=[x], rep=500) - return gbps(ms), gbps(max_ms), gbps(min_ms) - - -test_layer_norm(1151, 8192, torch.float16) -bench_layer_norm.run(save_path='.', print_data=True) - -# %% -# References -# ---------- -# -# .. [BA2016] Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E. 
Hinton, "Layer Normalization", Arxiv 2016 diff --git a/python/tutorials/06-fused-attention.py b/python/tutorials/06-fused-attention.py deleted file mode 100644 index c9875cf358a2..000000000000 --- a/python/tutorials/06-fused-attention.py +++ /dev/null @@ -1,361 +0,0 @@ -""" -Fused Attention -=============== - -This is a Triton implementation of the Flash Attention algorithm -(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) -""" - -import pytest -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _fwd_kernel( - Q, K, V, sm_scale, - L, M, - Out, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - stride_oz, stride_oh, stride_om, stride_on, - Z, H, N_CTX, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hz = tl.program_id(1) - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk - off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk - off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk - # Initialize pointers to Q, K, V - q_ptrs = Q + off_q - k_ptrs = K + off_k - v_ptrs = V + off_v - # initialize pointer to m and l - m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # load q: it will stay in SRAM throughout - q = tl.load(q_ptrs) - # loop over k, v and update accumulator - for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): - # -- compute qk ---- - k = tl.load(k_ptrs) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) - # compute new m - m_curr = tl.maximum(tl.max(qk, 1), m_prev) - # correct old l - l_prev *= tl.exp(m_prev - m_curr) - # attention weights - p = tl.exp(qk - m_curr[:, None]) - l_curr = tl.sum(p, 1) + l_prev - # rescale operands of matmuls - l_rcp = 1. 
/ l_curr - p *= l_rcp[:, None] - acc *= (l_prev * l_rcp)[:, None] - # update acc - p = p.to(Q.dtype.element_ty) - v = tl.load(v_ptrs) - acc += tl.dot(p, v) - # update m_i and l_i - l_prev = l_curr - m_prev = m_curr - # update pointers - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - # rematerialize offsets to save registers - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # write back l and m - l_ptrs = L + off_hz * N_CTX + offs_m - m_ptrs = M + off_hz * N_CTX + offs_m - tl.store(l_ptrs, l_prev) - tl.store(m_ptrs, m_prev) - # initialize pointers to output - offs_n = tl.arange(0, BLOCK_DMODEL) - off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on - out_ptrs = Out + off_o - tl.store(out_ptrs, acc) - - -@triton.jit -def _bwd_preprocess( - Out, DO, L, - NewDO, Delta, - BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, -): - off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) - off_n = tl.arange(0, D_HEAD) - # load - o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) - denom = tl.load(L + off_m).to(tl.float32) - # compute - do = do / denom[:, None] - delta = tl.sum(o * do, axis=1) - # write-back - tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) - tl.store(Delta + off_m, delta) - - -@triton.jit -def _bwd_kernel( - Q, K, V, sm_scale, Out, DO, - DQ, DK, DV, - L, M, - D, - stride_qz, stride_qh, stride_qm, stride_qk, - stride_kz, stride_kh, stride_kn, stride_kk, - stride_vz, stride_vh, stride_vk, stride_vn, - Z, H, N_CTX, - num_block, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - off_hz = tl.program_id(0) - off_z = off_hz // H - off_h = off_hz % H - # offset pointers for batch/head - Q += off_z * stride_qz + off_h * stride_qh - K += off_z * stride_qz + off_h * stride_qh - V += off_z * stride_qz + off_h * stride_qh - DO += off_z * stride_qz + off_h * stride_qh - DQ += off_z * stride_qz + off_h * stride_qh - DK += off_z * stride_qz + off_h * stride_qh - DV += off_z * stride_qz + off_h * stride_qh - for start_n in range(0, num_block): - lo = start_n * BLOCK_M - # initialize row/col offsets - offs_qm = lo + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = tl.arange(0, BLOCK_N) - offs_k = tl.arange(0, BLOCK_DMODEL) - # initialize pointers to value-like data - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) - # pointer to row-wise quantities in value-like data - D_ptrs = D + off_hz * N_CTX - m_ptrs = M + off_hz * N_CTX - # initialize dv amd dk - dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # k and v stay in SRAM throughout - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - # loop over rows - for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): - offs_m_curr = start_m + offs_m - # load q, k, v, do on-chip - q = tl.load(q_ptrs) - # recompute p = softmax(qk, dim=-1).T - # NOTE: `do` is pre-divided by `l`; no normalization here - qk = tl.dot(q, tl.trans(k)) - qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) - m = 
tl.load(m_ptrs + offs_m_curr) - p = tl.exp(qk * sm_scale - m[:, None]) - # compute dv - do = tl.load(do_ptrs) - dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do) - # compute dp = dot(v, do) - Di = tl.load(D_ptrs + offs_m_curr) - dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] - dp += tl.dot(do, tl.trans(v)) - # compute ds = p * (dp - delta[:, None]) - ds = p * dp * sm_scale - # compute dk = dot(ds.T, q) - dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q) - # compute dq - dq = tl.load(dq_ptrs) - dq += tl.dot(ds.to(Q.dtype.element_ty), k) - tl.store(dq_ptrs, dq) - # increment pointers - dq_ptrs += BLOCK_M * stride_qm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_qm - # write-back - dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) - dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - - -empty = torch.empty(128, device="cuda") - - -class _attention(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, k, v, sm_scale): - BLOCK = 128 - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} - o = torch.empty_like(q) - grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) - L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) - num_warps = 4 if Lk <= 64 else 8 - - _fwd_kernel[grid]( - q, k, v, sm_scale, - L, m, - o, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - o.stride(0), o.stride(1), o.stride(2), o.stride(3), - q.shape[0], q.shape[1], q.shape[2], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=Lk, num_warps=num_warps, - num_stages=2, - ) - # print(h.asm["ttgir"]) - - ctx.save_for_backward(q, k, v, o, L, m) - ctx.grid = grid - ctx.sm_scale = sm_scale - ctx.BLOCK_DMODEL = Lk - return o - - @staticmethod - def backward(ctx, do): - BLOCK = 128 - q, k, v, o, l, m = ctx.saved_tensors - do = do.contiguous() - dq = torch.zeros_like(q, dtype=torch.float32) - dk = torch.empty_like(k) - dv = torch.empty_like(v) - do_scaled = torch.empty_like(do) - delta = torch.empty_like(l) - _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( - o, do, l, - do_scaled, delta, - BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, - ) - _bwd_kernel[(ctx.grid[1],)]( - q, k, v, ctx.sm_scale, - o, do_scaled, - dq, dk, dv, - l, m, - delta, - q.stride(0), q.stride(1), q.stride(2), q.stride(3), - k.stride(0), k.stride(1), k.stride(2), k.stride(3), - v.stride(0), v.stride(1), v.stride(2), v.stride(3), - q.shape[0], q.shape[1], q.shape[2], - ctx.grid[0], - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, - num_stages=1, - ) - # print(h.asm["ttgir"]) - return dq, dk, dv, None - - -attention = _attention.apply - - -@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) -def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16): - torch.manual_seed(20) - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() - sm_scale = 0.2 - dout = 
torch.randn_like(q) - # reference implementation - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) - p = torch.matmul(q, k.transpose(2, 3)) * sm_scale - for z in range(Z): - for h in range(H): - p[:, :, M == 0] = float("-inf") - p = torch.softmax(p.float(), dim=-1).half() - # p = torch.exp(p) - ref_out = torch.matmul(p, v) - ref_out.backward(dout) - ref_dv, v.grad = v.grad.clone(), None - ref_dk, k.grad = k.grad.clone(), None - ref_dq, q.grad = q.grad.clone(), None - # # triton implementation - tri_out = attention(q, k, v, sm_scale) - # print(ref_out) - # print(tri_out) - tri_out.backward(dout) - tri_dv, v.grad = v.grad.clone(), None - tri_dk, k.grad = k.grad.clone(), None - tri_dq, q.grad = q.grad.clone(), None - # compare - assert torch.allclose(ref_out, tri_out, atol=1e-2, rtol=0) - assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=0) - assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=0) - assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=0) - - -try: - from flash_attn.flash_attn_interface import flash_attn_func - HAS_FLASH = True -except BaseException: - HAS_FLASH = False - -BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64 -# vary seq length for fixed head and batch=4 -configs = [triton.testing.Benchmark( - x_names=['N_CTX'], - x_vals=[2**i for i in range(10, 14)], - line_arg='provider', - line_vals=['triton'] + (['flash'] if HAS_FLASH else []), - line_names=['Triton'] + (['Flash'] if HAS_FLASH else []), - styles=[('red', '-'), ('blue', '-')], - ylabel='ms', - plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-{mode}', - args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': torch.float16, 'mode': mode} -) for mode in ['fwd', 'bwd']] - - -@triton.testing.perf_report(configs) -def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, mode, provider, dtype=torch.float16, device="cuda"): - assert mode in ['fwd', 'bwd'] - warmup = 25 - rep = 100 - if provider == "triton": - q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) - sm_scale = 1.3 - fn = lambda: attention(q, k, v, sm_scale) - if mode == 'bwd': - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep) - return ms - if provider == "flash": - lengths = torch.full((BATCH,), fill_value=N_CTX, device=device) - cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32) - cu_seqlens[1:] = lengths.cumsum(0) - qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True) - fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True) - if mode == 'bwd': - o = fn() - do = torch.randn_like(o) - fn = lambda: o.backward(do, retain_graph=True) - ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep) - return ms - - -# only works on post-Ampere GPUs right now -bench_flash_attention.run(save_path='.', print_data=True) diff --git a/python/tutorials/07-math-functions.py b/python/tutorials/07-math-functions.py deleted file mode 100644 index 1ded3aa984d6..000000000000 --- a/python/tutorials/07-math-functions.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Libdevice (`tl.math`) function -============================== -Triton can invoke a custom function from an external library. 
-In this example, we will use the `libdevice` library (a.k.a. `math` in Triton) to apply `asin` to a tensor. -Please refer to https://docs.nvidia.com/cuda/libdevice-users-guide/index.html regarding the semantics of all available libdevice functions. -In `triton/language/math.py`, we try to aggregate functions with the same computation but different data types together. -For example, both `__nv_asin` and `__nv_asinf` calculate the principal value of the arc sine of the input, but `__nv_asin` operates on `double` and `__nv_asinf` operates on `float`. -Using Triton, you can simply call `tl.math.asin`. -Triton automatically selects the correct underlying device function to invoke based on input and output types. -""" - -# %% -# asin Kernel -# ------------ - -import torch - -import triton -import triton.language as tl - - -@triton.jit -def asin_kernel( - x_ptr, - y_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - x = tl.math.asin(x) - tl.store(y_ptr + offsets, x, mask=mask) - -# %% -# Using the default libdevice library path -# ----------------------------------------- -# We can use the default libdevice library path encoded in `triton/language/math.py`. - - -torch.manual_seed(0) -size = 98432 -x = torch.rand(size, device='cuda') -output_triton = torch.zeros(size, device='cuda') -output_torch = torch.asin(x) -assert x.is_cuda and output_triton.is_cuda -n_elements = output_torch.numel() -grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) -asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) - -# %% -# Customize the libdevice library path -# ------------------------------------- -# We can also customize the libdevice library path by passing the path to the `libdevice` library to the `asin` kernel. - -output_triton = torch.empty_like(x) -asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024, - extern_libs={'libdevice': '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'}) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) diff --git a/python/tutorials/08-experimental-block-pointer.py b/python/tutorials/08-experimental-block-pointer.py deleted file mode 100644 index 7147b69de6cc..000000000000 --- a/python/tutorials/08-experimental-block-pointer.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Block Pointer (Experimental) -============================ -This tutorial will guide you through writing a matrix multiplication algorithm that utilizes block pointer semantics. -These semantics are more friendly for Triton to optimize and can result in better performance on specific hardware. -Note that this feature is still experimental and may change in the future. - -""" - -# %% -# Motivations -# ----------- -# In the previous matrix multiplication tutorial, we constructed blocks of values by de-referencing blocks of pointers, -# i.e., :code:`load(block<pointer_type<element_type>>) -> block<element_type>`, which involved loading blocks of -# elements from memory. This approach allowed for flexibility in using hardware-managed cache and implementing complex -# data structures, such as tensors of trees or unstructured look-up tables.
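To make the contrast concrete, here is a minimal, self-contained sketch of that legacy pointer-arithmetic style (the :code:`copy_tile_kernel` name and the shapes are illustrative only, not code from this change): a :code:`[BLOCK_M, BLOCK_N]` block of pointers is built by broadcasting offsets against strides and then de-referenced with :code:`tl.load`.

.. code-block:: python

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_tile_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
        offs_m = tl.arange(0, BLOCK_M)
        offs_n = tl.arange(0, BLOCK_N)
        # A [BLOCK_M, BLOCK_N] block of pointers into the source tensor.
        ptrs = src_ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
        mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
        # load(block of pointers) -> block of values, as described above.
        tile = tl.load(ptrs, mask=mask, other=0.0)
        tl.store(dst_ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n,
                 tile, mask=mask)

    src = torch.randn((16, 16), device='cuda')
    dst = torch.empty_like(src)
    copy_tile_kernel[(1,)](src, dst, 16, 16, src.stride(0), src.stride(1),
                           BLOCK_M=16, BLOCK_N=16)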
-# -# However, the drawback of this approach is that it relies heavily on complex optimization passes by the compiler to -# optimize memory access patterns. This can result in brittle code that may suffer from performance degradation when the -# optimizer fails to perform adequately. Additionally, as memory controllers specialize to accommodate dense spatial -# data structures commonly used in machine learning workloads, this problem is likely to worsen. -# -# To address this issue, we will use block pointers :code:`pointer_type<block<element_type>>` and load them into -# :code:`block<element_type>`, which makes it easier for the compiler to optimize memory access -# patterns. -# -# Let's start with the previous matrix multiplication example and demonstrate how to rewrite it to utilize block pointer -# semantics. - -# %% -# Make a Block Pointer -# -------------------- -# A block pointer points to a block in a parent tensor and is constructed by the :code:`make_block_ptr` function, -# which takes the following information as arguments: -# -# * :code:`base`: the base pointer to the parent tensor; -# -# * :code:`shape`: the shape of the parent tensor; -# -# * :code:`strides`: the strides of the parent tensor, which means how much to increase the pointer by when moving by 1 element in a specific axis; -# -# * :code:`offsets`: the offsets of the block; -# -# * :code:`block_shape`: the shape of the block; -# -# * :code:`order`: the order of the block, which means how the block is laid out in memory. -# -# For example, to create a block pointer to a :code:`BLOCK_SIZE_M * BLOCK_SIZE_K` block in a row-major 2D matrix A at -# offsets :code:`(pid_m * BLOCK_SIZE_M, 0)` with strides :code:`(stride_am, stride_ak)`, we can use the following code -# (exactly the same as the previous matrix multiplication tutorial): -# -# .. code-block:: python -# -# a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), -# offsets=(pid_m * BLOCK_SIZE_M, 0), block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K), -# order=(1, 0)) -# -# Note that the :code:`order` argument is set to :code:`(1, 0)`, which means the second axis is the inner dimension in -# terms of storage, and the first axis is the outer dimension. This information may sound redundant, but it is necessary -# for some hardware backends to optimize for better performance. - -# %% -# Load/Store a Block Pointer -# -------------------------- -# To load/store a block pointer, we can use the :code:`load/store` function, which takes a block pointer as an argument, -# de-references it, and loads/stores a block. You may mask some values in the block; here we have an extra argument -# :code:`boundary_check` to specify whether to check the boundary of each axis for the block pointer. With the check on, -# out-of-bound values will be masked according to the :code:`padding_option` argument (load only), which can be -# :code:`zero` or :code:`nan`. Temporarily, we do not support other values due to some hardware limitations. In this -# mode, block pointer load/store does not support the :code:`mask` or :code:`other` arguments used in the legacy mode. -# -# So to load the block pointer of A in the previous section, we can simply write -# :code:`a = tl.load(a_block_ptr, boundary_check=(0, 1))`. Boundary checking may cost extra performance, so if you can -# guarantee that the block pointer is always in-bound in some axis, you can turn off the check by not passing the index -# into the :code:`boundary_check` argument.
For example, if we know that :code:`M` is a multiple of -# :code:`BLOCK_SIZE_M`, we can replace with :code:`a = tl.load(a_block_ptr, boundary_check=(1, ))`, since axis 0 is -# always in bound. - -# %% -# Advance a Block Pointer -# ----------------------- -# To advance a block pointer, we can use :code:`advance` function, which takes a block pointer and the increment for -# each axis as arguments and returns a new block pointer with the same shape and strides as the original one, -# but with the offsets advanced by the specified amount. -# -# For example, to advance the block pointer by :code:`BLOCK_SIZE_K` in the second axis -# (no need to multiply with strides), we can write :code:`a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K))`. - -# %% -# Final Result -# ------------ - -import torch - -import triton -import triton.language as tl - - -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2), - ], - key=['M', 'N', 'K'], -) -@triton.jit -def matmul_kernel_with_block_pointers( - # Pointers to matrices - a_ptr, b_ptr, c_ptr, - # Matrix dimensions - M, N, K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr -): - """Kernel for computing the matmul C = A x B. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - # See the matrix multiplication tutorial for details. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create block pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction and accumulate. - # See above `Make a Block Pointer` section for details. 
- a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), - offsets=(pid_m * BLOCK_SIZE_M, 0), block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K), - order=(1, 0)) - b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(K, N), strides=(stride_bk, stride_bn), - offsets=(0, pid_n * BLOCK_SIZE_N), block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_N), - order=(1, 0)) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block. - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop. - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, K, BLOCK_SIZE_K): - # Load with boundary checks, no need to calculate the mask manually. - # For better performance, you may remove some axis from the boundary - # check, if you can guarantee that the access is always in-bound in - # that axis. - # See above `Load/Store a Block Pointer` section for details. - a = tl.load(a_block_ptr, boundary_check=(0, 1)) - b = tl.load(b_block_ptr, boundary_check=(0, 1)) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the block pointer to the next K block. - # See above `Advance a Block Pointer` section for details. - a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K)) - b_block_ptr = tl.advance(b_block_ptr, (BLOCK_SIZE_K, 0)) - c = accumulator.to(tl.float16) - - # ---------------------------------------------------------------- - # Write back the block of the output matrix C with boundary checks. - # See above `Load/Store a Block Pointer` section for details. - c_block_ptr = tl.make_block_ptr(base=c_ptr, shape=(M, N), strides=(stride_cm, stride_cn), - offsets=(pid_m * BLOCK_SIZE_M, pid_n * BLOCK_SIZE_N), - block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_N), order=(1, 0)) - tl.store(c_block_ptr, c, boundary_check=(0, 1)) - - -# We can now create a convenience wrapper function that only takes two input tensors, -# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel. -def matmul(a, b): - # Check constraints. - assert a.shape[1] == b.shape[0], "Incompatible dimensions" - assert a.is_contiguous(), "Matrix A must be contiguous" - assert b.is_contiguous(), "Matrix B must be contiguous" - M, K = a.shape - K, N = b.shape - # Allocates output. - c = torch.empty((M, N), device=a.device, dtype=a.dtype) - # 1D launch kernel where each block gets its own program. - grid = lambda META: ( - triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), - ) - matmul_kernel_with_block_pointers[grid]( - a, b, c, - M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - ) - return c - - -# %% -# Unit Test -# --------- -# -# Still we can test our matrix multiplication with block pointers against a native torch implementation (i.e., cuBLAS). 
- -torch.manual_seed(0) -a = torch.randn((512, 512), device='cuda', dtype=torch.float16) -b = torch.randn((512, 512), device='cuda', dtype=torch.float16) -triton_output = matmul(a, b) -torch_output = torch.matmul(a, b) -print(f"triton_output={triton_output}") -print(f"torch_output={torch_output}") -if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0): - print("✅ Triton and Torch match") -else: - print("❌ Triton and Torch differ") diff --git a/python/tutorials/README.rst b/python/tutorials/README.rst deleted file mode 100644 index 1dfa5f4dca91..000000000000 --- a/python/tutorials/README.rst +++ /dev/null @@ -1,11 +0,0 @@ -Tutorials -========= - -Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. - -To install the dependencies for the tutorials: - -.. code-block:: bash - - cd triton - pip install -e './python[tutorials]' diff --git a/test/BUILD b/test/BUILD new file mode 100644 index 000000000000..6cb5c9e81afe --- /dev/null +++ b/test/BUILD @@ -0,0 +1,55 @@ +load("//third_party/llvm/build_defs:lit.bzl", "glob_lit_tests") +load("//tools/build_defs/build_test:build_test.bzl", "build_test") + +package( + default_compatible_with = ["//buildenv/target:gce"], + default_visibility = ["//third_party/triton:__subpackages__"], +) + +glob_lit_tests( + data = [ + "@llvm-project//llvm:FileCheck", + "//third_party/triton:triton-opt", + ], + driver = "@llvm-project//mlir:run_lit.sh", + exclude = [ + # These require adjusted RUN commands for python in google3. + "Target/tritongpu_to_llvmir.mlir", + "Target/tritongpu_to_ptx.mlir", + ], + test_file_exts = ["mlir"], +) + +cc_library( + name = "TritonTestAnalysis", + srcs = glob(["lib/Analysis/*.cpp"]), + deps = [ + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFToControlFlow", + "@llvm-project//mlir:Transforms", + "//third_party/triton:TritonAnalysis", + "//third_party/triton:TritonGPUDialect", + ], +) + +build_test( + name = "build_test", + allow_empty_target = False, + targets = [ + "//third_party/triton:TritonAnalysis", + "//third_party/triton:TritonDialect", + "//third_party/triton:TritonGPUDialect", + "//third_party/triton:TritonGPUToLLVM", + "//third_party/triton:TritonGPUTransforms", + "//third_party/triton:TritonLLVMIR", + "//third_party/triton:TritonPTX", + "//third_party/triton:TritonToTritonGPU", + "//third_party/triton:TritonTools", + "//third_party/triton:TritonTransforms", + "//third_party/triton:triton-opt", + "//third_party/triton:triton-translate", + ], +) diff --git a/triton.bzl b/triton.bzl new file mode 100644 index 000000000000..25627d030bcd --- /dev/null +++ b/triton.bzl @@ -0,0 +1,10 @@ +"""Bazel macros used by the triton build.""" + +def if_msvc(if_true, if_false = []): + return select({ + ":compiler_is_msvc": if_true, + "//conditions:default": if_false, + }) + +def if_not_msvc(a): + return if_msvc([], a)