diff --git a/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake b/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake
index 1eb9efa8b..bbef1f2f6 100644
--- a/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake
+++ b/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake
@@ -57,8 +57,8 @@ macro(configure_tensorrt_python_plugin_header)
   find_file(
     trt_python_plugin_header
     NAMES NvInferPythonPlugin.h plugin.h
-    HINTS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/python/include/impl
-    PATHS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/python/include/impl
+    HINTS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/include/impl
+    PATHS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/include/impl
     REQUIRED
     NO_CMAKE_PATH NO_DEFAULT_PATH
     NO_CACHE
diff --git a/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake b/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake
index 7a1745f1e..394f4ea70 100644
--- a/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake
+++ b/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake
@@ -80,6 +80,10 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
     set(ARG_VERSION "10.12.0.36")
   endif()
 
+  if(ARG_VERSION VERSION_EQUAL "10.14")
+    set(ARG_VERSION "10.14.1.48")
+  endif()
+
   set(downloadable_versions
     "8.6.1.6"
     "9.0.1.4" "9.1.0.4" "9.2.0.5"
@@ -97,6 +101,7 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
     "10.8.0.43"
     "10.9.0.34"
     "10.12.0.36"
+    "10.14.1.48"
   )
 
   if(NOT ARG_VERSION IN_LIST downloadable_versions)
@@ -164,6 +169,8 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
   elseif(ARG_VERSION VERSION_GREATER 10.10 AND ARG_VERSION VERSION_LESS 10.13)
     set(TRT_CUDA_VERSION 12.9)
+  elseif(ARG_VERSION VERSION_GREATER 10.13)
+    set(TRT_CUDA_VERSION 13.0)
   endif()
 
   # Handle TRT 8 versions.
diff --git a/mlir-tensorrt/build_tools/docker/Dockerfile b/mlir-tensorrt/build_tools/docker/Dockerfile
index bb2996369..cf8e5df11 100644
--- a/mlir-tensorrt/build_tools/docker/Dockerfile
+++ b/mlir-tensorrt/build_tools/docker/Dockerfile
@@ -35,7 +35,7 @@ case "${LINUX_DISTRO}" in
     dnf install -y \
       which wget gcc zlib-devel bzip2 bzip2-devel readline-devel sqlite \
       sqlite-devel xz xz-devel libffi-devel curl git ncurses-devel \
-      openssh-clients libcudnn8-devel zip jq \
+      openssh-clients zip jq \
      protobuf-compiler autoconf automake libtool dnf-plugins-core cmake
     dnf config-manager --set-enabled powertools
     dnf -y install gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTEnums.td b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTEnums.td
index 0bb4e91fd..4d4fd144e 100644
--- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTEnums.td
+++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTEnums.td
@@ -378,4 +378,42 @@ def TensorRT_ScatterMode : TensorRT_I32EnumAttr<
 def TensorRT_ScatterModeAttr : TensorRT_EnumAttr<TensorRT_ScatterMode, "scatter_mode">{
 }
 
+def TensorRT_AttentionNormalizationOp : TensorRT_I32EnumAttr<
+  "AttentionNormalizationOp", "",
+  [
+    I32EnumAttrCase<"kNONE", 0>,
+    I32EnumAttrCase<"kSOFTMAX", 1>
+  ]>
+{
+  let cppNamespace = "::mlir::tensorrt";
+  let genSpecializedAttr = 0;
+}
+
+def TensorRT_AttentionNormalizationOpAttr : TensorRT_EnumAttr<TensorRT_AttentionNormalizationOp, "attention_normalization_op">{
+}
+
+def TensorRT_DataType : TensorRT_I32EnumAttr<
+  "DataType", "",
+  [
+    I32EnumAttrCase<"kFLOAT", 0>,
+    I32EnumAttrCase<"kHALF", 1>,
+    I32EnumAttrCase<"kINT8", 2>,
+    I32EnumAttrCase<"kINT32", 3>,
+    I32EnumAttrCase<"kBOOL", 4>,
+    I32EnumAttrCase<"kUINT8", 5>,
+    I32EnumAttrCase<"kFP8", 6>,
+    I32EnumAttrCase<"kBF16", 7>,
+    I32EnumAttrCase<"kINT64", 8>,
+    I32EnumAttrCase<"kINT4", 9>,
+    I32EnumAttrCase<"kFP4", 10>,
+    I32EnumAttrCase<"kE8M0", 11>
+  ]>
+{
+  let cppNamespace = "::mlir::tensorrt";
+  let genSpecializedAttr = 0;
+}
+
+def TensorRT_DataTypeAttr : TensorRT_EnumAttr<TensorRT_DataType, "data_type">{
+}
+
 #endif // MLIR_TENSORRT_DIALECT_TENSORRT_IR_TENSORRTENUMS
diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td
index e11ef94e6..ca24b3c78 100644
--- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td
+++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td
@@ -4432,4 +4432,170 @@ def TensorRT_ScatterElementsOp : TensorRT_Op<"scatter_elements",
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// AttentionOp
+//===----------------------------------------------------------------------===//
+
+def TensorRT_AttentionOp : TensorRT_Op<"attention",
+    [Pure, AttrSizedOperandSegments, TensorRTInferTensorResultTypes,
+     AllElementTypesMatch<["query", "key", "value"]>,
+     AllRanksMatch<["query", "key", "value"]>]>{
+  let summary = "TensorRT attention (IAttention) operation";
+  let description = [{
+    The `tensorrt.attention` operation implements a fused attention mechanism
+    that consumes query, key, and value tensors. The operation implicitly includes
+    two matrix multiplication layers (BMM1 and BMM2) and a normalization operation
+    (typically softmax).
+
+    By default, TensorRT attempts to execute the operation as a single fused kernel
+    for efficiency.
+    If no fused kernel is available, the operation can optionally be decomposed
+    into multiple kernels by setting `decomposable` to true.
+
+    #### Architecture:
+
+    ```
+    Query       Key      Value    Mask (optional)   NormalizationQuantizeScale (optional)
+      |          |         |            |                          |
+      |      Transpose     |            |                          |
+      |          |         |            |                          |
+      ----BMM1----         |            |                          |
+           |               |            |                          |
+           *-----------------------------                          |
+           |               |                                       |
+     Normalization         |                                       |
+           |               |                                       |
+           *--------------------------------------------------------
+           |               |
+           ------BMM2-------
+                  |
+                Output
+    ```
+
+    #### Inputs:
+
+    - Query: tensor of type f32, f16, or bf16 with shape
+      [batchSize, numHeadsQuery, sequenceLengthQuery, dimHead]
+    - Key: tensor of type f32, f16, or bf16 with shape
+      [batchSize, numHeadsKeyValue, sequenceLengthKeyValue, dimHead]
+    - Value: tensor of type f32, f16, or bf16 with shape
+      [batchSize, numHeadsKeyValue, sequenceLengthKeyValue, dimHead]
+    - Mask (optional): tensor of type i1 or the same type as the BMM1 output, with shape
+      [batchSize, numHeadsQuery, sequenceLengthQuery, sequenceLengthKeyValue],
+      where batchSize and numHeadsQuery are broadcastable. For an i1 mask, true
+      indicates that the position is allowed to attend. For other types, mask values
+      are added to the BMM1 output.
+    - NormalizationQuantizeScale (optional): tensor of type f32, f16, or bf16
+      with rank 0 or 1, used for quantizing the normalization output.
+
+    #### Attributes:
+
+    - normalization_operation: The normalization operation to use (default: kSOFTMAX).
+    - causal: Whether to apply causal masking (default: false). Cannot be combined
+      with the mask input.
+    - decomposable: Whether the operation may be decomposed into multiple kernels
+      (default: false).
+    - normalization_quantize_to_type: Optional target type for quantizing the
+      normalization output. When specified, it must be kFP8 or kINT8 and the
+      normalization_quantize_scale input must be provided.
+
+    #### Constraints:
+
+    - Query, key, and value tensors must be rank 4 with shape
+      [batchSize, numHeads, sequenceLength, dimHead].
+    - Query, key, and value must have the same element type (f32, f16, or bf16).
+    - If normalization_quantize_to_type is specified:
+      * It must be kFP8 or kINT8.
+      * The normalization_quantize_scale input must be provided.
+    - The mask input and causal=true cannot be used simultaneously.
+
+    #### Examples:
+
+    Basic attention:
+    ```mlir
+    %output = tensorrt.attention ins(%query, %key, %value :
+      tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>)
+      -> tensor<2x8x128x64xf16>
+    ```
+
+    Causal attention:
+    ```mlir
+    %output_causal = tensorrt.attention {causal = true} ins(%query, %key, %value :
+      tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>)
+      -> tensor<2x8x128x64xf16>
+    ```
+
+    Attention with quantization:
+    ```mlir
+    %scale = tensorrt.constant dense<1.0> : tensor<f32>
+    %output_quant = tensorrt.attention {
+        normalization_quantize_to_type = #tensorrt.data_type<kFP8>
+      } ins(%query, %key, %value,
+        normalization_quantize_scale = %scale :
+        tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>,
+        tensor<2x8x128x64xf16>, tensor<f32>)
+      -> tensor<2x8x128x64xf16>
+    ```
+  }];
+
+  let arguments = (ins
+    TensorRT_RankedTensorOf<[F16, BF16, F32]>:$query,
+    TensorRT_RankedTensorOf<[F16, BF16, F32]>:$key,
+    TensorRT_RankedTensorOf<[F16, BF16, F32]>:$value,
+    Optional<TensorRT_RankedTensorOf<[I1, F16, BF16, F32]>>:$mask,
+    Optional<TensorRT_RankedTensorOf<[F16, BF16, F32]>>:$normalization_quantize_scale,
+    DefaultValuedAttr<TensorRT_AttentionNormalizationOpAttr,
+      "AttentionNormalizationOp::kSOFTMAX">:$normalization_operation,
+    DefaultValuedAttr<BoolAttr, "false">:$causal,
+    DefaultValuedAttr<BoolAttr, "false">:$decomposable,
+    OptionalAttr<TensorRT_DataTypeAttr>:$normalization_quantize_to_type
+  );
+
+  let results = (outs TensorRT_RankedTensorOf<[F16, BF16, F32]>:$result);
+
+  let assemblyFormat = [{
+    attr-dict `ins` `(` $query `,` $key `,` $value
+    (`,` `mask` `=` $mask^)?
+    (`,` `normalization_quantize_scale` `=` $normalization_quantize_scale^)?
+    `:` type($query) `,` type($key) `,` type($value)
+    (`,` type($mask)^)?
+    (`,` type($normalization_quantize_scale)^)?
+    `)` `->` type($result)
+  }];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    /// Returns true if the op is valid for the given TensorRT major version.
+    bool isValidForTensorRTVersion(int64_t trtMajorVersion);
+  }] # baseClassDeclaration;
+
+  let trtLayerAdd = [{
+    nvinfer1::IAttention *layer = $net->addAttention(*$query, *$key, *$value, *$normalization_operation, $causal);
+    if (!layer)
+      return failure();
+
+    if ($mask)
+      layer->setMask(*$mask);
+
+    layer->setDecomposable($decomposable);
+
+    if ($normalization_quantize_scale) {
+      layer->setNormalizationQuantizeScale(*$normalization_quantize_scale);
+    }
+
+    if ($normalization_quantize_to_type) {
+      auto convertedDataType = ::mlir::tensorrt::convertDataTypeToNvInferEnum(*$normalization_quantize_to_type);
+      if (!convertedDataType)
+        return emitError($op.getLoc()) << "failed to convert DataType to nvinfer enum";
+      layer->setNormalizationQuantizeToType(*convertedDataType);
+    }
+
+    if (!$e.isStronglyTyped()) {
+      // Check that the result element type maps to a TensorRT data type when the
+      // network is not strongly typed.
+      FailureOr<nvinfer1::DataType> outputTrtType = getNvInferDataType($op.getLoc(),
+        $op.getType().getElementType());
+      if (failed(outputTrtType))
+        return failure();
+    }
+
+    $results.push_back(layer->getOutput(0));
+    // TODO: nvinfer1::IAttention does not have a setMetadata API in TensorRT 10.14.
+    // layer->setMetadata($op);
+  }];
+}
+
 #endif // MLIR_TENSORRT_DIALECT_TENSORRT_IR_TENSORRTOPS_TD
diff --git a/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRTVersionCompatibility.cpp b/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRTVersionCompatibility.cpp
index 8b4413952..1b85e4787 100644
--- a/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRTVersionCompatibility.cpp
+++ b/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRTVersionCompatibility.cpp
@@ -914,3 +914,16 @@ bool tensorrt::ScatterElementsOp::isValidForTensorRTVersion(
   return isValidForTensorRTVersionScatterOpImpl(
       trtMajorVersion, dataElementType, indicesElementType);
 }
+
+//===----------------------------------------------------------------------===//
+// AttentionOp
+//===----------------------------------------------------------------------===//
+
+bool tensorrt::AttentionOp::isValidForTensorRTVersion(
+    int64_t trtMajorVersion) {
+  // IAttention requires TensorRT >= 10.14; only the major version is available
+  if (trtMajorVersion < 10)
+    return false;
+
+  return true;
+}
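
Usage note: the examples embedded in the op description above do not exercise the optional mask operand. The following is a minimal sketch derived from the assembly format in this diff, assuming illustrative shapes (batch 2, 8 query heads, query and key/value lengths of 128, head dimension 64); the i1 mask has shape [batchSize, numHeadsQuery, sequenceLengthQuery, sequenceLengthKeyValue], and true marks positions that may attend.

```mlir
// Illustrative only: tensor shapes are assumed, not taken from a real model.
%output_masked = tensorrt.attention ins(%query, %key, %value, mask = %mask :
  tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>, tensor<2x8x128x64xf16>,
  tensor<2x8x128x128xi1>)
  -> tensor<2x8x128x64xf16>
```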