-
Notifications
You must be signed in to change notification settings - Fork 100
Create DPAS Analysis infrastructure #1558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
d4310c6
Create DPAS Analysis infrastructure
etiotto b336b28
Merge branch 'llvm-target' into etiotto/dpas_analysis
etiotto f0aa6a4
Address code review comments
etiotto 36226dd
Address code review comments
etiotto 61cc1df
Address code review comments
etiotto d8e2ef9
Fix pre_commit
etiotto 1c1734f
Fix build
etiotto 2e11c47
DPAS Analysis
etiotto e38bda1
Address code review comments
etiotto 3b6744c
Address code review comments
etiotto 8808be3
Merge branch 'llvm-target' into etiotto/dpas_analysis
etiotto File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| #ifndef TRITON_INTEL_ANALYSIS_DPAS_H | ||
| #define TRITON_INTEL_ANALYSIS_DPAS_H | ||
|
|
||
| #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" | ||
| #include "mlir/Interfaces/FunctionInterfaces.h" | ||
| #include "triton/Dialect/Triton/IR/Dialect.h" | ||
|
|
||
| namespace mlir::triton::gpu::intel { | ||
|
|
||
| //===----------------------------------------------------------------------===// | ||
| // Intel DPAS Analysis | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| class DPASAnalysis { | ||
| public: | ||
| DPASAnalysis(FunctionOpInterface func); | ||
|
|
||
| enum class Result { True, False, Maybe }; | ||
|
|
||
| enum class DPASEngineType : uint8_t { | ||
| // data types for operands D,C,A,B. | ||
| FP32_FP32_FP16_FP16 = 0, // default | ||
| FP32_FP32_BF16_BF16, | ||
| FP32_FP32_TF32_TF32, | ||
| FP16_FP16_FP16_FP16, | ||
| BF16_BF16_BF16_BF16, | ||
| U32_U32_U8_U8, | ||
| S32_S32_S8_S8, | ||
| NOT_APPLICABLE | ||
| }; | ||
|
|
||
| /// Analyze the dpasMap and return: | ||
| /// - Result::True if the function associated with this analysis contains | ||
| /// DotOp operations that can be lowered to DPAS instructions, | ||
| /// - Result::False if it contains DotOp operations that cannot be lowered | ||
| /// to DPAS instructions, and | ||
| /// - Result::Maybe if it contains DotOp operations that could be lowered to | ||
| /// DPAS instructions if the module was executed with a different subgroup | ||
| /// (aka threads per warp) size. | ||
| Result canUseDPAS() const; | ||
|
|
||
| /// Return the threads per warp (aka subgroup size) supported by the DPAS | ||
| /// instruction on the given device architecture. | ||
| static unsigned supportedThreadsPerWarp(DeviceArch arch); | ||
|
|
||
| /// Given a DotOp operation, return the DPAS engine type. | ||
| static DPASEngineType getDPASType(DotOp op); | ||
|
|
||
| private: | ||
| /// The module enclosing the function associated with the analysis. | ||
| mlir::ModuleOp mod; | ||
|
|
||
| /// The map of DotOp to DPAS type. | ||
| std::map<DotOp, DPASEngineType> dpasMap; | ||
| }; | ||
|
|
||
| } // namespace mlir::triton::gpu::intel | ||
|
|
||
| #endif // TRITON_INTEL_ANALYSIS_DPAS_H | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,12 +15,6 @@ | |
|
|
||
| namespace mlir::triton::gpu::intel { | ||
|
|
||
| /// Target device architecture identifiers; UNKNOWN is the zero value. | ||
| enum class DeviceArch { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: moved to |
||
| UNKNOWN = 0, | ||
| ATS, | ||
| PVC, | ||
| }; | ||
|
|
||
| #define GEN_PASS_DECL | ||
| #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc" | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| # TritonIntelAnalysis library: built from DPAS.cpp, depends on the Triton | ||
| # TableGen targets listed below and publicly links TritonIR. | ||
| add_triton_library(TritonIntelAnalysis | ||
| DPAS.cpp | ||
|
|
||
| DEPENDS | ||
| TritonTableGen | ||
| TritonGPUAttrDefsIncGen | ||
|
|
||
| LINK_LIBS PUBLIC | ||
| TritonIR | ||
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| #include "intel/include/Analysis/DPAS.h" | ||
| #include "triton/Dialect/TritonGPU/IR/Dialect.h" | ||
|
|
||
| using namespace mlir; | ||
|
|
||
| namespace mlir::triton::gpu::intel { | ||
|
|
||
| DPASAnalysis::DPASAnalysis(FunctionOpInterface func) | ||
| : mod(func->getParentOfType<mlir::ModuleOp>()) { | ||
| DeviceArch arch = getDeviceArch(mod); | ||
|
|
||
|
etiotto marked this conversation as resolved.
|
||
| // Populate the DPAS map. | ||
| func.walk([&](DotOp dotOp) { | ||
| DPASEngineType dpasEngineType = | ||
| (mod->hasAttr("triton_gpu.is_lts") || arch == DeviceArch::UNKNOWN) | ||
|
etiotto marked this conversation as resolved.
etiotto marked this conversation as resolved.
etiotto marked this conversation as resolved.
|
||
| ? DPASEngineType::NOT_APPLICABLE | ||
| : DPASAnalysis::getDPASType(dotOp); | ||
| dpasMap[dotOp] = dpasEngineType; | ||
|
|
||
| // Only PVC supports TF32. | ||
| if (dpasEngineType == DPASEngineType::FP32_FP32_TF32_TF32) { | ||
| if (arch != DeviceArch::PVC || | ||
| dotOp.getInputPrecision() != InputPrecision::TF32) | ||
| dpasMap[dotOp] = DPASEngineType::NOT_APPLICABLE; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| DPASAnalysis::Result DPASAnalysis::canUseDPAS() const { | ||
| if (dpasMap.empty()) | ||
| return Result::False; | ||
|
|
||
| // Ensure all dot operations can be lowered to DPAS instructions. | ||
| if (llvm::any_of(dpasMap, [](const auto &entry) { | ||
| return entry.second == DPASEngineType::NOT_APPLICABLE; | ||
| })) | ||
| return Result::False; | ||
|
|
||
| // Verify whether the module has the correct number of threads per warp. | ||
| // Note: if the module doesn't have the warp size attribute, return | ||
| // Result::Maybe to allow the caller to set warp size. | ||
| Attribute threadsPerWarpAttr = | ||
| mod->getDiscardableAttr(TritonGPUDialect::getThreadsPerWarpAttrName()); | ||
| if (!threadsPerWarpAttr) | ||
| return Result::Maybe; | ||
|
|
||
| unsigned threadsPerWarp = cast<IntegerAttr>(threadsPerWarpAttr).getInt(); | ||
| DeviceArch arch = getDeviceArch(mod); | ||
| if (threadsPerWarp == supportedThreadsPerWarp(arch)) | ||
| return Result::True; | ||
|
|
||
| return Result::False; | ||
| } | ||
|
|
||
| unsigned DPASAnalysis::supportedThreadsPerWarp(DeviceArch arch) { | ||
|
etiotto marked this conversation as resolved.
|
||
| switch (arch) { | ||
| case DeviceArch::PVC: | ||
| return 16; | ||
| case DeviceArch::ATS: | ||
| return 8; | ||
| default: | ||
| llvm_unreachable("Unexpected target architecture"); | ||
| } | ||
| } | ||
|
|
||
| DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(DotOp op) { | ||
| // d = a * b + c | ||
| auto aTy = cast<RankedTensorType>(op.getA().getType()); | ||
| auto bTy = cast<RankedTensorType>(op.getB().getType()); | ||
| auto cTy = cast<RankedTensorType>(op.getC().getType()); | ||
| auto dTy = cast<RankedTensorType>(op.getD().getType()); | ||
| Type aElemTy = aTy.getElementType(); | ||
| Type bElemTy = bTy.getElementType(); | ||
| Type cElemTy = cTy.getElementType(); | ||
| Type dElemTy = dTy.getElementType(); | ||
|
|
||
| assert(cElemTy == dElemTy && "Unexpected element type mismatch"); | ||
|
|
||
| if (aElemTy != bElemTy) | ||
| return DPASEngineType::NOT_APPLICABLE; | ||
|
|
||
| if (dElemTy.isIntOrIndex()) { | ||
| if (dElemTy.getIntOrFloatBitWidth() == 32 && | ||
| aElemTy.getIntOrFloatBitWidth() == 8) | ||
| return dElemTy.isSignedInteger() ? DPASEngineType::S32_S32_S8_S8 | ||
| : DPASEngineType::U32_U32_U8_U8; | ||
| return DPASEngineType::NOT_APPLICABLE; | ||
| } | ||
|
|
||
| if (isa<FloatType>(dElemTy)) { | ||
| if (dElemTy.isF32()) { | ||
| if (aElemTy.isF16()) | ||
| return DPASEngineType::FP32_FP32_FP16_FP16; | ||
| if (aElemTy.isBF16()) | ||
| return DPASEngineType::FP32_FP32_BF16_BF16; | ||
| if (aElemTy.isF32() && op.getInputPrecision() == InputPrecision::TF32) | ||
| return DPASEngineType::FP32_FP32_TF32_TF32; | ||
| // For FP8XFP8->FP32, upcast to FP16 | ||
| if (aElemTy.isFloat8E5M2()) | ||
| return DPASEngineType::FP32_FP32_FP16_FP16; | ||
| if (aElemTy.isFloat8E4M3FNUZ()) | ||
| return DPASEngineType::FP32_FP32_FP16_FP16; | ||
| } else if (dElemTy.isF16()) { | ||
| if (aElemTy.isF16()) | ||
| return DPASEngineType::FP16_FP16_FP16_FP16; | ||
| } else if (dElemTy.isBF16()) { | ||
| if (aElemTy.isBF16()) | ||
| return DPASEngineType::BF16_BF16_BF16_BF16; | ||
| } | ||
| } | ||
|
|
||
| return DPASEngineType::NOT_APPLICABLE; | ||
| } | ||
|
|
||
| } // namespace mlir::triton::gpu::intel | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| # Subdirectories added to the build, in the order listed. | ||
| add_subdirectory(Analysis) | ||
| add_subdirectory(Dialect) | ||
| add_subdirectory(GPUToTritonGEN) | ||
| add_subdirectory(Target) | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.