Skip to content

Commit 7feb377

Browse files
anamikac-intelAnamika Chatterjee
andauthored
Use newer version of mma_atom and copy_atom in 00_bmg_gemm (#540)
Modify 00_bmg_gemm to include new mma and copy atoms (#477). 00_bmg_gemm combines two parts: mma and epilogue. To add new atom changes, we need to update both parts since they currently use old atoms. As starting we will: > Keep CollectiveEpilogue unchanged for now > Only modify CollectiveMma first Old Atom: Problem Size: 5120x4096x4096x1 Cutlass GEMM Performance: [96.448]TFlop/s (1.7813)ms New Atom: Problem Size: 5120x4096x4096x1 Cutlass GEMM Performance: [97.259]TFlop/s (1.7664)ms Also depend on new copy_c/copy_d apis for load/store #572 --------- Co-authored-by: Anamika Chatterjee <[email protected]>
1 parent 5880275 commit 7feb377

File tree

10 files changed

+890
-72
lines changed

10 files changed

+890
-72
lines changed

examples/00_bmg_gemm/00_bmg_gemm.cpp

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -277,9 +277,7 @@ struct ExampleRunner {
277277
bool passed = verify(problem_size, options.alpha, options.beta);
278278
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
279279

280-
if(!passed) return cutlass::Status::kErrorInternal;
281-
282-
if (options.iterations > 0) {
280+
if (passed && options.iterations > 0) {
283281
GPU_Clock timer;
284282
timer.start();
285283
for (int i = 0; i < options.iterations; ++i) {
@@ -345,30 +343,34 @@ int main(int argc, const char** argv)
345343
using LayoutC = cutlass::layout::RowMajor;
346344
using LayoutD = cutlass::layout::RowMajor;
347345

348-
// The 2D block copy operations used for the A and B matrices
349-
using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
350-
using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
346+
// [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects
347+
// appropriate 2D block copy operations for matrices A and B. Alternatively, you can
348+
// explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI
349+
// (applicable only to matrix B), or XE_LOAD_2D_TRANSPOSE.
350+
// Refer https://github.com/intel/sycl-tla/blob/main/media/docs/cpp/xe_rearchitecture.md
351+
using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>;
352+
using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>;
351353

352354
// Workgroup-level tile
353355
using TileShape = Shape<_256, _256, _32>;
354-
356+
355357
// A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional
356358
// hardware (sub-groups for Intel BMG) and iterations by each sub-group.
357359
//
358-
// The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom
359-
// (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The
360-
// TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
360+
// The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses
361+
// the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with
362+
//float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1).
363+
// The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
361364
// single contiguous chunk of the work-group TileShape. For this configuration, this implies that
362365
// each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
363366
// 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
364367
// performance reasons.
365-
using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
366-
typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
367-
Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
368+
using TiledMma = typename TiledMMAHelper<MMA_Atom<XE_DPAS_TT<8, float, cute::bfloat16_t>>, Layout<TileShape>, Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
368369

369370
// For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
370371
constexpr int PipelineStages = 2;
371-
using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
372+
// For older version of copy/mma atom, use cutlass::gemm::MainloopIntelXeXMX16 as dispatch policy
373+
using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged<PipelineStages>;
372374
using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
373375

374376
// This is the 'default' epilogue operation (Linear Combination) which performs everything in:

0 commit comments

Comments
 (0)