Skip to content

Commit 8311403

Browse files
authored
Merge branch 'main' into liyang/import_syclcompat
2 parents b05da99 + 9baca2c commit 8311403

File tree

10 files changed

+52
-40
lines changed

10 files changed

+52
-40
lines changed

benchmarks/gemm/benchmark_runner.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -176,12 +176,12 @@ struct BenchmarkRunnerGemm {
176176

177177
using CollectiveMainloop = typename Gemm::GemmKernel::CollectiveMainloop;
178178
using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
179-
using ElementMma = CollectiveMainloop::TiledMma::ValTypeA;
179+
using ElementMma = typename CollectiveMainloop::TiledMma::ValTypeA;
180180

181-
using ElementScale = ScaleType<CollectiveMainloop>::type;
182-
using ElementZero = ZeroType<CollectiveMainloop>::type;
183-
using StrideS = ScaleStride<CollectiveMainloop>::type;
184-
using StrideZ = ZeroStride<CollectiveMainloop>::type;
181+
using ElementScale = typename ScaleType<CollectiveMainloop>::type;
182+
using ElementZero = typename ZeroType<CollectiveMainloop>::type;
183+
using StrideS = typename ScaleStride<CollectiveMainloop>::type;
184+
using StrideZ = typename ZeroStride<CollectiveMainloop>::type;
185185

186186
using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
187187
using ElementC = typename Gemm::ElementC;
@@ -454,7 +454,10 @@ struct BenchmarkRunnerGemm {
454454
}
455455

456456
bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
457-
auto [M, N, K, L] = problem_size;
457+
auto& M = cute::get<0>(problem_size);
458+
auto& N = cute::get<1>(problem_size);
459+
auto& K = cute::get<2>(problem_size);
460+
auto& L = cute::get<3>(problem_size);
458461

459462
TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N}));
460463
TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));

cmake/FindDPCPP.cmake

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ add_library(DPCPP::DPCPP INTERFACE IMPORTED)
4040

4141
set(DPCPP_FLAGS "-fsycl;")
4242
set(DPCPP_COMPILE_ONLY_FLAGS "")
43+
set(DPCPP_LINK_ONLY_FLAGS "")
4344

4445
if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "")
4546
list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};")
@@ -63,10 +64,10 @@ if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR
6364
"${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR
6465
"${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21")
6566
if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE)
66-
list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier")
67+
list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier")
6768
else()
68-
list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate")
69-
endif()
69+
list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate")
70+
endif()
7071
if(DPCPP_DISABLE_ITT_FOR_CUTLASS)
7172
list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code")
7273
endif()
@@ -76,14 +77,16 @@ endif()
7677
if(UNIX)
7778
set_target_properties(DPCPP::DPCPP PROPERTIES
7879
INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}"
79-
INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
80+
INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}"
8081
INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
8182
INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
8283
message(STATUS "DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
83-
message(STATUS "Using DPCPP flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}")
84+
message(STATUS "Using DPCPP compile flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}")
85+
message(STATUS "Using DPCPP link flags: ${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}")
8486
else()
8587
set_target_properties(DPCPP::DPCPP PROPERTIES
8688
INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}"
89+
INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}"
8790
INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
8891
INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
8992
endif()
@@ -105,7 +108,7 @@ function(add_sycl_to_target)
105108
)
106109
get_target_property(target_type ${CUTLASS_ADD_SYCL_TARGET} TYPE)
107110
if (NOT target_type STREQUAL "OBJECT_LIBRARY")
108-
target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
111+
target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS} ${DPCPP_LINK_ONLY_FLAGS})
109112
endif()
110113
endfunction()
111114

cmake/googletest.cmake

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@ FetchContent_Declare(
4444

4545
FetchContent_MakeAvailable(googletest)
4646

47+
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
48+
if (TARGET gtest)
49+
# Ignore unsupported warning flags on IntelLLVM
50+
target_compile_options(gtest PRIVATE -Wno-unknown-warning-option)
51+
# Show -Winline warnings, but don’t let them become errors
52+
target_compile_options(gtest PRIVATE -Wno-error=inline)
53+
endif()
54+
endif()
55+
4756
if (MSVC)
4857
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
49-
endif()
58+
endif()

examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,11 @@ struct ExampleRunner {
453453

454454
/// Initialize operands to be used in the GEMM and reference GEMM
455455
void initialize(Options const& options) {
456-
auto [M, N, K, L] = ProblemShapeType{options.m, options.n, options.k, options.l};
456+
auto problem_shape = ProblemShapeType{options.m, options.n, options.k, options.l};
457+
auto& M = cute::get<0>(problem_shape);
458+
auto& N = cute::get<1>(problem_shape);
459+
auto& K = cute::get<2>(problem_shape);
460+
auto& L = cute::get<3>(problem_shape);
457461

458462
auto zero_elements_packed_along_k = get<0>(StrideZero{});
459463
const int scale_k = cute::ceil_div(options.k, options.g);

examples/03_bmg_gemm_streamk/03_bmg_gemm_streamk.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ using namespace cute;
8686

8787
///////////////////////////////////////////////////////////////////////////////////////////////////
8888

89-
#define CUTLASS_SYCL_PROFILING_ENABLED
90-
9189
// Command line options parsing
9290
struct Options {
9391

@@ -303,9 +301,9 @@ struct ExampleRunner {
303301
if(!passed) return cutlass::Status::kErrorInternal;
304302

305303
if (options.iterations > 0) {
306-
GPU_Clock timer;
307304
float elapsed_time_seconds = 0.f;
308305
for (int i = 0; i < options.iterations; ++i) {
306+
GPU_Clock timer;
309307
gemm_op.initialize(arguments, workspace.get());
310308
timer.start();
311309
gemm_op.run();

examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ using ElementOutput = float; // <- data type of elements in output matr
9797

9898
///////////////////////////////////////////////////////////////////////////////////////////////////
9999

100-
#define CUTLASS_SYCL_PROFILING_ENABLED
101100

102101
// Command line options parsing
103102
struct Options {

include/cutlass/gemm/collective/xe_mma_mixed_input.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ struct CollectiveMma<
329329
}();
330330

331331
if constexpr (ModeScale) {
332-
return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, {}, args.group_size};
332+
return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {}, args.group_size};
333333
} else {
334334
auto ptr_Z = [&]() {
335335
if constexpr (sizeof_bits_v<NonVoidElementZero> < 8) {
@@ -354,7 +354,7 @@ struct CollectiveMma<
354354
}
355355
}();
356356

357-
return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, tiled_copy_zero, args.group_size};
357+
return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {tiled_copy_zero}, args.group_size};
358358
}
359359
}
360360
}

test/unit/cute/intel_xe/mma.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ void gemm_device(TA const *A, TB const *B, TC *C, uint32_t m, uint32_t n,
9090

9191
#define CUTLASS_ENABLE_DEBUG_PRINTS (0)
9292

93+
#undef LOG_THREAD
9394
#define LOG_THREAD (16)
9495

9596
#if CUTLASS_ENABLE_DEBUG_PRINTS

test/unit/cute/intel_xe/utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ using namespace cutlasscompat::experimental;
5353

5454
#define CUTLASS_ENABLE_DEBUG_PRINTS (0)
5555
#define LOG_GROUP (0)
56+
#undef LOG_THREAD
5657
#define LOG_THREAD (0)
5758

5859
template <class atype, class btype, class ctype>

tools/util/include/cutlass/util/reference/device/sycl_tensor_fill.h

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/***************************************************************************************************
2-
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
3-
* Copyright (C) 2025 Intel Corporation, All rights reserved.
2+
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
3+
* Copyright (c) 2025 Intel Corporation, All rights reserved.
44
* SPDX-License-Identifier: BSD-3-Clause
55
*
66
* Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,7 @@
4444
#include "cutlass/cutlass.h"
4545
#include "cutlass/complex.h"
4646
#include "cutlass/util/reference/device/tensor_foreach.h"
47+
#include "cutlass/util/reference/host/tensor_fill.h"
4748
#include "cutlass/tensor_view.h"
4849
#include "cutlass/layout/vector.h"
4950

@@ -176,25 +177,18 @@ void BlockFillRandomUniformCopyFromHost(
176177
size_t capacity,
177178
uint64_t seed, ///< seed for RNG
178179
typename RealType<Element>::Type max, ///< upper bound of distribution
179-
typename RealType<Element>::Type min ///< lower bound for distribution
180+
typename RealType<Element>::Type min, ///< lower bound for distribution
181+
int bits = -1 ///< If non-negative, specifies number of fractional bits that
182+
/// are not truncated to zero. Permits reducing precision of
183+
/// data.
180184
) {
181-
182-
if constexpr(std::is_same_v<Element, float> ||
183-
std::is_same_v<Element, cute::bfloat16_t> ||
184-
std::is_same_v<Element, cute::half_t>) {
185-
std::random_device rd;
186-
std::mt19937 gen(seed);
187-
std::uniform_real_distribution<float> dis(min, max);
188-
auto buff = std::vector<Element>(capacity);
189-
190-
for (size_t i = 0; i < capacity; ++i) {
191-
buff[i] = (Element)(dis(gen));
192-
}
193-
cutlasscompat::memcpy<Element>(ptr, buff.data(), capacity);
194-
cutlasscompat::wait();
195-
} else {
196-
assert(false && "Not supported dtype");
197-
}
185+
186+
auto buff = std::vector<Element>(capacity);
187+
188+
cutlass::reference::host::BlockFillRandomUniform(buff.data(), capacity, seed, max, min, bits);
189+
190+
cutlasscompat::memcpy<Element>(ptr, buff.data(), capacity);
191+
cutlasscompat::wait();
198192
}
199193

200194
///////////////////////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)