Skip to content

Commit 9acfcd5

Browse files
authored
Support for CUTLASS Library generation / Ops / Xe Arch (#578)
## Intel Xe Architecture Support for CUTLASS Library generation **Feature:** Add Intel Xe12/Xe20 architecture support with operation generation and Python bindings. **Use Case:** Enable kernel generation for the PyTorch Inductor path and ML frameworks on Intel Arc/PVC GPUs. **Key Changes:** - **Architecture Support:** Added Xe12 (PVC) and Xe20 (BMG) with compute capability 12-50 - **Operations:** FP16, BF16, FP8 (E4M3/E5M2), INT8 GEMM kernels with multiple tile sizes (256×256, 128×256, etc.) - **Build Flags:** New CMake option `-DCUTLASS_LIBRARY_GENERATOR_ARCHS="20"` for Intel GPU targets - **Python Integration:** CMake-based shared library (`examples/11_xe20_cutlass_library/`) + ctypes bindings - **Generator:** Extended `python/cutlass_library/generator.py` with `GenerateIntelXe()` functions - **Examples:** Python test scripts with performance benchmarking **Testing:** ✅ Tested BF16 generated kernels, Examples, Documentation **Note:** These changes do not make use of the new APIs (or modified collectives); that must be a separate feature / refactoring effort. **ToDo:** - [x] Build Failures - [ ] Benchmark tests for comprehensive performance analysis - [ ] Testing kernels beyond BF16 (FP16, FP8, INT8) - [ ] Optimizing generated kernels with tile sizes - [ ] Modify CMake to avoid explicitly linking with libsycl.so **Type:** Feature | **Tested On:** Xe20 ✅
1 parent a0172fd commit 9acfcd5

File tree

29 files changed

+1930
-41
lines changed

29 files changed

+1930
-41
lines changed

cmake/FindDPCPP.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ add_library(DPCPP::DPCPP INTERFACE IMPORTED)
4141
set(DPCPP_FLAGS "-fsycl;")
4242
if(DPCPP_HOST_COMPILER)
4343
list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
44-
list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
44+
list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning $<$<BOOL:$<TARGET_PROPERTY:POSITION_INDEPENDENT_CODE>>:-fPIC> -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
4545
endif()
4646
set(DPCPP_COMPILE_ONLY_FLAGS "")
4747
set(DPCPP_LINK_ONLY_FLAGS "")
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright (C) 2025 Intel Corporation, All rights reserved.
2+
# SPDX-License-Identifier: BSD-3-Clause
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# 1. Redistributions of source code must retain the above copyright notice, this
8+
# list of conditions and the following disclaimer.
9+
#
10+
# 2. Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# 3. Neither the name of the copyright holder nor the names of its
15+
# contributors may be used to endorse or promote products derived from
16+
# this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28+
29+
# Example 11: XE20 CUTLASS Library BF16 GEMM
30+
# This example creates a shared library (.so) that exports CUTLASS BF16 GEMM
31+
# functionality for use with Python via ctypes.
32+
33+
# Create shared library for Python integration
34+
add_library(xe20_cutlass_library_bf16 SHARED
35+
xe_20_cutlass_library_b16.cpp
36+
)
37+
38+
# Set library properties (this produces the shared library that the Python example loads via ctypes)
39+
set_target_properties(xe20_cutlass_library_bf16 PROPERTIES
40+
CXX_STANDARD 17
41+
CXX_STANDARD_REQUIRED ON
42+
VERSION 1.0
43+
SOVERSION 1
44+
OUTPUT_NAME "xe20_cutlass_library_bf16"
45+
POSITION_INDEPENDENT_CODE ON
46+
)
47+
48+
# Include directories
49+
target_include_directories(xe20_cutlass_library_bf16 PRIVATE
50+
${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR}
51+
${CUTLASS_EXAMPLES_UTILS_DIR}
52+
${CUTLASS_APPLICATIONS_DIR}
53+
)
54+
55+
# Link libraries
56+
target_link_libraries(xe20_cutlass_library_bf16 PRIVATE
57+
CUTLASS
58+
cutlass_tools_util_includes
59+
)
60+
61+
# Add compile definitions
62+
target_compile_definitions(xe20_cutlass_library_bf16 PRIVATE
63+
CUTLASS_ENABLE_SYCL=1
64+
SYCL_INTEL_TARGET=1
65+
DPCPP_SYCL_TARGET=intel_gpu_bmg_g21
66+
)
67+
68+
# Add Intel-specific SYCL link flags for XE20 optimization
69+
if(CUTLASS_ENABLE_SYCL AND SYCL_INTEL_TARGET)
70+
target_link_options(xe20_cutlass_library_bf16 PRIVATE
71+
-Xspirv-translator
72+
-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate
73+
)
74+
75+
add_sycl_to_target(TARGET xe20_cutlass_library_bf16)
76+
add_onemkl_to_target(TARGET xe20_cutlass_library_bf16)
77+
endif()
78+
79+
# Link against CUTLASS XE20 GEMM library if available
80+
if(TARGET cutlass_gemm_xe20_gemm)
81+
target_link_libraries(xe20_cutlass_library_bf16 PRIVATE cutlass_gemm_xe20_gemm)
82+
endif()
83+
84+
# Install the shared library
85+
install(TARGETS xe20_cutlass_library_bf16
86+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
87+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
88+
)
89+
90+
# Add to examples target
91+
add_dependencies(cutlass_examples xe20_cutlass_library_bf16)
92+
93+
# Custom target for building just this library
94+
add_custom_target(xe20_cutlass_library
95+
DEPENDS xe20_cutlass_library_bf16
96+
COMMENT "Building XE20 CUTLASS Library BF16 GEMM Shared Library (.so)"
97+
)
98+
99+
message(STATUS "Added shared library xe20_cutlass_library_bf16 for Python integration")
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
/***************************************************************************************************
2+
* Copyright (C) 2025 Intel Corporation, All rights reserved.
3+
* SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice, this
9+
* list of conditions and the following disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright notice,
12+
* this list of conditions and the following disclaimer in the documentation
13+
* and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*
30+
***************************************************************************************************/
31+
32+
33+
34+
#include <exception>
35+
#include <iostream>
36+
#include <memory>
37+
#include <random>
38+
#include <vector>
39+
40+
#include "cute/tensor.hpp"
41+
#include "cutlass/cutlass.h"
42+
#include "cutlass/numeric_types.h"
43+
#include "cutlass/tensor_ref.h"
44+
#include "cutlass/util/host_tensor.h"
45+
#include "cutlass/util/reference/host/tensor_fill.h"
46+
#include "cutlass/util/reference/device/tensor_fill.h"
47+
#include "cutlass/util/device_memory.h"
48+
49+
#include "cutlass/gemm/gemm.h"
50+
#include "cutlass/gemm/device/gemm_universal.h"
51+
#include "cutlass/gemm/device/gemm_universal_adapter.h"
52+
#include "cutlass/gemm/kernel/gemm_universal.hpp"
53+
//#include "cutlass/gemm/device/gemm_sparse.h"
54+
#include "cutlass/gemm/collective/collective_builder.hpp"
55+
#include "cutlass/epilogue/collective/collective_builder.hpp"
56+
#include "cutlass/epilogue/collective/default_epilogue.hpp"
57+
#include "cutlass/epilogue/thread/linear_combination.h"
58+
#include "cutlass/epilogue/thread/activation.h"
59+
#include "cutlass/gemm/dispatch_policy.hpp"
60+
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
61+
#include "cutlass/tensor_ref.h"
62+
#include "cutlass/util/distribution.h"
63+
#include "cutlass/util/packed_stride.hpp"
64+
#include "cutlass/util/tensor_view_io.h"
65+
66+
67+
// We compile all models with -fvisibility=hidden. Any symbols that need to be
68+
// exposed in the final shared library must be declared with PT_EXPORT to make
69+
// them visible.
70+
#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
71+
#define PT_EXPORT __attribute__((__visibility__("default")))
72+
#else
73+
#ifdef _WIN32
74+
#define PT_EXPORT __declspec(dllexport)
75+
#else
76+
#define PT_EXPORT
77+
#endif
78+
#endif
79+
80+
using namespace cute;
81+
#define CUTLASS_CHECK(status) \
82+
{ \
83+
cutlass::Status error = status; \
84+
if (error != cutlass::Status::kSuccess) { \
85+
auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \
86+
cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \
87+
throw std::runtime_error(msg); \
88+
} \
89+
}
90+
91+
// Used as pass-through functor in EVT just for type casting / rounding
92+
template <typename T>
93+
struct identity_op {
94+
CUTLASS_HOST_DEVICE
95+
T operator()(T val) const { return val; }
96+
};
97+
98+
99+
100+
using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue =
101+
typename cutlass::epilogue::collective::CollectiveBuilder<
102+
cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
103+
cute::Shape<cute::_256, cute::_256, cute::_32>,
104+
cute::Shape<cute::_1, cute::_1, cute::_1>,
105+
cutlass::epilogue::collective::EpilogueTileAuto,
106+
float, float,
107+
float, cutlass::layout::RowMajor, 4,
108+
float, cutlass::layout::RowMajor, 4,
109+
cutlass::epilogue::collective::EpilogueScheduleAuto,
110+
cutlass::epilogue::fusion::LinearCombination<
111+
float,
112+
float,
113+
float,
114+
float
115+
>
116+
>::CollectiveOp;
117+
118+
using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop =
119+
typename cutlass::gemm::collective::CollectiveBuilder<
120+
cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
121+
cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
122+
cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
123+
float,
124+
cute::Shape<cute::_256, cute::_256, cute::_32>,
125+
cute::Shape<cute::_1, cute::_1, cute::_1>,
126+
cutlass::gemm::collective::StageCountAuto,
127+
cutlass::gemm::collective::KernelScheduleAuto
128+
>::CollectiveOp;
129+
130+
// Gemm operator cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8
131+
using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base = cutlass::gemm::kernel::GemmUniversal<
132+
cute::Shape<int,int,int,int>,
133+
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_mainloop,
134+
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_epilogue,
135+
cutlass::gemm::PersistentScheduler>;
136+
137+
// Define named type
138+
struct cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8 :
139+
public cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_base { };
140+
141+
142+
using cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type = cutlass::gemm::device::GemmUniversalAdapter<cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8>;
143+
144+
// When workspace_size is not a nullptr, populates requested workspace_size and returns.
145+
// Otherwise, computes the Gemm kernel using the given workspace ptr.
146+
extern "C" {
147+
PT_EXPORT int sycl_tla_gemm_xe20_bf16(const uint16_t* X, const uint16_t* W, uint16_t* Y, const int M, const int N, const int K, const int B, const int lda, const int ldb, const int ldc, const int ldd, const int X_offset, const int W_offset, const int Y_offset, const uint8_t swizzle, size_t* workspace_size, uint8_t* workspace, sycl::queue* stream) {
148+
try {
149+
using ElementComputeEpilogue = cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::ElementAccumulator;
150+
using coord_t = cutlass::gemm::GemmCoord::Index;
151+
static cutlass::KernelHardwareInfo hw_info;
152+
if (hw_info.sm_count == 0) {
153+
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(0);
154+
CUTLASS_TRACE_HOST("Query result for SM count per device: " << hw_info.sm_count);
155+
}
156+
157+
// Initialize GemmUniversal3xInstance arguments using constructor
158+
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type::Arguments arguments{
159+
cutlass::gemm::GemmUniversalMode::kGemm, // GemmUniversalMode mode
160+
{
161+
static_cast<coord_t>(M),
162+
static_cast<coord_t>(N),
163+
static_cast<coord_t>(K),
164+
static_cast<coord_t>(B)
165+
}, // ProblemShape problem_shape
166+
{
167+
(cutlass::bfloat16_t*)(X + X_offset), // ElementA const* ptr_A
168+
cute::make_tuple(cute::Int<1>{}, int64_t(lda), int64_t(0)), // StrideA dA (column-major: stride_m=1, stride_n=lda, batch=0)
169+
(cutlass::bfloat16_t*)(W + W_offset), // ElementB const* ptr_B
170+
cute::make_tuple(int64_t(ldb), cute::Int<1>{}, int64_t(0)), // StrideB dB (column-major: stride_m=ldb, stride_n=1, batch=0)
171+
}, // MainloopArguments mainloop
172+
173+
// see https://tinyurl.com/4rk89z48
174+
{
175+
{ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, // thread, typename FusionCallbacks::Arguments ( EVT ) or ThreadEpilogueOp::Params (non-EVT )
176+
nullptr, // ElementC const* ptr_C
177+
cute::make_tuple(int64_t(0), cute::Int<1>{}, int64_t(0)), // StrideC dC (row-major: stride_m, stride_n=1, batch=0)
178+
(float*)(Y + Y_offset), // ElementD ptr_D (output is float, not bfloat16)
179+
cute::make_tuple(int64_t(ldd), cute::Int<1>{}, int64_t(0)), // StrideD dD (row-major: stride_m=ldd, stride_n=1, batch=0)
180+
}, // EpilogueArguments epilogue,
181+
hw_info
182+
};
183+
arguments.scheduler.max_swizzle_size = swizzle;
184+
cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8_device_type gemm_op;
185+
if (workspace_size) {
186+
*workspace_size = gemm_op.get_workspace_size(arguments);
187+
return 0;
188+
}
189+
// Validate the arguments (can_implement) only after the workspace-size query, since querying workspace size doesn't require valid data pointers
190+
#ifndef CUTLASS_BACKEND_DISABLE_CHECKS
191+
{
192+
auto status = gemm_op.can_implement(arguments);
193+
CUTLASS_CHECK(status);
194+
}
195+
#endif
196+
#ifdef CUTLASS_DEBUG_TRACE_LEVEL
197+
#if CUTLASS_DEBUG_TRACE_LEVEL == 1
198+
{
199+
// Print the maximum number of active blocks per SM for the kernel if CUTLASS_DEBUG_TRACE_LEVEL == 1
200+
// we don't need a print statement, it's happening inside the function.
201+
gemm_op.maximum_active_blocks();
202+
}
203+
#endif
204+
#endif
205+
{
206+
auto status = gemm_op.initialize(arguments, workspace, stream);
207+
CUTLASS_CHECK(status);
208+
}
209+
{
210+
auto status = gemm_op(stream);
211+
CUTLASS_CHECK(status);
212+
}
213+
}
214+
catch (std::exception& e) {
215+
std::cerr << "Runtime error: " << e.what() << std::endl;
216+
return -1;
217+
}
218+
catch (...) {
219+
return -1;
220+
}
221+
return 0;
222+
}
223+
}
224+
225+
// configuration name: cutlass3x_xe20_tensorop_gemm_bf16_256x256_32x0_nn_align8

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ if(CUTLASS_ENABLE_SYCL)
113113
08_bmg_gemm_f8
114114
09_bmg_grouped_gemm_f8
115115
10_bmg_grouped_gemm_mixed_dtype
116+
11_xe20_cutlass_library
116117
)
117118
add_subdirectory(${EXAMPLE})
118119
endforeach()

0 commit comments

Comments
 (0)