Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 52 additions & 39 deletions third_party/intel/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class XPUOptions:
num_warps: int = 4
num_ctas: int = 1
num_stages: int = 2
split_barriers_scope: str = 'None'
cluster_dims: tuple = (1, 1, 1)
threads_per_warp: int = 32
optimize_epilogue: bool = False
Expand All @@ -53,6 +52,7 @@ class XPUOptions:
allow_fp8e4nv: bool = False
allow_fp8e4b15: bool = True
grf_mode: tuple = ('small', 'large', 'auto', 'default')
split_barriers_scope: str = 'None'
max_num_imprecise_acc_default: int = 0 # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda
extern_libs: dict = None
debug: bool = False
Expand Down Expand Up @@ -223,6 +223,41 @@ def parse_raise_block_pointer_flags() -> dict:
raise_block_ptr_flags['ignore-masks'] = True
return raise_block_ptr_flags

@staticmethod
def validate_options(opt, properties):
    """Check the launch options against the limits reported for the target.

    Args:
        opt: options object; ``threads_per_warp`` and ``num_warps`` are read.
        properties: mapping; the keys ``sub_group_sizes``,
            ``max_num_sub_groups`` and ``max_work_group_size`` are read.

    Raises:
        ValueError: if ``threads_per_warp`` is not one of the supported
            sub-group sizes, if ``num_warps`` exceeds the sub-group limit,
            or if ``threads_per_warp * num_warps`` exceeds the work-group
            size limit.
    """
    # The sub-group (warp) size must be one the target explicitly lists.
    warp_size = opt.threads_per_warp
    supported_sizes = properties['sub_group_sizes']
    if warp_size not in supported_sizes:
        raise ValueError(
            f"threads_per_warp={warp_size} is unsupported for the target (supported values are {supported_sizes})"
        )

    # The number of warps is capped by the sub-group count limit.
    warp_count = opt.num_warps
    sub_group_limit = properties['max_num_sub_groups']
    if warp_count > sub_group_limit:
        raise ValueError(
            f"num_warps={warp_count} is unsupported for the target (limit is {sub_group_limit})"
        )

    # The total thread count must fit within the work-group size limit.
    total_threads = warp_size * warp_count
    work_group_limit = properties['max_work_group_size']
    if total_threads > work_group_limit:
        raise ValueError(f"Kernel threads number exceeds the limit ({work_group_limit})")

@staticmethod
def annotate_module(mod, properties, opt, target_arch):
    """Run the annotate-module pass on ``mod``.

    Annotates the module with information required by subsequent
    transformations.

    Args:
        mod: module to annotate; its ``context`` is used to build the
            pass manager.
        properties: mapping; the keys ``sub_group_sizes``,
            ``has_subgroup_2d_block_io``,
            ``has_subgroup_matrix_multiply_accumulate`` and
            ``has_bfloat16_conversions`` are read.
        opt: options object; ``threads_per_warp`` is read.
        target_arch: value forwarded unchanged to the pass as its last
            argument.
    """
    pass_mgr = ir.pass_manager(mod.context)
    pass_mgr.enable_debug()

    add_annotate_pass = intel.passes.ttgpuir.add_triton_annotate_module
    # Gather the pass arguments in the order the pass expects them.
    smallest_sub_group = min(properties["sub_group_sizes"])
    has_2d_block_io = properties["has_subgroup_2d_block_io"]
    has_matrix_mma = properties["has_subgroup_matrix_multiply_accumulate"]
    has_bf16_conversions = properties["has_bfloat16_conversions"]
    warp_size = opt.threads_per_warp

    add_annotate_pass(pass_mgr, smallest_sub_group, has_2d_block_io, has_matrix_mma, has_bf16_conversions,
                      warp_size, target_arch)
    pass_mgr.run(mod)

@staticmethod
def get_split_barrier_scope(opt):
    """Translate ``opt.split_barriers_scope`` into an ``intel.SplitBarrierScope`` value.

    ``'Workgroup'`` and ``'Subgroup'`` map to the members of the same name;
    any other value yields ``intel.SplitBarrierScope.none``.
    """
    requested_scope = opt.split_barriers_scope
    if requested_scope == 'Workgroup':
        return intel.SplitBarrierScope.Workgroup
    if requested_scope == 'Subgroup':
        return intel.SplitBarrierScope.Subgroup
    # Anything else (including the default 'None' string) disables split barriers.
    return intel.SplitBarrierScope.none

@staticmethod
def make_ttir(mod, metadata, opt):
raise_block_ptr_flags = XPUBackend.parse_raise_block_pointer_flags()
Expand Down Expand Up @@ -254,45 +289,19 @@ def make_ttgir(mod, metadata, opt, properties):
cluster_info.clusterDimY = opt.cluster_dims[1]
cluster_info.clusterDimZ = opt.cluster_dims[2]

# 0:No barrier / 1:Workgroup scope / 2:Subgroup scope
split_barriers_scope = intel.SplitBarrierScope.none
if opt.split_barriers_scope == 'Workgroup':
split_barriers_scope = intel.SplitBarrierScope.Workgroup
elif opt.split_barriers_scope == 'Subgroup':
split_barriers_scope = intel.SplitBarrierScope.Subgroup
# Annotate module with information required by subsequent transformations.
pm = ir.pass_manager(mod.context)
pm.enable_debug()
intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
properties["has_subgroup_2d_block_io"],
properties["has_subgroup_matrix_multiply_accumulate"],
properties["has_bfloat16_conversions"], opt.threads_per_warp,
"spir64")
pm.run(mod)
XPUBackend.annotate_module(mod, properties, opt, "spir64")

# Overwrite the threads_per_warp option with the module annotation.
opt.threads_per_warp = intel.get_threads_per_warp(mod)

# Check threads_per_warp and num_threads are within limits.
if opt.threads_per_warp not in properties['sub_group_sizes']:
raise ValueError(
f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})"
)
if opt.num_warps > properties['max_num_sub_groups']:
raise ValueError(
f"num_warps={opt.num_warps} is unsupported for the target (limit is {properties['max_num_sub_groups']})"
)
if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']:
raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})")

# Run the TTIR -> TTGIR pass pipeline.
pm = ir.pass_manager(mod.context)
pm.enable_debug()
XPUBackend.validate_options(opt, properties)

if (properties["has_subgroup_2d_block_io"] and properties["has_subgroup_matrix_multiply_accumulate"]
and (os.getenv("TRITON_INTEL_ADVANCED_PATH", "0") == "1" or opt.advanced_path)):
return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt)

pm = ir.pass_manager(mod.context)
pm.enable_debug()
passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
# optimize TTGIR
intel.passes.ttgpuir.add_coalesce(pm)
Expand All @@ -301,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties):
intel.passes.ttgpuir.add_accelerate_matmul(pm)
intel.passes.ttgpuir.add_remove_layout_conversions(pm)
intel.passes.ttgpuir.add_materialize_block_pointer(pm)
intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, split_barriers_scope)
intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt))

passes.ttgpuir.add_fuse_nested_loops(pm)
passes.ttgpuir.add_optimize_thread_locality(pm)
Expand All @@ -324,13 +333,6 @@ def make_ttgir(mod, metadata, opt, properties):

@staticmethod
def make_llir(src, metadata, options):
# warp-specialization mutates num_warps
num_warp_groups = src.get_int_attr("ttg.num-warp-groups-per-cta")
if num_warp_groups is not None:
metadata["num_warps"] *= num_warp_groups
threads_per_warp = intel.get_threads_per_warp(src)
metadata["threads_per_warp"] = threads_per_warp

mod = src
# TritonGPU -> LLVM-IR (MLIR)
pm = ir.pass_manager(mod.context)
Expand All @@ -343,9 +345,12 @@ def make_llir(src, metadata, options):
# being used, e.g., convert_layout.
if os.getenv("TRITON_INTEL_REDUCE_TRANSPOSE", "0") != "1":
passes.ttgpuir.add_allocate_shared_memory(pm)
passes.ttgpuir.add_allocate_global_scratch_memory(pm)
intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt,
options.enable_tile_load_linear_layout)
intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)
passes.common.add_canonicalizer(pm)
passes.common.add_cse(pm)
passes.convert.add_arith_to_llvmir(pm)
passes.common.add_canonicalizer(pm)
passes.common.add_cse(pm)
Expand All @@ -360,14 +365,22 @@ def make_llir(src, metadata, options):
intel.set_spv_target_triple(llvm_mod)
if os.getenv("TRITON_INTEL_FAST_MATH", "0") == "1":
intel.set_fast_math(llvm_mod)

if options.extern_libs:
paths = [path for (name, path) in options.extern_libs]
llvm.link_extern_libs(llvm_mod, paths)

intel.optimize_module(llvm_mod, llvm.OPTIMIZE_O3)
intel.post_process_llir(llvm_mod)

# Get some metadata
total_num_warps = src.get_int_attr("ttg.total-num-warps")
if total_num_warps is not None:
metadata["num_warps"] = total_num_warps
metadata["threads_per_warp"] = intel.get_threads_per_warp(src)
metadata["shared"] = src.get_int_attr("ttg.shared")
metadata["global_scratch_size"] = src.get_int_attr("ttg.global_scratch_memory_size")
metadata["global_scratch_align"] = src.get_int_attr("ttg.global_scratch_memory_alignment")
ret = str(llvm_mod)
del llvm_mod
del context
Expand Down