From 2dd9bb67a88fa50fb4c09a067597703d40f3caef Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Thu, 24 Apr 2025 19:29:07 +0000 Subject: [PATCH 1/4] Change make_ttgir and make_llir to make it closer to OpenAI version Signed-off-by: Tiotto, Ettore --- third_party/intel/backend/compiler.py | 97 ++++++++++++++++----------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 7c85f90c96..7f11de764b 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -40,7 +40,6 @@ class XPUOptions: num_warps: int = 4 num_ctas: int = 1 num_stages: int = 2 - split_barriers_scope: str = 'None' cluster_dims: tuple = (1, 1, 1) threads_per_warp: int = 32 optimize_epilogue: bool = False @@ -53,6 +52,7 @@ class XPUOptions: allow_fp8e4nv: bool = False allow_fp8e4b15: bool = True grf_mode: tuple = ('small', 'large', 'auto', 'default') + split_barriers_scope: str = 'None' max_num_imprecise_acc_default: int = 0 # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda extern_libs: dict = None debug: bool = False @@ -223,6 +223,45 @@ def parse_raise_block_pointer_flags() -> dict: raise_block_ptr_flags['ignore-masks'] = True return raise_block_ptr_flags + + @staticmethod + def validate_options(opt, properties): + # Check threads_per_warp and num_threads are within limits. + if opt.threads_per_warp not in properties['sub_group_sizes']: + raise ValueError( + f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})" + ) + if opt.num_warps > properties['max_num_sub_groups']: + raise ValueError( + f"num_warps={opt.num_warps} is unsupported for the target (limit is {properties['max_num_sub_groups']})" + ) + if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']: + raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})") + + + @staticmethod + def annotate_module(mod, properties, opt, target_arch): + # Annotate module with information required by subsequent transformations. + pm = ir.pass_manager(mod.context) + pm.enable_debug() + intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]), + properties["has_subgroup_2d_block_io"], + properties["has_subgroup_matrix_multiply_accumulate"], + properties["has_bfloat16_conversions"], opt.threads_per_warp, + target_arch) + pm.run(mod) + + + @staticmethod + def get_split_barrier_scope(opt): + split_barriers_scope = intel.SplitBarrierScope.none + if opt.split_barriers_scope == 'Workgroup': + split_barriers_scope = intel.SplitBarrierScope.Workgroup + elif opt.split_barriers_scope == 'Subgroup': + split_barriers_scope = intel.SplitBarrierScope.Subgroup + return split_barriers_scope + + @staticmethod def make_ttir(mod, metadata, opt): raise_block_ptr_flags = XPUBackend.parse_raise_block_pointer_flags() @@ -247,6 +286,7 @@ def make_ttir(mod, metadata, opt): pm.run(mod) return mod + @staticmethod def make_ttgir(mod, metadata, opt, properties): cluster_info = intel.ClusterInfo() @@ -255,46 +295,19 @@ def make_ttgir(mod, metadata, opt, properties): cluster_info.clusterDimY = opt.cluster_dims[1] cluster_info.clusterDimZ = opt.cluster_dims[2] - # 0:No barrier / 1:Workgroup scope / 2:Subgroup scope - split_barriers_scope = intel.SplitBarrierScope.none - if opt.split_barriers_scope == 'Workgroup': - split_barriers_scope = intel.SplitBarrierScope.Workgroup - elif opt.split_barriers_scope == 'Subgroup': - split_barriers_scope = intel.SplitBarrierScope.Subgroup # Annotate module with information required by subsequent transformations. - pm = ir.pass_manager(mod.context) - pm.enable_debug() - target_arch = "spir64" - intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]), - properties["has_subgroup_2d_block_io"], - properties["has_subgroup_matrix_multiply_accumulate"], - properties["has_bfloat16_conversions"], opt.threads_per_warp, - target_arch) - pm.run(mod) + XPUBackend.annotate_module(mod, properties, opt, "spir64") # Overwrite the threads_per_warp option with the module annotation. opt.threads_per_warp = intel.get_threads_per_warp(mod) - - # Check threads_per_warp and num_threads are within limits. - if opt.threads_per_warp not in properties['sub_group_sizes']: - raise ValueError( - f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})" - ) - if opt.num_warps > properties['max_num_sub_groups']: - raise ValueError( - f"num_warps={opt.num_warps} is unsupported for the target (limit is {properties['max_num_sub_groups']})" - ) - if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']: - raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})") - - # Run the TTIR -> TTGIR pass pipeline. - pm = ir.pass_manager(mod.context) - pm.enable_debug() + XPUBackend.validate_options(opt, properties) if (properties["has_subgroup_2d_block_io"] and properties["has_subgroup_matrix_multiply_accumulate"] and (os.getenv("TRITON_INTEL_ADVANCED_PATH", "0") == "1" or opt.advanced_path)): return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt) + pm = ir.pass_manager(mod.context) + dump_enabled = pm.enable_debug() passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas) # optimize TTGIR intel.passes.ttgpuir.add_coalesce(pm) @@ -303,7 +316,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, split_barriers_scope) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) @@ -326,13 +339,6 @@ def make_ttgir(mod, metadata, opt, properties): @staticmethod def make_llir(src, metadata, options): - # warp-specialization mutates num_warps - num_warp_groups = src.get_int_attr("ttg.num-warp-groups-per-cta") - if num_warp_groups is not None: - metadata["num_warps"] *= num_warp_groups - threads_per_warp = intel.get_threads_per_warp(src) - metadata["threads_per_warp"] = threads_per_warp - mod = src # TritonGPU -> LLVM-IR (MLIR) pm = ir.pass_manager(mod.context) @@ -345,9 +351,12 @@ def make_llir(src, metadata, options): # being used, e.g., convert_layout. if os.getenv("TRITON_INTEL_REDUCE_TRANSPOSE", "0") != "1": passes.ttgpuir.add_allocate_shared_memory(pm) + passes.ttgpuir.add_allocate_global_scratch_memory(pm) intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt, options.enable_tile_load_linear_layout) intel.passes.ttgpuir.add_rewrite_stack_ptr(pm) + passes.common.add_canonicalizer(pm) + passes.common.add_cse(pm) passes.convert.add_arith_to_llvmir(pm) passes.common.add_canonicalizer(pm) passes.common.add_cse(pm) @@ -362,14 +371,22 @@ def make_llir(src, metadata, options): intel.set_spv_target_triple(llvm_mod) if os.getenv("TRITON_INTEL_FAST_MATH", "0") == "1": intel.set_fast_math(llvm_mod) + if options.extern_libs: paths = [path for (name, path) in options.extern_libs] llvm.link_extern_libs(llvm_mod, paths) + intel.optimize_module(llvm_mod, llvm.OPTIMIZE_O3) intel.post_process_llir(llvm_mod) # Get some metadata + total_num_warps = src.get_int_attr("ttg.total-num-warps") + if total_num_warps is not None: + metadata["num_warps"] = total_num_warps + metadata["threads_per_warp"] = intel.get_threads_per_warp(src) metadata["shared"] = src.get_int_attr("ttg.shared") + metadata["global_scratch_size"] = src.get_int_attr("ttg.global_scratch_memory_size") + metadata["global_scratch_align"] = src.get_int_attr("ttg.global_scratch_memory_alignment") ret = str(llvm_mod) del llvm_mod del context From 4f59eae05246fdf7517e0a6b602b34c7f21f3f95 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Thu, 24 Apr 2025 22:00:42 +0000 Subject: [PATCH 2/4] Fix precommit Signed-off-by: Tiotto, Ettore --- third_party/intel/backend/compiler.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 45f0c2e64d..adf2cf912a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -52,7 +52,7 @@ class XPUOptions: allow_fp8e4nv: bool = False allow_fp8e4b15: bool = True grf_mode: tuple = ('small', 'large', 'auto', 'default') - split_barriers_scope: str = 'None' + split_barriers_scope: str = 'None' max_num_imprecise_acc_default: int = 0 # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda extern_libs: dict = None debug: bool = False @@ -223,7 +223,6 @@ def parse_raise_block_pointer_flags() -> dict: raise_block_ptr_flags['ignore-masks'] = True return raise_block_ptr_flags - @staticmethod def validate_options(opt, properties): # Check threads_per_warp and num_threads are within limits. @@ -238,8 +237,7 @@ def validate_options(opt, properties): if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']: raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})") - - @staticmethod + @staticmethod def annotate_module(mod, properties, opt, target_arch): # Annotate module with information required by subsequent transformations. pm = ir.pass_manager(mod.context) @@ -251,7 +249,6 @@ def annotate_module(mod, properties, opt, target_arch): target_arch) pm.run(mod) - @staticmethod def get_split_barrier_scope(opt): split_barriers_scope = intel.SplitBarrierScope.none @@ -261,7 +258,6 @@ def get_split_barrier_scope(opt): split_barriers_scope = intel.SplitBarrierScope.Subgroup return split_barriers_scope - @staticmethod def make_ttir(mod, metadata, opt): raise_block_ptr_flags = XPUBackend.parse_raise_block_pointer_flags() @@ -285,7 +281,6 @@ def make_ttir(mod, metadata, opt): pm.run(mod) return mod - @staticmethod def make_ttgir(mod, metadata, opt, properties): cluster_info = intel.ClusterInfo() From 254952e611294767e68a090cb4c044786edf78b8 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Fri, 25 Apr 2025 13:32:52 +0000 Subject: [PATCH 3/4] Address code review comments Signed-off-by: Tiotto, Ettore --- third_party/intel/backend/compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index adf2cf912a..ee4b5715e3 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -301,7 +301,7 @@ def make_ttgir(mod, metadata, opt, properties): return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt) pm = ir.pass_manager(mod.context) - dump_enabled = pm.enable_debug() + pm.enable_debug() passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas) # optimize TTGIR intel.passes.ttgpuir.add_coalesce(pm) @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, false, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) From cd097e12a9cf264c4dd94200c0a599907325244a Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Fri, 25 Apr 2025 13:44:09 +0000 Subject: [PATCH 4/4] Address code review comments Signed-off-by: Tiotto, Ettore --- third_party/intel/backend/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index ee4b5715e3..646ed28e8a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, false, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm)