From 2dd9bb67a88fa50fb4c09a067597703d40f3caef Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore" <ettore.tiotto@intel.com>
Date: Thu, 24 Apr 2025 19:29:07 +0000
Subject: [PATCH 1/4] Change make_ttgir and make_llir to bring them closer to
 the OpenAI version

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
---
 third_party/intel/backend/compiler.py | 97 ++++++++++++++++-----------
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 7c85f90c96..7f11de764b 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -40,7 +40,6 @@ class XPUOptions:
     num_warps: int = 4
     num_ctas: int = 1
     num_stages: int = 2
-    split_barriers_scope: str = 'None'
     cluster_dims: tuple = (1, 1, 1)
     threads_per_warp: int = 32
     optimize_epilogue: bool = False
@@ -53,6 +52,7 @@ class XPUOptions:
     allow_fp8e4nv: bool = False
     allow_fp8e4b15: bool = True
     grf_mode: tuple = ('small', 'large', 'auto', 'default')
+    split_barriers_scope: str = 'None'    
     max_num_imprecise_acc_default: int = 0  # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda
     extern_libs: dict = None
     debug: bool = False
@@ -223,6 +223,45 @@ def parse_raise_block_pointer_flags() -> dict:
                 raise_block_ptr_flags['ignore-masks'] = True
         return raise_block_ptr_flags
 
+
+    @staticmethod
+    def validate_options(opt, properties):
+        # Check threads_per_warp and num_threads are within limits.
+        if opt.threads_per_warp not in properties['sub_group_sizes']:
+            raise ValueError(
+                f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})"
+            )
+        if opt.num_warps > properties['max_num_sub_groups']:
+            raise ValueError(
+                f"num_warps={opt.num_warps} is unsupported for the target (limit is {properties['max_num_sub_groups']})"
+            )
+        if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']:
+            raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})")
+
+
+    @staticmethod 
+    def annotate_module(mod, properties, opt, target_arch):
+        # Annotate module with information required by subsequent transformations.
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
+                                                        properties["has_subgroup_2d_block_io"],
+                                                        properties["has_subgroup_matrix_multiply_accumulate"],
+                                                        properties["has_bfloat16_conversions"], opt.threads_per_warp,
+                                                        target_arch)
+        pm.run(mod)
+
+
+    @staticmethod
+    def get_split_barrier_scope(opt):
+        split_barriers_scope = intel.SplitBarrierScope.none
+        if opt.split_barriers_scope == 'Workgroup':
+            split_barriers_scope = intel.SplitBarrierScope.Workgroup
+        elif opt.split_barriers_scope == 'Subgroup':
+            split_barriers_scope = intel.SplitBarrierScope.Subgroup
+        return split_barriers_scope
+
+
     @staticmethod
     def make_ttir(mod, metadata, opt):
         raise_block_ptr_flags = XPUBackend.parse_raise_block_pointer_flags()
@@ -247,6 +286,7 @@ def make_ttir(mod, metadata, opt):
         pm.run(mod)
         return mod
 
+
     @staticmethod
     def make_ttgir(mod, metadata, opt, properties):
         cluster_info = intel.ClusterInfo()
@@ -255,46 +295,19 @@ def make_ttgir(mod, metadata, opt, properties):
             cluster_info.clusterDimY = opt.cluster_dims[1]
             cluster_info.clusterDimZ = opt.cluster_dims[2]
 
-        # 0:No barrier / 1:Workgroup scope / 2:Subgroup scope
-        split_barriers_scope = intel.SplitBarrierScope.none
-        if opt.split_barriers_scope == 'Workgroup':
-            split_barriers_scope = intel.SplitBarrierScope.Workgroup
-        elif opt.split_barriers_scope == 'Subgroup':
-            split_barriers_scope = intel.SplitBarrierScope.Subgroup
         # Annotate module with information required by subsequent transformations.
-        pm = ir.pass_manager(mod.context)
-        pm.enable_debug()
-        target_arch = "spir64"
-        intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
-                                                        properties["has_subgroup_2d_block_io"],
-                                                        properties["has_subgroup_matrix_multiply_accumulate"],
-                                                        properties["has_bfloat16_conversions"], opt.threads_per_warp,
-                                                        target_arch)
-        pm.run(mod)
+        XPUBackend.annotate_module(mod, properties, opt, "spir64")
 
         # Overwrite the threads_per_warp option with the module annotation.
         opt.threads_per_warp = intel.get_threads_per_warp(mod)
-
-        # Check threads_per_warp and num_threads are within limits.
-        if opt.threads_per_warp not in properties['sub_group_sizes']:
-            raise ValueError(
-                f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})"
-            )
-        if opt.num_warps > properties['max_num_sub_groups']:
-            raise ValueError(
-                f"num_warps={opt.num_warps} is unsupported for the target (limit is {properties['max_num_sub_groups']})"
-            )
-        if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']:
-            raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})")
-
-        # Run the TTIR -> TTGIR pass pipeline.
-        pm = ir.pass_manager(mod.context)
-        pm.enable_debug()
+        XPUBackend.validate_options(opt, properties)
 
         if (properties["has_subgroup_2d_block_io"] and properties["has_subgroup_matrix_multiply_accumulate"]
                 and (os.getenv("TRITON_INTEL_ADVANCED_PATH", "0") == "1" or opt.advanced_path)):
             return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt)
 
+        pm = ir.pass_manager(mod.context)
+        dump_enabled = pm.enable_debug()
         passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
         # optimize TTGIR
         intel.passes.ttgpuir.add_coalesce(pm)
@@ -303,7 +316,7 @@ def make_ttgir(mod, metadata, opt, properties):
         intel.passes.ttgpuir.add_accelerate_matmul(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, split_barriers_scope)
+        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled, XPUBackend.get_split_barrier_scope(opt))
 
         passes.ttgpuir.add_fuse_nested_loops(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)
@@ -326,13 +339,6 @@ def make_ttgir(mod, metadata, opt, properties):
 
     @staticmethod
     def make_llir(src, metadata, options):
-        # warp-specialization mutates num_warps
-        num_warp_groups = src.get_int_attr("ttg.num-warp-groups-per-cta")
-        if num_warp_groups is not None:
-            metadata["num_warps"] *= num_warp_groups
-        threads_per_warp = intel.get_threads_per_warp(src)
-        metadata["threads_per_warp"] = threads_per_warp
-
         mod = src
         # TritonGPU -> LLVM-IR (MLIR)
         pm = ir.pass_manager(mod.context)
@@ -345,9 +351,12 @@ def make_llir(src, metadata, options):
         # being used, e.g., convert_layout.
         if os.getenv("TRITON_INTEL_REDUCE_TRANSPOSE", "0") != "1":
             passes.ttgpuir.add_allocate_shared_memory(pm)
+        passes.ttgpuir.add_allocate_global_scratch_memory(pm)
         intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt,
                                            options.enable_tile_load_linear_layout)
         intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
         passes.convert.add_arith_to_llvmir(pm)
         passes.common.add_canonicalizer(pm)
         passes.common.add_cse(pm)
@@ -362,14 +371,22 @@ def make_llir(src, metadata, options):
         intel.set_spv_target_triple(llvm_mod)
         if os.getenv("TRITON_INTEL_FAST_MATH", "0") == "1":
             intel.set_fast_math(llvm_mod)
+
         if options.extern_libs:
             paths = [path for (name, path) in options.extern_libs]
             llvm.link_extern_libs(llvm_mod, paths)
+
         intel.optimize_module(llvm_mod, llvm.OPTIMIZE_O3)
         intel.post_process_llir(llvm_mod)
 
         # Get some metadata
+        total_num_warps = src.get_int_attr("ttg.total-num-warps")
+        if total_num_warps is not None:
+            metadata["num_warps"] = total_num_warps
+        metadata["threads_per_warp"] = intel.get_threads_per_warp(src)
         metadata["shared"] = src.get_int_attr("ttg.shared")
+        metadata["global_scratch_size"] = src.get_int_attr("ttg.global_scratch_memory_size")
+        metadata["global_scratch_align"] = src.get_int_attr("ttg.global_scratch_memory_alignment")
         ret = str(llvm_mod)
         del llvm_mod
         del context

From 4f59eae05246fdf7517e0a6b602b34c7f21f3f95 Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore" <ettore.tiotto@intel.com>
Date: Thu, 24 Apr 2025 22:00:42 +0000
Subject: [PATCH 2/4] Fix precommit

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
---
 third_party/intel/backend/compiler.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 45f0c2e64d..adf2cf912a 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -52,7 +52,7 @@ class XPUOptions:
     allow_fp8e4nv: bool = False
     allow_fp8e4b15: bool = True
     grf_mode: tuple = ('small', 'large', 'auto', 'default')
-    split_barriers_scope: str = 'None'    
+    split_barriers_scope: str = 'None'
     max_num_imprecise_acc_default: int = 0  # `max_num_imprecise_acc` only applies to fp8 -> fp32 dot on sm_90 for cuda
     extern_libs: dict = None
     debug: bool = False
@@ -223,7 +223,6 @@ def parse_raise_block_pointer_flags() -> dict:
                 raise_block_ptr_flags['ignore-masks'] = True
         return raise_block_ptr_flags
 
-
     @staticmethod
     def validate_options(opt, properties):
         # Check threads_per_warp and num_threads are within limits.
@@ -238,8 +237,7 @@ def validate_options(opt, properties):
         if opt.threads_per_warp * opt.num_warps > properties['max_work_group_size']:
             raise ValueError(f"Kernel threads number exceeds the limit ({properties['max_work_group_size']})")
 
-
-    @staticmethod 
+    @staticmethod
     def annotate_module(mod, properties, opt, target_arch):
         # Annotate module with information required by subsequent transformations.
         pm = ir.pass_manager(mod.context)
@@ -251,7 +249,6 @@ def annotate_module(mod, properties, opt, target_arch):
                                                         target_arch)
         pm.run(mod)
 
-
     @staticmethod
     def get_split_barrier_scope(opt):
         split_barriers_scope = intel.SplitBarrierScope.none
@@ -261,7 +258,6 @@ def get_split_barrier_scope(opt):
             split_barriers_scope = intel.SplitBarrierScope.Subgroup
         return split_barriers_scope
 
-
     @staticmethod
     def make_ttir(mod, metadata, opt):
         raise_block_ptr_flags = XPUBackend.parse_raise_block_pointer_flags()
@@ -285,7 +281,6 @@ def make_ttir(mod, metadata, opt):
         pm.run(mod)
         return mod
 
-
     @staticmethod
     def make_ttgir(mod, metadata, opt, properties):
         cluster_info = intel.ClusterInfo()

From 254952e611294767e68a090cb4c044786edf78b8 Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore" <ettore.tiotto@intel.com>
Date: Fri, 25 Apr 2025 13:32:52 +0000
Subject: [PATCH 3/4] Address code review comments

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
---
 third_party/intel/backend/compiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index adf2cf912a..ee4b5715e3 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -301,7 +301,7 @@ def make_ttgir(mod, metadata, opt, properties):
             return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt)
 
         pm = ir.pass_manager(mod.context)
-        dump_enabled = pm.enable_debug()
+        pm.enable_debug()
         passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
         # optimize TTGIR
         intel.passes.ttgpuir.add_coalesce(pm)
@@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties):
         intel.passes.ttgpuir.add_accelerate_matmul(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled, XPUBackend.get_split_barrier_scope(opt))
+        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, false, XPUBackend.get_split_barrier_scope(opt))
 
         passes.ttgpuir.add_fuse_nested_loops(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)

From cd097e12a9cf264c4dd94200c0a599907325244a Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore" <ettore.tiotto@intel.com>
Date: Fri, 25 Apr 2025 13:44:09 +0000
Subject: [PATCH 4/4] Address code review comments

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
---
 third_party/intel/backend/compiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index ee4b5715e3..646ed28e8a 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties):
         intel.passes.ttgpuir.add_accelerate_matmul(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, false, XPUBackend.get_split_barrier_scope(opt))
+        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt))
 
         passes.ttgpuir.add_fuse_nested_loops(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)