
Commit 5f89ca3

lint and fix file structure

1 parent: 9557d21

22 files changed: +81 −73 lines

README.md — 1 addition, 1 deletion

@@ -82,7 +82,7 @@ Kraken is organized for easy hacking of distributed Triton kernels:
 ### Inline PTX Utils
-
+`kraken._ptx_utils` provides inline PTX implementations of memory-barrier synchronizations that are not natively supported by Triton.
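For context, the usual escape hatch for inline PTX in Triton is `tl.inline_asm_elementwise`, which maps a PTX snippet over a block of values. The sketch below only illustrates that mechanism with a made-up rotate kernel; it is not Kraken's code, whose actual barrier primitives live in `kraken._ptx_utils`.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def _rotl7_kernel(x_ptr, y_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs)
    # Funnel shift with both sources equal rotates each 32-bit lane left by 7.
    y = tl.inline_asm_elementwise(
        "shf.l.wrap.b32 $0, $1, $1, 7;",
        "=r,r",  # $0: output register, $1: input register
        [x],
        dtype=tl.int32,
        is_pure=True,
        pack=1,
    )
    tl.store(y_ptr + offs, y)

x = torch.arange(1024, device="cuda", dtype=torch.int32)
y = torch.empty_like(x)
_rotl7_kernel[(1,)](x, y, BLOCK=1024)
```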
benchmark/benchmark_all_gather_matmul.py — 1 addition, 1 deletion

@@ -93,7 +93,7 @@ def get_single_backend_fn(backend: str):
     if backend == "torch_symm_mem":
         return torch_symm_mem_ag_mm
     if backend == "triton":
-        return kraken.all_gather.all_gather_matmul
+        return kraken.all_gather_fusion.all_gather_matmul
     raise NotImplementedError(backend)
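Callers written against the old module path need the same update. A hypothetical compatibility shim (not part of this commit) could bridge both layouts:

```python
import kraken

# Hypothetical shim for callers that predate the rename: prefer the new
# all_gather_fusion module, fall back to the old all_gather one if present.
try:
    all_gather_matmul = kraken.all_gather_fusion.all_gather_matmul
except AttributeError:
    all_gather_matmul = kraken.all_gather.all_gather_matmul
```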

benchmark/benchmark_all_reduce.py — 1 addition, 1 deletion

@@ -114,7 +114,7 @@ def get_single_backend_fn(backend: str):
     if backend == "dist_2shot":
         return symm_mem_two_shot_all_reduce
     if backend == "triton_1shot":
-        return kraken.all_reduce.one_shot_all_reduce
+        return kraken.all_reduce_fusion.one_shot_all_reduce
     if backend == "nccl":
         return nccl_ring
     raise NotImplementedError(backend)
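The backend names reflect the two standard symmetric-memory all-reduce schemes: in a one-shot kernel every rank reads all peers' buffers and reduces locally, while a two-shot kernel first reduce-scatters shards and then all-gathers the reduced shards. A rough reference for the two-shot scheme built from stock collectives (my sketch, not Kraken's kernel):

```python
import torch
import torch.distributed as dist

def two_shot_all_reduce_reference(x: torch.Tensor) -> torch.Tensor:
    # Assumes x is 1-D with numel divisible by the world size.
    world = dist.get_world_size()
    shard = torch.empty(x.numel() // world, dtype=x.dtype, device=x.device)
    dist.reduce_scatter_tensor(shard, x)     # shot 1: each rank reduces one shard
    out = torch.empty_like(x)
    dist.all_gather_into_tensor(out, shard)  # shot 2: shards recombined everywhere
    return out
```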

benchmark/benchmark_all_reduce_bias.py — 3 additions, 6 deletions

@@ -7,26 +7,23 @@
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem

+import kraken
 from kraken import _logging as log
-from kraken.all_reduce_fusion import (
-    one_shot_all_reduce_bias,
-    two_shot_all_reduce_bias,
-)


 def one_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y


 def two_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y
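The substantive fix here is a name-shadowing bug: each benchmark wrapper shared its name with the function it imported, so the call inside the body resolved to the wrapper itself and recursed forever. Module-qualified calls sidestep that. A stripped-down illustration with a standard-library function:

```python
from math import sqrt

def sqrt(x):        # rebinds the module-level name, shadowing math.sqrt
    return sqrt(x)  # resolves to this wrapper itself -> RecursionError

# The fix applied above: keep the library call module-qualified.
import math

def safe_sqrt(x):
    return math.sqrt(x)
```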
benchmark/benchmark_all_reduce_bias_rms_norm.py — 8 additions, 14 deletions

@@ -7,44 +7,38 @@
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem

+import kraken
 from kraken import _logging as log
-from kraken.all_reduce_fusion import (
-    rms_norm,
-    one_shot_all_reduce_bias,
-    one_shot_all_reduce_bias_rms_norm,
-    two_shot_all_reduce_bias,
-    two_shot_all_reduce_bias_rms_norm,
-)


 def one_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
     return y


 def one_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return rms_norm(y, rms_weight)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)


 def two_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
     return y


 def two_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return rms_norm(y, rms_weight)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)


 def nccl_all_reduce_bias_rms_norm(x, bias, rms_weight):
     dist.all_reduce(x)
     y = x + bias
-    return rms_norm(y, rms_weight)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)


 def create_benchmarks(b, t, d_size, device, dtype):
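For checking the fused kernels' output, a plain eager RMSNorm is a handy reference. The formula is standard (x scaled by its reciprocal root-mean-square over the last dimension, then an elementwise weight); the signature and epsilon below are my assumptions, not Kraken's API:

```python
import torch

def rms_norm_reference(
    x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
) -> torch.Tensor:
    # RMSNorm over the last dim: x / sqrt(mean(x^2) + eps), scaled elementwise.
    rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).rsqrt()
    return x * rms * weight
```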

benchmark/benchmark_matmul_reduce_scatter.py — 13 additions, 9 deletions

@@ -1,16 +1,16 @@
 import argparse
+from collections import defaultdict
 import csv
+from dataclasses import asdict, dataclass
 import functools
 import itertools
 import os
 import sys
-from collections import defaultdict
-from dataclasses import asdict, dataclass

+from tabulate import tabulate
 import torch
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem
-from tabulate import tabulate

 # Add the kraken directory to the Python path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

@@ -20,18 +20,22 @@


 def torch_symm_mem_gemm_rs(a, b):
-    gemm_rs_output = torch.ops.symm_mem.fused_matmul_reduce_scatter(
+    return torch.ops.symm_mem.fused_matmul_reduce_scatter(
         a, b, "sum", scatter_dim=0, group_name=dist.group.WORLD.group_name
     )
-    return gemm_rs_output
+

 def nccl_mem_gemm_rs(a, b):
-    from torch.distributed._functional_collectives import reduce_scatter_tensor, wait_tensor
+    from torch.distributed._functional_collectives import (
+        reduce_scatter_tensor,
+        wait_tensor,
+    )

     gemm_output = torch.matmul(a, b)
-    rs_o = reduce_scatter_tensor(gemm_output, "sum", scatter_dim=0, group=dist.group.WORLD)
-    gemm_rs_output = wait_tensor(rs_o)
-    return gemm_rs_output
+    rs_o = reduce_scatter_tensor(
+        gemm_output, "sum", scatter_dim=0, group=dist.group.WORLD
+    )
+    return wait_tensor(rs_o)


 @dataclass(frozen=True)
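One detail worth calling out in `nccl_mem_gemm_rs`: functional collectives return without blocking, and the result must pass through `wait_tensor` before it is mixed with eager ops. A minimal sketch of the same pattern with `all_reduce`:

```python
import torch
import torch.distributed as dist
from torch.distributed._functional_collectives import all_reduce, wait_tensor

def functional_sum(x: torch.Tensor) -> torch.Tensor:
    y = all_reduce(x, "sum", group=dist.group.WORLD)  # returns without blocking
    return wait_tensor(y)  # block until the collective result is valid to read
```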

kraken/__init__.py — 14 additions, 2 deletions

@@ -1,3 +1,15 @@
-from . import _logging, all_gather, all_reduce, all_reduce_fusion, reduce_scatter_fusion
+from . import (
+    _logging,
+    all_gather_fusion,
+    all_reduce,
+    all_reduce_fusion,
+    reduce_scatter_fusion,
+)

-__all__ = ["_logging", "all_gather", "all_reduce", "all_reduce_fusion", "reduce_scatter_fusion"]
+__all__ = [
+    "_logging",
+    "all_gather_fusion",
+    "all_reduce",
+    "all_reduce_fusion",
+    "reduce_scatter_fusion",
+]
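The `__all__` change matters for star-importers: `from kraken import *` now binds `all_gather_fusion` and no longer binds `all_gather`. A hypothetical caller:

```python
# Hypothetical star-importing caller; after this commit all_gather_fusion
# is exported (and all_gather no longer is), per the updated __all__.
from kraken import *

ag_mm = all_gather_fusion.all_gather_matmul
```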

kraken/_ptx_utils/__init__.py — 6 additions, 2 deletions

@@ -1,15 +1,19 @@
 from .gmem_barrier_arrive_wait import arrive_gmem_barrier, wait_gmem_barrier
 from .symm_mem_barrier import (
     _get_flat_tid as get_flat_tid,
+)
+from .symm_mem_barrier import (
     _send_signal as send_signal,
+)
+from .symm_mem_barrier import (
     symm_mem_sync as symm_mem_sync,
 )

 __all__ = [
     "arrive_gmem_barrier",
-    "symm_mem_sync",
-    "wait_gmem_barrier",
     "get_flat_tid",
     "send_signal",
+    "symm_mem_sync",
+    "wait_gmem_barrier",
 ]
 # Avoid ptx_utils when possible
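The odd-looking `symm_mem_sync as symm_mem_sync` follows the explicit re-export convention from PEP 484: a redundant `import X as X` alias tells type checkers and linters that the name is intentionally part of the module's public surface. The same pattern with a standard-library module:

```python
# A redundant alias marks an intentional public re-export rather than an
# accidental import; type checkers treat `path` as part of this module's API.
from os import path as path
```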
File renamed without changes.
