meta-pytorch
diff --git a/‎README.md‎
Lines changed: 19 additions & 16 deletions b/‎README.md‎
Lines changed: 19 additions & 16 deletions
diff --git a/‎benchmark/benchmark_all_gather_matmul.py‎
Lines changed: 11 additions & 31 deletions b/‎benchmark/benchmark_all_gather_matmul.py‎
Lines changed: 11 additions & 31 deletions
diff --git a/‎benchmark/benchmark_all_reduce.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmark/benchmark_all_reduce.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmark_all_reduce_bias.py‎
Lines changed: 2 additions & 2 deletions b/‎benchmark/benchmark_all_reduce_bias.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmark/benchmark_all_reduce_bias_rms_norm.py‎
Lines changed: 11 additions & 7 deletions b/‎benchmark/benchmark_all_reduce_bias_rms_norm.py‎
Lines changed: 11 additions & 7 deletions
diff --git a/‎benchmark/benchmark_matmul_reduce_scatter.py‎
Lines changed: 11 additions & 31 deletions b/‎benchmark/benchmark_matmul_reduce_scatter.py‎
Lines changed: 11 additions & 31 deletions
diff --git a/‎kraken/__init__.py‎
Lines changed: 6 additions & 8 deletions b/‎kraken/__init__.py‎
Lines changed: 6 additions & 8 deletions
diff --git a/‎kraken/all_gather_fusion/__init__.py‎
Lines changed: 0 additions & 5 deletions b/‎kraken/all_gather_fusion/__init__.py‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎kraken/all_gather_fusion/copy_engine_all_gather.py‎
Lines changed: 0 additions & 50 deletions b/‎kraken/all_gather_fusion/copy_engine_all_gather.py‎
Lines changed: 0 additions & 50 deletions
@@ -23,8 +23,9 @@ Our initial kernels are adapted from the [Symmetric Memory Recipes](https://gith
 ## 🚀 Getting Started
 ### Prerequisites
 - PyTorch (version 2.6.0 or higher)
-- Triton (version 3.3.0 or higher)
+- Triton (version 3.3.0)
 - Python (version 3.10 or higher)
+- CUDA (version 12.4 or higher). The version must match your PyTorch installation.
 
 ### Installation
 ```bash
@@ -122,19 +123,22 @@ custom_distributed_kernel[grid](
 Kraken is organized for easy hacking of distributed Triton kernel: 
 
 ### Example Kernels
-#### `kraken.all_gather_fusion`
-- `all_gather_matmul`
-#### `kraken.all_reduce_fusion`
-- `rms_norm`,
-- `gemm_one_shot_all_reduce_fused`
--  `one_shot_all_reduce_bias`
-- `one_shot_all_reduce_bias_rms_norm`
-- `two_shot_all_reduce_bias`
-- `two_shot_all_reduce_bias_rms_norm`
+#### `kraken.comm`
+Contains communication kernels with fine-grained synchronization.
+- `all_gather_w_progress`
 - `one_shot_all_reduce`
-#### `kraken.reduce_scatter_fusion`
-- `gemm_reduce_scatter`
-- `gemm_reduce_scatter_ce_persistent`
+- (coming soon) `two_shot_all_reduce`
+- (coming soon) `multimem_all_reduce`
+#### `kraken.fused`
+Fused communication/computation kernels. 
+- All gather matmul: `all_gather_matmul`
+- Gemm all reduce: `gemm_one_shot_all_reduce_fused`
+- Gemm reduce scatter: `gemm_reduce_scatter`, `gemm_reduce_scatter_ce_persistent`
+- Reduce bias: `one_shot_all_reduce_bias`, `two_shot_all_reduce_bias`
+- Reduce bias rms_norm: `one_shot_all_reduce_bias_rms_norm`, `two_shot_all_reduce_bias_rms_norm` 
+
+#### `kraken.quantized`
+(coming soon) Fused communication/computation kernels with quantization.
 
 
 ### Inline PTX Utils
@@ -146,10 +150,9 @@ Kraken is organized for easy hacking of distributed Triton kernel:
 Kraken includes a set of benchmarks in `benchmarks/` to evaluate the performance of its kernels. You can run them as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc-per-node 8 \
+torchrun --nnodes 1 --nproc-per-node <world_size> \
 --rdzv-backend c10d --rdzv-endpoint localhost:0 --no_python python3 \
-benchmark/benchmark_all_reduce.py \
---backend nccl,triton_1shot,dist_1shot
+benchmark/benchmark_all_reduce.py 
 # ... and so on for other benchmarks
 ```
 
 
@@ -3,7 +3,6 @@
 import csv
 from dataclasses import asdict, dataclass
 import functools
-import itertools
 import os
 import sys
 
@@ -63,15 +62,10 @@ def asdict(self):
 
 def generate_experiment_configs(
     dtype: torch.dtype,
-    M: list[int],
-    N: list[int],
-    K: list[int],
+    shapes: list[tuple[int, int, int]],
     backends: list[str],
     device: torch.device,
 ) -> list[ExperimentConfig]:
-    # Generate cross config shapes from M, N, K lists
-    shapes = list(itertools.product(M, N, K))
-
     all_configs = []
     for shape in shapes:
         all_configs.append(
@@ -93,7 +87,7 @@ def get_single_backend_fn(backend: str):
     if backend == "torch_symm_mem":
         return torch_symm_mem_ag_mm
     if backend == "triton":
-        return kraken.all_gather_fusion.all_gather_matmul
+        return kraken.fused.all_gather_matmul
     raise NotImplementedError(backend)
 
 
@@ -176,9 +170,7 @@ def main(args):
     torch.manual_seed(42 + local_rank)
 
     results = []
-    configs = generate_experiment_configs(
-        args.dtype, args.M, args.N, args.K, args.backend, device
-    )
+    configs = generate_experiment_configs(args.dtype, args.shape, args.backend, device)
     for config in configs:
         results.append(
             Experiment(
@@ -196,7 +188,7 @@ def shape_input_type(s):
         M, N, K = map(int, s.split(","))
         return M, N, K
     except Exception as e:
-        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
+        raise argparse.ArgumentTypeError("Shape must be M, N, K") from e
 
 
 if __name__ == "__main__":
@@ -228,27 +220,15 @@ def shape_input_type(s):
     )
 
     parser.add_argument(
-        "-M",
-        type=shape_input_type,
-        nargs="+",
-        default=[2**x for x in range(7, 11)],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
-    )
-
-    parser.add_argument(
-        "-N",
+        "--shape",
         type=shape_input_type,
         nargs="+",
-        default=[6656],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
-    )
-
-    parser.add_argument(
-        "-K",
-        type=shape_input_type,
-        nargs="+",
-        default=[2**x for x in range(12, 15)],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
+        default=[
+            (m, 6656, k)
+            for m in [2**x for x in range(7, 11)]
+            for k in [2**x for x in range(12, 16)]
+        ],
+        help="matmul shapes: M, N, K. (M, K) @ (K, N) -> (M, N)",
     )
 
     parser.add_argument("-dtype", type=str, help="dtype", default="bfloat16")
 
@@ -114,7 +114,7 @@ def get_single_backend_fn(backend: str):
     if backend == "dist_2shot":
         return symm_mem_two_shot_all_reduce
     if backend == "triton_1shot":
-        return kraken.all_reduce_fusion.one_shot_all_reduce
+        return kraken.comm.one_shot_all_reduce
     if backend == "nccl":
         return nccl_ring
     raise NotImplementedError(backend)
 
@@ -15,15 +15,15 @@ def one_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.fused.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y
 
 
 def two_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.fused.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y
 
 
 
@@ -13,32 +13,36 @@
 
 def one_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.one_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.fused.one_shot_all_reduce_bias_rms_norm(
+        symm_mem_input, x, bias, rms_weight, y
+    )
     return y
 
 
 def one_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
+    kraken.fused.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.fused.rms_norm(y, rms_weight)
 
 
 def two_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.two_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.fused.two_shot_all_reduce_bias_rms_norm(
+        symm_mem_input, x, bias, rms_weight, y
+    )
     return y
 
 
 def two_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
+    kraken.fused.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.fused.rms_norm(y, rms_weight)
 
 
 def nccl_all_reduce_bias_rms_norm(x, bias, rms_weight):
     dist.all_reduce(x)
     y = x + bias
-    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
+    return kraken.fused.rms_norm(y, rms_weight)
 
 
 def create_benchmarks(b, t, d_size, device, dtype):
 
@@ -3,7 +3,6 @@
 import csv
 from dataclasses import asdict, dataclass
 import functools
-import itertools
 import os
 import sys
 
@@ -68,15 +67,10 @@ def asdict(self):
 
 def generate_experiment_configs(
     dtype: torch.dtype,
-    M: list[int],
-    N: list[int],
-    K: list[int],
+    shapes: list[tuple[int, int, int]],
     backends: list[str],
     device: torch.device,
 ) -> list[ExperimentConfig]:
-    # Generate cross config shapes from M, N, K lists
-    shapes = list(itertools.product(M, N, K))
-
     all_configs = []
     for shape in shapes:
         all_configs.append(
@@ -98,7 +92,7 @@ def get_single_backend_fn(backend: str):
     if backend == "torch_symm_mem":
         return torch_symm_mem_gemm_rs
     if backend == "triton":
-        return kraken.reduce_scatter_fusion.gemm_reduce_scatter
+        return kraken.fused.gemm_reduce_scatter
     raise NotImplementedError(backend)
 
 
@@ -181,9 +175,7 @@ def main(args):
     torch.manual_seed(42 + local_rank)
 
     results = []
-    configs = generate_experiment_configs(
-        args.dtype, args.M, args.N, args.K, args.backend, device
-    )
+    configs = generate_experiment_configs(args.dtype, args.shape, args.backend, device)
     for config in configs:
         results.append(
             Experiment(
@@ -201,7 +193,7 @@ def shape_input_type(s):
         M, N, K = map(int, s.split(","))
         return M, N, K
     except Exception as e:
-        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
+        raise argparse.ArgumentTypeError("Shape must be M, N, K") from e
 
 
 if __name__ == "__main__":
@@ -233,27 +225,15 @@ def shape_input_type(s):
     )
 
     parser.add_argument(
-        "-M",
-        type=shape_input_type,
-        nargs="+",
-        default=[2**x for x in range(7, 11)],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
-    )
-
-    parser.add_argument(
-        "-N",
+        "--shape",
         type=shape_input_type,
         nargs="+",
-        default=[6656],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
-    )
-
-    parser.add_argument(
-        "-K",
-        type=shape_input_type,
-        nargs="+",
-        default=[2**x for x in range(12, 16)],
-        help="matmul shapes: (M, N, K). (M, K) @ (K, N) -> (M, N)",
+        default=[
+            (m, 6656, k)
+            for m in [2**x for x in range(7, 11)]
+            for k in [2**x for x in range(12, 16)]
+        ],
+        help="matmul shapes: M, N, K. (M, K) @ (K, N) -> (M, N)",
     )
 
     parser.add_argument("-dtype", type=str, help="dtype", default="float32")
 
@@ -1,15 +1,13 @@
 from . import (
     _logging,
-    all_gather_fusion,
-    all_reduce,
-    all_reduce_fusion,
-    reduce_scatter_fusion,
+    _ptx_utils,
+    comm,
+    fused,
 )
 
 __all__ = [
     "_logging",
-    "all_gather_fusion",
-    "all_reduce",
-    "all_reduce_fusion",
-    "reduce_scatter_fusion",
+    "_ptx_utils",
+    "comm",
+    "fused",
 ]