@@ -23,8 +23,9 @@ Our initial kernels are adapted from the [Symmetric Memory Recipes](https://gith
 ## 🚀 Getting Started
 ### Prerequisites
 - PyTorch (version 2.6.0 or higher)
-- Triton (version 3.3.0 or higher)
+- Triton (version 3.3.0)
 - Python (version 3.10 or higher)
+- CUDA (version 12.4 or higher). The version must match your PyTorch installation.
 
 ### Installation
 ```bash
@@ -48,8 +49,10 @@ import torch.distributed._symmetric_memory as symm_mem
 import kraken
 import os
 
-# local_rank is needed for device placement, and can be received from the environment
+# Set up the distributed process group.
 local_rank = int(os.environ["LOCAL_RANK"])
+torch.cuda.set_device(f"cuda:{local_rank}")
+dist.init_process_group("nccl")
 
 # Create and initialize a symmetric memory tensor
 # See blog: https://dev-discuss.pytorch.org/t/pytorch-symmetricmemory-harnessing-nvlink-programmability-with-ease/279 for symmetric memory details.
@@ -62,7 +65,13 @@ symm_mem.rendezvous(a_shared, group=dist.group.WORLD)
 a_shared = a_shared.normal_()
 
 # Call one_shot_all_reduce kernel from kraken.
-a = kraken.one_shot_all_reduce(a_shared)
+a = kraken.comm.one_shot_all_reduce(a_shared)
+```
+Remember to run with `torchrun`! Example command:
+```shell
+torchrun --nnodes 1 --nproc-per-node <world_size> \
+--rdzv-backend c10d --rdzv-endpoint localhost:0 --no_python \
+python3 example.py
 ```
 
 Alternatively, you can build your own custom kernels by leveraging Kraken's low-level primitives. This allows you to create highly optimized kernels tailored to your specific needs. We provide PTX implementations of low-level primitives in `kraken._ptx_utils`.
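Pieced together, the quick-start fragments in the hunks above form the `example.py` referenced by the `torchrun` command. The sketch below assembles them end to end; the comparison against `dist.all_reduce` and the final `destroy_process_group()` call are illustrative additions, not part of the diff.

```python
# example.py -- a sketch assembled from the quick-start snippets above.
# Launch with the torchrun command shown earlier.
import os

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

import kraken

# Set up the distributed process group; LOCAL_RANK is provided by torchrun.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(f"cuda:{local_rank}")
dist.init_process_group("nccl")

# Create a symmetric memory tensor, rendezvous so every rank can access it, then fill it.
a_shared = symm_mem.empty((4096, 4096), dtype=torch.bfloat16, device=f"cuda:{local_rank}")
symm_mem.rendezvous(a_shared, group=dist.group.WORLD)
a_shared = a_shared.normal_()

# Keep a plain copy for the sanity check below (an addition, not in the original example).
ref = a_shared.clone()

# One-shot all-reduce implemented as a Triton kernel.
a = kraken.comm.one_shot_all_reduce(a_shared)

# Sanity check: the Triton kernel should closely agree with NCCL's all_reduce.
dist.all_reduce(ref)
print(f"rank {local_rank}: max |triton - nccl| = {(a - ref).abs().max().item():.3e}")

dist.destroy_process_group()
```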
@@ -102,6 +111,8 @@ def custom_distributed_kernel(
 
 # Create and initialize a symmetric memory tensor
 local_rank = int(os.environ["LOCAL_RANK"])
+torch.cuda.set_device(f"cuda:{local_rank}")
+dist.init_process_group("nccl")
 a_shared = symm_mem.empty((4096, 4096), dtype=torch.bfloat16, device=f"cuda:{local_rank}")
 symm_mem_hdl = symm_mem.rendezvous(a_shared, group=dist.group.WORLD)
 
@@ -122,19 +133,22 @@ custom_distributed_kernel[grid](
 Kraken is organized for easy hacking of distributed Triton kernels:
 
 ### Example Kernels
-#### `kraken.all_gather_fusion`
-- `all_gather_matmul`
-#### `kraken.all_reduce_fusion`
-- `rms_norm`,
-- `gemm_one_shot_all_reduce_fused`
-- `one_shot_all_reduce_bias`
-- `one_shot_all_reduce_bias_rms_norm`
-- `two_shot_all_reduce_bias`
-- `two_shot_all_reduce_bias_rms_norm`
+#### `kraken.comm`
+Contains communication kernels with fine-grained synchronization.
+- `all_gather_w_progress`
 - `one_shot_all_reduce`
-#### `kraken.reduce_scatter_fusion`
-- `gemm_reduce_scatter`
-- `gemm_reduce_scatter_ce_persistent`
+- (coming soon) `two_shot_all_reduce`
+- (coming soon) `multimem_all_reduce`
+#### `kraken.fused`
+Fused communication/computation kernels.
+- All-gather + matmul: `all_gather_matmul`
+- GEMM + one-shot all-reduce: `gemm_one_shot_all_reduce_fused`
+- GEMM + reduce-scatter: `gemm_reduce_scatter`, `gemm_reduce_scatter_ce_persistent`
+- All-reduce + bias: `one_shot_all_reduce_bias`, `two_shot_all_reduce_bias`
+- All-reduce + bias + RMSNorm: `one_shot_all_reduce_bias_rms_norm`, `two_shot_all_reduce_bias_rms_norm`
+
+#### `kraken.quantized`
+(coming soon) Fused communication/computation kernels with quantization.
 
 
 ### Inline PTX Utils
@@ -146,10 +160,9 @@ Kraken is organized for easy hacking of distributed Triton kernels:
 Kraken includes a set of benchmarks in `benchmarks/` to evaluate the performance of its kernels. You can run them as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc-per-node 8 \
+torchrun --nnodes 1 --nproc-per-node <world_size> \
 --rdzv-backend c10d --rdzv-endpoint localhost:0 --no_python python3 \
-benchmark/benchmark_all_reduce.py \
---backend nccl,triton_1shot,dist_1shot
+benchmark/benchmark_all_reduce.py
 # ... and so on for other benchmarks
 ```
 
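Concretely, on a single node with 8 GPUs (the value the previous version of the command hard-coded), the invocation would be:

```bash
# Single node, 8 GPUs; set --nproc-per-node to the number of GPUs available.
torchrun --nnodes 1 --nproc-per-node 8 \
--rdzv-backend c10d --rdzv-endpoint localhost:0 --no_python python3 \
benchmark/benchmark_all_reduce.py
```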