meta-pytorch
diff --git a/‎README.md‎
Lines changed: 58 additions & 2 deletions b/‎README.md‎
Lines changed: 58 additions & 2 deletions
diff --git a/‎benchmark/benchmark_all_gather_matmul.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmark/benchmark_all_gather_matmul.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmark_all_reduce.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmark/benchmark_all_reduce.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmark_all_reduce_bias.py‎
Lines changed: 3 additions & 6 deletions b/‎benchmark/benchmark_all_reduce_bias.py‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎benchmark/benchmark_all_reduce_bias_rms_norm.py‎
Lines changed: 8 additions & 14 deletions b/‎benchmark/benchmark_all_reduce_bias_rms_norm.py‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎benchmark/benchmark_matmul_reduce_scatter.py‎
Lines changed: 13 additions & 9 deletions b/‎benchmark/benchmark_matmul_reduce_scatter.py‎
Lines changed: 13 additions & 9 deletions
diff --git a/‎kraken/__init__.py‎
Lines changed: 14 additions & 2 deletions b/‎kraken/__init__.py‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎kraken/_ptx_utils/__init__.py‎
Lines changed: 6 additions & 2 deletions b/‎kraken/_ptx_utils/__init__.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎kraken/all_gather/__init__.py‎ renamed to ‎kraken/all_gather_fusion/__init__.py‎ b/‎kraken/all_gather/__init__.py‎ renamed to ‎kraken/all_gather_fusion/__init__.py‎
diff --git a/‎kraken/all_gather/all_gather_matmul.py‎ renamed to ‎kraken/all_gather_fusion/all_gather_matmul.py‎ b/‎kraken/all_gather/all_gather_matmul.py‎ renamed to ‎kraken/all_gather_fusion/all_gather_matmul.py‎
@@ -46,6 +46,10 @@ import torch
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem
 import kraken
+import os
+
+# local_rank is needed for device placement; read it from the LOCAL_RANK environment variable
+local_rank = int(os.environ["LOCAL_RANK"])
 
 # Create and initialize a symmetric memory tensor
 # See blog: https://dev-discuss.pytorch.org/t/pytorch-symmetricmemory-harnessing-nvlink-programmability-with-ease/279 for symmetric memory details. 
@@ -54,13 +58,65 @@ a_shared = symm_mem.empty(
         dtype=torch.bfloat16, 
         device=f"cuda:{local_rank}",
     )
-symm_mem.rendezvous(a_shared, dist.group.WORLD.group_name)
+symm_mem.rendezvous(a_shared, group=dist.group.WORLD)
 a_shared = a_shared.normal_()
 
 # Call one_shot_all_reduce kernel from kraken. 
 a = kraken.one_shot_all_reduce(a_shared)
 ```
 
+Alternatively, you can build your own custom kernels by leveraging Kraken's low-level primitives. This allows you to create highly optimized kernels tailored to your specific needs. We provide PTX implementations of low-level primitives in `kraken._ptx_utils`.
+
+Here's an example of how to use `kraken._ptx_utils.symm_mem_sync` to synchronize blocks with matching `block_id` across participating devices in a custom kernel. This is often necessary before and after accessing symmetric memory tensors.
+
+```python
+import torch
+import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
+
+import triton
+import triton.language as tl
+
+import kraken
+import os
+
+@triton.jit
+def custom_distributed_kernel(
+    a_shared_ptrs,
+    a_signal_pad_ptrs,
+    rank: tl.constexpr,
+    world_size: tl.constexpr,
+):
+    # Synchronizes blocks with matching block_id across participating devices.
+    # Ensures that all writes to a_shared from previous kernels across all devices
+    #  are visible to the current kernel:
+    kraken._ptx_utils.symm_mem_sync(
+        a_signal_pad_ptrs,
+        None,
+        rank,
+        world_size,
+        hasPreviousMemAccess=False,
+        hasSubsequentMemAccess=True,
+    )
+    ...  # access a_shared via a_shared_ptrs.
+
+# Create and initialize a symmetric memory tensor
+local_rank = int(os.environ["LOCAL_RANK"])
+a_shared = symm_mem.empty((4096, 4096), dtype=torch.bfloat16, device=f"cuda:{local_rank}")
+symm_mem_hdl = symm_mem.rendezvous(a_shared, group=dist.group.WORLD)
+
+# Define the grid for kernel launch. For simplicity, we use a single thread block.
+grid = (1,)
+
+# Call custom kernel
+custom_distributed_kernel[grid](
+    symm_mem_hdl.buffer_ptrs_dev,
+    symm_mem_hdl.signal_pad_ptrs_dev,
+    rank=symm_mem_hdl.rank,
+    world_size=symm_mem_hdl.world_size,
+)
+```
+
 
 ## 📁 Structure
 Kraken is organized for easy hacking of distributed Triton kernels:
@@ -82,7 +138,7 @@ Kraken is organized for easy hacking of distributed Triton kernel:
 
 
 ### Inline PTX Utils
-
+`kraken._ptx_utils` provides inline PTX implementations of memory barrier synchronizations that are not natively supported by Triton.
 
 
 
 
@@ -93,7 +93,7 @@ def get_single_backend_fn(backend: str):
     if backend == "torch_symm_mem":
         return torch_symm_mem_ag_mm
     if backend == "triton":
-        return kraken.all_gather.all_gather_matmul
+        return kraken.all_gather_fusion.all_gather_matmul
     raise NotImplementedError(backend)
 
 
 
@@ -114,7 +114,7 @@ def get_single_backend_fn(backend: str):
     if backend == "dist_2shot":
         return symm_mem_two_shot_all_reduce
     if backend == "triton_1shot":
-        return kraken.all_reduce.one_shot_all_reduce
+        return kraken.all_reduce_fusion.one_shot_all_reduce
     if backend == "nccl":
         return nccl_ring
     raise NotImplementedError(backend)
 
@@ -7,26 +7,23 @@
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem
 
+import kraken
 from kraken import _logging as log
-from kraken.all_reduce_fusion import (
-    one_shot_all_reduce_bias,
-    two_shot_all_reduce_bias,
-)
 
 
 def one_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y
 
 
 def two_shot_all_reduce_bias(
     x: torch.Tensor, bias: torch.Tensor, symm_mem_input: torch.Tensor
 ) -> torch.Tensor:
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
     return y
 
 
 
@@ -7,44 +7,38 @@
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem
 
+import kraken
 from kraken import _logging as log
-from kraken.all_reduce_fusion import (
-    rms_norm,
-    one_shot_all_reduce_bias,
-    one_shot_all_reduce_bias_rms_norm,
-    two_shot_all_reduce_bias,
-    two_shot_all_reduce_bias_rms_norm,
-)
 
 
 def one_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
     return y
 
 
 def one_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return rms_norm(y, rms_weight)
+    kraken.all_reduce_fusion.one_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
 
 
 def two_shot_all_reduce_bias_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias_rms_norm(symm_mem_input, x, bias, rms_weight, y)
     return y
 
 
 def two_shot_all_reduce_bias_with_rms_norm(x, bias, rms_weight, symm_mem_input):
     y = torch.empty_like(x)
-    two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
-    return rms_norm(y, rms_weight)
+    kraken.all_reduce_fusion.two_shot_all_reduce_bias(symm_mem_input, x, bias, y)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
 
 
 def nccl_all_reduce_bias_rms_norm(x, bias, rms_weight):
     dist.all_reduce(x)
     y = x + bias
-    return rms_norm(y, rms_weight)
+    return kraken.all_reduce_fusion.rms_norm(y, rms_weight)
 
 
 def create_benchmarks(b, t, d_size, device, dtype):
 
@@ -1,16 +1,16 @@
 import argparse
+from collections import defaultdict
 import csv
+from dataclasses import asdict, dataclass
 import functools
 import itertools
 import os
 import sys
-from collections import defaultdict
-from dataclasses import asdict, dataclass
 
+from tabulate import tabulate
 import torch
 import torch.distributed as dist
 import torch.distributed._symmetric_memory as symm_mem
-from tabulate import tabulate
 
 # Add the parent of this script's directory to the Python path so that `kraken` can be imported
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
@@ -20,18 +20,22 @@
 
 
 def torch_symm_mem_gemm_rs(a, b):
-    gemm_rs_output = torch.ops.symm_mem.fused_matmul_reduce_scatter(
+    return torch.ops.symm_mem.fused_matmul_reduce_scatter(
         a, b, "sum", scatter_dim=0, group_name=dist.group.WORLD.group_name
     )
-    return gemm_rs_output
+
 
 def nccl_mem_gemm_rs(a, b):
-    from torch.distributed._functional_collectives import reduce_scatter_tensor, wait_tensor
+    from torch.distributed._functional_collectives import (
+        reduce_scatter_tensor,
+        wait_tensor,
+    )
 
     gemm_output = torch.matmul(a, b)
-    rs_o = reduce_scatter_tensor(gemm_output, "sum", scatter_dim=0, group=dist.group.WORLD)
-    gemm_rs_output = wait_tensor(rs_o)
-    return gemm_rs_output
+    rs_o = reduce_scatter_tensor(
+        gemm_output, "sum", scatter_dim=0, group=dist.group.WORLD
+    )
+    return wait_tensor(rs_o)
 
 
 @dataclass(frozen=True)
 
@@ -1,3 +1,15 @@
-from . import _logging, all_gather, all_reduce, all_reduce_fusion, reduce_scatter_fusion
+from . import (
+    _logging,
+    all_gather_fusion,
+    all_reduce,
+    all_reduce_fusion,
+    reduce_scatter_fusion,
+)
 
-__all__ = ["_logging", "all_gather", "all_reduce", "all_reduce_fusion", "reduce_scatter_fusion"]
+__all__ = [
+    "_logging",
+    "all_gather_fusion",
+    "all_reduce",
+    "all_reduce_fusion",
+    "reduce_scatter_fusion",
+]
@@ -1,15 +1,19 @@
 from .gmem_barrier_arrive_wait import arrive_gmem_barrier, wait_gmem_barrier
 from .symm_mem_barrier import (
     _get_flat_tid as get_flat_tid,
+)
+from .symm_mem_barrier import (
     _send_signal as send_signal,
+)
+from .symm_mem_barrier import (
     symm_mem_sync as symm_mem_sync,
 )
 
 __all__ = [
     "arrive_gmem_barrier",
-    "symm_mem_sync",
-    "wait_gmem_barrier",
     "get_flat_tid",
     "send_signal",
+    "symm_mem_sync",
+    "wait_gmem_barrier",
 ]
 # Avoid ptx_utils when possible