Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/integration-tests-nvidia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ jobs:
run: make NUM_PROCS=24 test-unit
- name: Run gluon tests
run: make NUM_PROCS=24 test-gluon
- name: Run gsan tests
run: make NUM_PROCS=24 test-gsan
- name: Run interpreter tests
if: ${{ matrix.config.runner_type == 'nvidia-h100' }}
run: make test-interpret
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ test-gluon: all
$(PYTEST) -n $(NUM_PROCS) python/test/gluon/ python/tutorials/gluon/
$(PYTEST) -n 2 python/examples/gluon/

.PHONY: test-gsan
test-gsan: all
$(PYTEST) -n $(NUM_PROCS) python/test/gsan

.PHONY: test-regression
test-regression: all
$(PYTEST) -n $(NUM_PROCS) python/test/regression
Expand Down
146 changes: 146 additions & 0 deletions python/test/gsan/test_allocator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

import pytest
import torch

from triton._internal_testing import is_cuda
from triton.experimental.gsan import create_mem_pool
from triton.experimental.gsan._allocator import get_reserve_pointer, get_reserve_size, gsan_free, gsan_malloc
from triton.experimental.gsan._utils import shadow_tensor_for


@pytest.fixture
def _direct_allocator():
    """Yield ``(malloc, free, reserve_ptr, reserve_size)`` for driving the raw
    gsan allocator entry points directly, tracking live pointers so teardown
    can release anything a failing test leaked."""
    device = torch.cuda.current_device()
    stream = 0  # default CUDA stream
    reserve_ptr = get_reserve_pointer()
    reserve_size = get_reserve_size()
    # Pointers handed out by malloc() and not yet free()d; released in teardown.
    allocated = set()

    def malloc(size: int) -> int:
        # A return of 0 means no memory was handed out, so don't track it.
        ptr_int = gsan_malloc(size, device, stream)
        if ptr_int != 0:
            allocated.add(ptr_int)
        return ptr_int

    def free(ptr: int, size: int = 0) -> None:
        gsan_free(ptr, device, size, stream)
        if ptr in allocated:
            allocated.remove(ptr)

    try:
        yield malloc, free, reserve_ptr, reserve_size
    finally:
        # Cleanup any allocated pointers
        for ptr in list(allocated):
            gsan_free(ptr, device, 0, stream)


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_malloc_edge_cases(_direct_allocator):
    """Invalid sizes yield a null pointer, and freeing NULL is harmless."""
    malloc, free, _reserve_ptr, reserve_size = _direct_allocator

    # Zero, negative, and over-large (the whole real region) requests all fail.
    for bad_size in (0, -1, reserve_size):
        assert malloc(bad_size) == 0

    # Freeing the null pointer must be a no-op.
    free(0)


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_malloc_free(_direct_allocator):
    """First allocation starts at the real base and freed blocks are reused.

    Fix: this test was missing the ``skipif`` guard that every sibling test
    in this file carries, so it would fail on non-CUDA machines.
    """
    malloc, free, reserve_ptr, reserve_size = _direct_allocator
    # The real (non-shadow) region starts at the upper half of the reserve.
    real_base = reserve_ptr + reserve_size // 2

    # First valid allocation should come from the real base and be reusable.
    p0 = malloc(1)
    assert p0 == real_base
    free(p0)
    assert malloc(1) == p0

    # Free a block while a later allocation is still live; the hole must be
    # handed out again on the next request.
    p1 = malloc(1)
    _ = malloc(1)

    free(p1)
    p3 = malloc(1)
    assert p3 == p1


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_malloc_fragmentation_reuse_and_coalesce(_direct_allocator):
    """A freed block is reused exactly, and adjacent free blocks coalesce."""
    malloc, free, _, _ = _direct_allocator

    p0 = malloc(1)
    p1 = malloc(1)
    assert p0 != 0 and p1 != 0
    assert p0 < p1

    # Distance between two minimal allocations reveals the allocator's
    # per-block granularity.
    block = p1 - p0
    assert block > 0

    # Reuse exact freed block under fragmentation.
    free(p1)
    p1_reuse = malloc(1)
    assert p1_reuse == p1

    # Free two siblings and request a slightly larger block; should coalesce.
    free(p0)
    free(p1_reuse)
    parent = malloc(block + 1)
    assert parent == p0

    free(parent)
    torch.cuda.synchronize()


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_free_invalid_pointer_and_double_free(_direct_allocator):
    """Invalid frees and double frees are ignored; a valid free recycles."""
    malloc, free, _, _ = _direct_allocator

    ptr = malloc(1)
    assert ptr != 0

    # Neither an unknown pointer nor a repeated free may crash the allocator.
    free(ptr + 1)
    free(ptr)
    free(ptr)

    # The block released by the single valid free is handed out again.
    reused = malloc(1)
    assert reused == ptr

    free(reused)
    torch.cuda.synchronize()


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_mem_pool():
    """End-to-end: allocate through a gsan-backed MemPool and touch both the
    real tensor and its shadow view."""
    pool = create_mem_pool()
    with torch.cuda.use_mem_pool(pool):
        real = torch.empty(4096, dtype=torch.uint8, device="cuda")

    reserve_ptr = get_reserve_pointer()
    reserve_size = get_reserve_size()
    assert reserve_ptr != 0
    assert reserve_size > 0

    # Check real allocation is in higher half of reserve
    real_base = reserve_ptr + reserve_size // 2
    assert real_base <= real.data_ptr() < reserve_ptr + reserve_size

    # The shadow view must land in the lower half of the reserve.
    shadow = shadow_tensor_for(real)
    assert reserve_ptr <= shadow.data_ptr() < reserve_ptr + reserve_size // 2

    # Test that real and shadow allocation can be used
    real.zero_()
    real.add_(7)
    # Note: shadow memory is zero-initialized by the allocator
    shadow.add_(3)

    assert torch.all(real == 7).item()
    assert torch.all(shadow == 3).item()
    del pool
    del real
    del shadow
    torch.cuda.synchronize()
15 changes: 15 additions & 0 deletions python/test/gsan/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest
import torch

from triton._internal_testing import is_cuda
from triton.experimental.gsan._utils import uint8_cuda_tensor_from_ptr


@pytest.mark.skipif(not is_cuda(), reason="requires CUDA backend")
def test_uint8_cuda_tensor_from_ptr_delete_tensor():
    """The DLPack wrapper preserves pointer, shape, dtype and device, and the
    tensor can be deleted without dereferencing the (fake) underlying memory."""
    # NOTE(review): device index 1 implies a second GPU, but the skip
    # condition only checks for CUDA — confirm multi-GPU availability in CI.
    view = uint8_cuda_tensor_from_ptr(12345, 10, 1)
    assert view.data_ptr() == 12345
    assert view.shape == (10, )
    assert view.dtype == torch.uint8
    assert view.device == torch.device("cuda:1")
    del view
3 changes: 3 additions & 0 deletions python/triton/experimental/gsan/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._allocator import create_mem_pool, get_allocator

__all__ = ["create_mem_pool", "get_allocator"]
67 changes: 67 additions & 0 deletions python/triton/experimental/gsan/_allocator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

import functools
from pathlib import Path
from types import ModuleType

from triton.runtime import driver as runtime_driver
from triton.runtime.build import compile_module_from_file

# Location of the C++ allocator source, compiled on demand by _load_gsan_module.
_THIS_DIR = Path(__file__).resolve().parent
_GSAN_SOURCE_PATH = _THIS_DIR / "src" / "GSanAllocator.cc"


@functools.lru_cache()
def _load_gsan_module() -> ModuleType:
    """Compile (once) and load the GSan allocator extension module.

    Raises:
        RuntimeError: when the active Triton backend is not CUDA.
    """
    backend = runtime_driver.active.get_current_target().backend
    if backend != "cuda":
        raise RuntimeError("GSan allocator requires the CUDA backend.")

    # Imported lazily so non-CUDA installs never touch the NVIDIA backend.
    from triton.backends.nvidia.driver import library_dirs, include_dirs

    return compile_module_from_file(
        src_path=str(_GSAN_SOURCE_PATH),
        name="gsan_allocator",
        library_dirs=library_dirs(),
        include_dirs=include_dirs,
        libraries=["libcuda.so.1"],
    )


@functools.lru_cache()
def _compile_gsan_allocator() -> str:
    """Return the filesystem path of the compiled allocator shared object."""
    # __file__ of a compiled extension module is its .so path.
    module = _load_gsan_module()
    return module.__file__


@functools.lru_cache()
def get_allocator():
    """Return a cached torch ``CUDAPluggableAllocator`` backed by the gsan .so."""
    from torch.cuda.memory import CUDAPluggableAllocator

    # The exported C symbols the allocator plugs into torch with.
    return CUDAPluggableAllocator(_compile_gsan_allocator(), "gsanMalloc", "gsanFree")


def create_mem_pool():
    """Create a torch ``MemPool`` that routes allocations through gsan."""
    from torch.cuda.memory import MemPool

    allocator = get_allocator().allocator()
    return MemPool(allocator)


def gsan_malloc(size: int, device: int, stream: int = 0) -> int:
    """Allocate ``size`` bytes on ``device`` via the gsan allocator.

    Returns the device pointer as an integer; 0 means nothing was allocated.
    """
    return _load_gsan_module().malloc(size, device, stream)


def gsan_free(ptr: int, device: int, size: int = 0, stream: int = 0) -> None:
    """Release ``ptr``, previously returned by :func:`gsan_malloc`."""
    _load_gsan_module().free(ptr, device, size, stream)


def get_reserve_pointer() -> int:
    """Return the base address of gsan's reserved device region as an int."""
    return _load_gsan_module().get_reserve_pointer()


def get_reserve_size() -> int:
    """Return the total size in bytes of gsan's reserved device region."""
    return _load_gsan_module().get_reserve_size()


def get_global_state_pointer() -> int:
    """Return the address of the allocator's global state as an int.

    # NOTE(review): the layout of the global state is defined by
    # GSanAllocator.cc, which is not visible here.
    """
    return _load_gsan_module().get_global_state_pointer()
111 changes: 111 additions & 0 deletions python/triton/experimental/gsan/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from __future__ import annotations

from triton.experimental.gsan._allocator import get_reserve_pointer, get_reserve_size

import ctypes
import torch

# DLPack protocol constants (mirroring dlpack.h).
_DLPACK_CAPSULE_NAME = b"dltensor"  # capsule name torch.from_dlpack expects
_DL_UINT = 1  # DLDataTypeCode kDLUInt
_DL_BITS_UINT8 = 8  # bits per lane for uint8
_DL_LANES = 1  # scalar (non-vectorized) dtype
_DL_CUDA = 2  # DLDeviceType kDLCUDA


class _DLDevice(ctypes.Structure):
    """ctypes mirror of DLPack's ``DLDevice`` struct."""
    _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)]


class _DLDataType(ctypes.Structure):
    """ctypes mirror of DLPack's ``DLDataType`` struct (code/bits/lanes)."""
    _fields_ = [("code", ctypes.c_uint8), ("bits", ctypes.c_uint8), ("lanes", ctypes.c_uint16)]


class _DLTensor(ctypes.Structure):
    """ctypes mirror of DLPack's ``DLTensor`` struct."""
    _fields_ = [
        ("data", ctypes.c_void_p),
        ("device", _DLDevice),
        ("ndim", ctypes.c_int),
        ("dtype", _DLDataType),
        ("shape", ctypes.POINTER(ctypes.c_int64)),
        ("strides", ctypes.POINTER(ctypes.c_int64)),
        ("byte_offset", ctypes.c_uint64),
    ]


class _DLManagedTensor(ctypes.Structure):
    """ctypes mirror of DLPack's ``DLManagedTensor``.

    ``_fields_`` is assigned after the class body because the ``deleter``
    field's function-pointer type refers back to this struct.
    """
    pass


_DLManagedTensorHandle = ctypes.POINTER(_DLManagedTensor)
# Deleter signature: void (*)(DLManagedTensor*).
_DLManagedTensorDeleter = ctypes.CFUNCTYPE(None, _DLManagedTensorHandle)

# Self-referential struct: fields can only be set once the deleter type exists.
_DLManagedTensor._fields_ = [
    ("dl_tensor", _DLTensor),
    ("manager_ctx", ctypes.c_void_p),
    ("deleter", _DLManagedTensorDeleter),
]

# ctypes prototype for CPython's PyCapsule_New, used to hand torch a capsule.
PyCapsule_NewType = ctypes.CFUNCTYPE(ctypes.py_object, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p)
PyCapsule_New = PyCapsule_NewType(ctypes.pythonapi.PyCapsule_New)

# Hold ctypes-backed DLPack payloads until the tensor deleter runs.
# Keyed by the managed tensor's address; the tuple keeps the struct and its
# shape/stride arrays alive.
_DLPACK_STATE: dict[int, tuple[object, object, object]] = {}


@_DLManagedTensorDeleter
def _dl_managed_tensor_deleter(dl_managed_tensor: _DLManagedTensorHandle) -> None:
    """DLPack deleter callback: drop the keep-alive entry for this tensor."""
    if not dl_managed_tensor:
        return
    _DLPACK_STATE.pop(ctypes.addressof(dl_managed_tensor.contents), None)


def uint8_cuda_tensor_from_ptr(data_ptr: int, numel: int, device_index: int) -> torch.Tensor:
    """Wrap raw CUDA memory as a 1-D uint8 torch tensor without copying.

    Builds a DLPack managed tensor describing ``numel`` bytes at ``data_ptr``
    on CUDA device ``device_index`` and imports it via ``torch.from_dlpack``.
    The ctypes payload is parked in ``_DLPACK_STATE`` so it stays alive until
    the consumer invokes the deleter.

    NOTE(review): no ownership of the underlying memory is taken — the caller
    must keep it valid for the tensor's lifetime.

    Raises:
        ValueError: if ``numel`` is negative.
    """
    numel = int(numel)
    if numel < 0:
        raise ValueError(f"numel must be >= 0, got {numel}")

    # 1-element shape/stride arrays; stride is in elements (1 byte for uint8).
    shape = (ctypes.c_int64 * 1)(numel)
    strides = (ctypes.c_int64 * 1)(1)
    dl_managed_tensor = _DLManagedTensor()
    dl_managed_tensor.dl_tensor.data = ctypes.c_void_p(int(data_ptr))
    dl_managed_tensor.dl_tensor.device = _DLDevice(_DL_CUDA, device_index)
    dl_managed_tensor.dl_tensor.ndim = 1
    dl_managed_tensor.dl_tensor.dtype = _DLDataType(_DL_UINT, _DL_BITS_UINT8, _DL_LANES)
    dl_managed_tensor.dl_tensor.shape = ctypes.cast(shape, ctypes.POINTER(ctypes.c_int64))
    dl_managed_tensor.dl_tensor.strides = ctypes.cast(strides, ctypes.POINTER(ctypes.c_int64))
    dl_managed_tensor.dl_tensor.byte_offset = 0
    dl_managed_tensor.manager_ctx = None
    dl_managed_tensor.deleter = _dl_managed_tensor_deleter

    # Register the payload before handing out the capsule so the deleter can
    # find (and release) it by address.
    dl_managed_tensor_ptr = ctypes.addressof(dl_managed_tensor)
    _DLPACK_STATE[dl_managed_tensor_ptr] = (dl_managed_tensor, shape, strides)

    try:
        # torch.from_dlpack consumes the capsule and takes over calling the
        # deleter when the resulting tensor is destroyed.
        dlpack_capsule = PyCapsule_New(
            ctypes.c_void_p(dl_managed_tensor_ptr),
            _DLPACK_CAPSULE_NAME,
            None,
        )
        return torch.from_dlpack(dlpack_capsule)
    except Exception:
        # If torch rejects the capsule, drop the keep-alive entry ourselves.
        _DLPACK_STATE.pop(dl_managed_tensor_ptr, None)
        raise


# Shadow-mapping layout: every SHADOW_GRANULARITY_BYTES-sized word of real
# memory is described by one SHADOW_SIZE_BYTES-sized shadow record.
SHADOW_SIZE_BYTES = 24
SHADOW_GRANULARITY_BYTES = 4


def shadow_region(real_ptr: int, real_size_bytes: int, reserve_ptr: int, reserve_size: int) -> tuple[int, int]:
    """Map a real allocation to its ``(shadow_ptr, shadow_size)`` region.

    The real heap occupies the upper half of the reserve; shadow records for
    it start at ``reserve_ptr``, one record per real word.
    """
    real_base = reserve_ptr + reserve_size // 2
    word_index = (real_ptr - real_base) // SHADOW_GRANULARITY_BYTES
    # Ceil-divide the byte count into whole words; each word gets one record.
    num_words = -(-real_size_bytes // SHADOW_GRANULARITY_BYTES)
    return reserve_ptr + word_index * SHADOW_SIZE_BYTES, num_words * SHADOW_SIZE_BYTES


def shadow_tensor_for(real: torch.Tensor) -> torch.Tensor:
    """Return a uint8 CUDA view over the shadow records of ``real``'s storage."""
    base = get_reserve_pointer()
    size = get_reserve_size()
    nbytes = real.untyped_storage().nbytes()
    shadow_ptr, shadow_nbytes = shadow_region(real.data_ptr(), nbytes, base, size)
    return uint8_cuda_tensor_from_ptr(shadow_ptr, shadow_nbytes, real.device.index)
Loading
Loading