From e8216c482ec17bd7a7f871639a5ad2f2376fb7a3 Mon Sep 17 00:00:00 2001
From: Ayaka
Date: Fri, 20 Sep 2024 07:30:54 -0700
Subject: [PATCH] [Pallas GPU] Enable Pallas `OpsExtraTest` in 64-bit mode

This is a follow-up to https://github.com/jax-ml/jax/pull/23747, which
enables Pallas `OpsTest` in 64-bit mode. In order to enable Pallas
`OpsExtraTest` in 64-bit mode, some of the test code needs to be
modified. There are three kinds of modifications:

1. Most of the modifications simply change `jnp.int32` to `intx` and
   `jnp.float32` to `floatx`, using the same approach as the previous
   PR https://github.com/jax-ml/jax/pull/23747. `intx` and `floatx`
   are conventions used in Pallas tests to refer to 64-bit types in
   64-bit mode and their 32-bit counterparts in 32-bit mode.

2. For the test `test_array_reduce`, the original code uses a simple
   approach to determine `out_dtype` from `dtype`, which no longer
   works in 64-bit mode. Therefore, I modified the code to deduce
   `out_dtype` by executing the operation on a single element first.

3. For the test `test_masked_load_store`, the `idx` variable is
   expected to be an `int32` array, calculated from `pl.program_id()`
   and `block_size`. In 64-bit mode, the computation produces an
   `int64` array instead. Since `pl.program_id()` always returns an
   `int32` result, I modified the computation to produce an `int32`
   result.

Brief sketches illustrating these three points appear after the patch.

I also modified the `pl.program_id()` docstring to document that
`pl.program_id()` always returns an `int32` result.

PiperOrigin-RevId: 676838304
---
 jax/_src/pallas/primitives.py |  2 +
 tests/pallas/ops_test.py      | 69 ++++++++++++++++++++++-------------
 2 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/jax/_src/pallas/primitives.py b/jax/_src/pallas/primitives.py
index 89b6c6e14acd..40caae76bd8f 100644
--- a/jax/_src/pallas/primitives.py
+++ b/jax/_src/pallas/primitives.py
@@ -59,6 +59,8 @@ def program_id(axis: int) -> jax.Array:
   grid coordinates `(1, 2)`, `program_id(axis=0)` returns `1` and
   `program_id(axis=1)` returns `2`.
 
+  The returned value is an array of shape `()` and dtype `int32`.
+
   Args:
     axis: the axis of the grid along which to count the program.
   """
diff --git a/tests/pallas/ops_test.py b/tests/pallas/ops_test.py
index d8f890c06c32..65cdde30f7d9 100644
--- a/tests/pallas/ops_test.py
+++ b/tests/pallas/ops_test.py
@@ -752,8 +752,6 @@ class OpsExtraTest(PallasBaseTest):
 
   def setUp(self):
     super().setUp()
-    if jax.config.x64_enabled:
-      self.skipTest("Only works in 32-bit")
     if jtu.test_device_matches(["tpu"]) and not self.INTERPRET:
       # TODO: most tests fail on TPU in non-interpret mode
       self.skipTest("On TPU the test works only in interpret mode")
@@ -800,7 +798,7 @@ def kernel(x_ref, o_ref):
   def test_abs_weak_type(self):
     # see https://github.com/jax-ml/jax/issues/23191
     @functools.partial(
-        self.pallas_call, out_shape=jax.ShapeDtypeStruct((4, 4), jnp.float32),
+        self.pallas_call, out_shape=jax.ShapeDtypeStruct((4, 4), floatx),
     )
     def kernel(x_ref, o_ref):
       o_ref[...] = jnp.abs(x_ref[...])
@@ -1145,20 +1143,20 @@ def f(x_ref, o_ref):
   def test_num_programs(self):
     @functools.partial(
         self.pallas_call,
-        out_shape=jax.ShapeDtypeStruct((4,), jnp.int32),
+        out_shape=jax.ShapeDtypeStruct((4,), intx),
         grid=4,
     )
     def kernel(o_ref):
       o_ref[pl.program_id(0)] = pl.num_programs(0)
 
     np.testing.assert_array_equal(
-        kernel(), np.asarray([4, 4, 4, 4], dtype=np.int32)
+        kernel(), jnp.array([4, 4, 4, 4], dtype=intx)
     )
 
   def test_where_broadcasting(self):
     @functools.partial(
         self.pallas_call,
-        out_shape=jax.ShapeDtypeStruct((4, 2, 2), jnp.float32),
+        out_shape=jax.ShapeDtypeStruct((4, 2, 2), floatx),
         grid=1,
     )
     def copyitem(x_ref, in_idx_ref, out_idx_ref, o_ref):
@@ -1225,11 +1223,12 @@ def dot(x_ref, y_ref, o_ref):
   def test_masked_load_store(self, size, block_size):
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((size,), jnp.float32)),
+        out_shape=(jax.ShapeDtypeStruct((size,), floatx)),
         grid=pl.cdiv(size, block_size),
     )
     def kernel(x_ref, o_ref):
-      idx = pl.program_id(0) * block_size + jnp.arange(block_size)
+      idx = pl.program_id(0) * block_size + jnp.arange(
+          block_size, dtype=jnp.int32)
       mask = idx < x_ref.shape[0]
       x = pl.load(x_ref, (idx,), mask=mask)
       pl.store(o_ref, (idx,), x + 1.0, mask=mask)
@@ -1243,7 +1242,7 @@ def test_masked_oob_load_store_slice(self):
 
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((n,), jnp.float32)),
+        out_shape=(jax.ShapeDtypeStruct((n,), floatx)),
         grid=1,
     )
     def masked_oob_load_store_slice(x_ref, mask_ref, start_idx_ref, o_ref):
@@ -1276,7 +1275,7 @@ def test_broadcasted_load_store(self):
 
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((m, n), jnp.float32)),
+        out_shape=(jax.ShapeDtypeStruct((m, n), floatx)),
         grid=1,
     )
     def load(x_ref, o_ref):
@@ -1319,7 +1318,7 @@ def test_swap(self):
 
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((m, n), jnp.float32),) * 2,
+        out_shape=(jax.ShapeDtypeStruct((m, n), floatx),) * 2,
         grid=1,
         input_output_aliases={0: 0, 1: 1},
     )
@@ -1339,7 +1338,7 @@ def test_masked_swap(self):
 
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((m, n), jnp.float32),) * 2,
+        out_shape=(jax.ShapeDtypeStruct((m, n), floatx),) * 2,
         grid=1,
         input_output_aliases={0: 0, 1: 1},
     )
@@ -1360,8 +1359,8 @@ def test_masked_oob_swap_slice(self):
 
     @functools.partial(
         self.pallas_call,
-        out_shape=(jax.ShapeDtypeStruct((n,), jnp.float32),
-                   jax.ShapeDtypeStruct((m,), jnp.float32)),
+        out_shape=(jax.ShapeDtypeStruct((n,), floatx),
+                   jax.ShapeDtypeStruct((m,), floatx)),
         grid=1,
         input_output_aliases={0: 0, 1: 1},
     )
@@ -1430,7 +1429,7 @@ def test_array_atomic_add(self, axis):
       grid = m
     else:
       grid = n
-    out_shape = jax.ShapeDtypeStruct((n if axis == 0 else m,), jnp.float32)
+    out_shape = jax.ShapeDtypeStruct((n if axis == 0 else m,), floatx)
 
     @functools.partial(
         self.pallas_call,
@@ -1464,8 +1463,8 @@ def reduce(x_ref, _, y_ref):
   def test_atomic_cas(self, init_value, cmp, new_value):
     @functools.partial(
         self.pallas_call, out_shape=(
-            jax.ShapeDtypeStruct((), jnp.int32),
-            jax.ShapeDtypeStruct((), jnp.int32)),
+            jax.ShapeDtypeStruct((), intx),
+            jax.ShapeDtypeStruct((), intx)),
         input_output_aliases={0: 0})
     def swap(_, lock_ref, out_ref):
       out_ref[()] = pl.atomic_cas(lock_ref, cmp, new_value)
@@ -1528,14 +1527,31 @@ def reduce(x_ref, y_ref):
           ("argmin", jnp.argmin),
       ]
       for axis in [0, 1, (1,), (0, 1)]
-      for dtype in ["float16", "float32", "int32", "uint32"]
+      for dtype in [
+          "float16",
+          "float32",
+          "float64",
+          "int32",
+          "int64",
"uint32", + "uint64", + ] if isinstance(axis, int) or "arg" not in op_name ]) def test_array_reduce(self, op, dtype, axis): m, n = 32, 8 - out_dtype = dtype - if op in {jnp.argmin, jnp.argmax}: - out_dtype = jnp.int32 + + if not jax.config.x64_enabled and dtype in ("float64", "int64", "uint64"): + self.skipTest("64-bit types require x64_enabled") + + # Skip argmin/argmax on GPU in 64-bit mode because Pallas expects + # `index_type` to be i32 + if ( + jax.config.x64_enabled + and jtu.test_device_matches(["gpu"]) + and op in {jnp.argmin, jnp.argmax} + ): + self.skipTest("Not supported on GPU in 64-bit mode") def make_x(key): if jnp.issubdtype(dtype, jnp.integer): @@ -1545,9 +1561,10 @@ def make_x(key): else: return random.normal(key, (m, n), dtype=dtype) + # deduct `out_dtype` by executing the op on a single element + out_dtype = op(jnp.arange(1, dtype=dtype)).dtype out_shape = jax.ShapeDtypeStruct( - op(make_x(random.key(0)), axis=axis).shape, out_dtype - ) + op(make_x(random.key(0)), axis=axis).shape, out_dtype) if isinstance(axis, int): grid = tuple(a for i, a in enumerate((m, n)) if i != axis) else: @@ -1555,9 +1572,11 @@ def make_x(key): @functools.partial(self.pallas_call, out_shape=out_shape, grid=grid) def reduce(x_ref, y_ref): - x = pl.load(x_ref, (jnp.arange(m)[:, None], jnp.arange(n)[None])) + x = pl.load(x_ref, (jnp.arange(m, dtype=jnp.int32)[:, None], + jnp.arange(n, dtype=jnp.int32)[None])) y = op(x, axis=axis) - pl.store(y_ref, tuple(jnp.arange(d) for d in y.shape), y) + pl.store(y_ref, + tuple(jnp.arange(d, dtype=jnp.int32) for d in y.shape), y) for i, key in enumerate(random.split(random.key(0), 20)): x = make_x(key)