ROCm · brunomazzottiamd · Apr 28, 2026 · Apr 16, 2026
diff --git a/op_tests/triton_tests/attention/test_fav3_sage.py b/op_tests/triton_tests/attention/test_fav3_sage.py
@@ -185,6 +185,7 @@ def input_helper(
         k_shape = (BATCH, N_CTX_K, HK, D_HEAD)
         v_shape = (BATCH, N_CTX_K, HK, D_HEAD_V)
 
+    torch.manual_seed(20)
     q = torch.randn(q_shape, device="cuda", dtype=dtype)
     k = torch.randn(k_shape, device="cuda", dtype=dtype)
     v = torch.randn(v_shape, device="cuda", dtype=dtype)
@@ -214,6 +215,8 @@ def test_sage(
     dtype=torch.bfloat16,
 ):
     HEAD_SZ = 128
+
+    torch.manual_seed(20)
     torch.cuda.empty_cache()
 
     softmax_scale = 1.0 / math.sqrt(HEAD_SZ)

diff --git a/op_tests/triton_tests/attention/test_fp8_mqa_logits.py b/op_tests/triton_tests/attention/test_fp8_mqa_logits.py
@@ -96,9 +96,9 @@ def test_fp8_mqa_logits(
     head_dim: int,
     disable_cp: bool,
 ) -> None:
-    torch.manual_seed(0)
     if s_q > s_k:
         pytest.skip()
+    torch.manual_seed(0)
     q = torch.randn(s_q, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)
     kv = torch.randn(s_k, head_dim, device="cuda", dtype=torch.bfloat16)
     kv_fp8, scales = per_custom_dims_cast_to_fp8(kv, (0,), False)

diff --git a/op_tests/triton_tests/attention/test_la_paged.py b/op_tests/triton_tests/attention/test_la_paged.py
@@ -57,6 +57,7 @@ def test_persistent_lean_attention(
     torch.cuda.empty_cache()  # Helps avoid hangs in large tests
 
     torch.manual_seed(20)
+    random.seed(20)
     # Long seqlen (>512K) can hit memory access fault. Suspect compiler issue
     # WA with shorter d and longer BLOCK_N
     if any(item > 524288 for item in n_ctx):

diff --git a/op_tests/triton_tests/attention/test_mha.py b/op_tests/triton_tests/attention/test_mha.py
@@ -615,8 +615,6 @@ def test_mha_backward(
     dtype=torch.float16,
 ):
     HAS_DROPOUT = DROPOUT > 0.0
-    torch.cuda.empty_cache()
-    torch.manual_seed(20)
 
     if FP8 and not _supports_fp8:
         pytest.skip(f"FP8 not supported on {arch}")
@@ -629,7 +627,10 @@ def test_mha_backward(
     if FP8 and CAUSAL:
         pytest.skip("FP8+CAUSAL results in random precision errors")
 
+    torch.cuda.empty_cache()
+    torch.manual_seed(20)
     mha_set_use_fused_bwd_kernel(FUSED)
+
     q = torch.randn(BATCH, SEQLEN_Q, NUM_Q_HEADS, HEAD_SZ, device="cuda", dtype=dtype)
     k = torch.randn(BATCH, SEQLEN_K, NUM_K_HEADS, HEAD_SZ, device="cuda", dtype=dtype)
     v = torch.randn(BATCH, SEQLEN_K, NUM_K_HEADS, HEAD_SZ, device="cuda", dtype=dtype)
@@ -778,8 +779,6 @@ def test_mha_backward_varlen(
     HEAD_SZ = 128
     NUM_K_HEADS = 8
     HAS_DROPOUT = DROPOUT > 0.0
-    torch.cuda.empty_cache()
-    torch.manual_seed(20)
 
     if FP8 and not _supports_fp8:
         pytest.skip(f"FP8 not supported on {arch}")
@@ -790,7 +789,10 @@ def test_mha_backward_varlen(
     if CAUSAL and HAS_DROPOUT:
         pytest.skip("CAUSAL+DROPOUT backward results in NaNs")
 
+    torch.cuda.empty_cache()
+    torch.manual_seed(20)
     mha_set_use_fused_bwd_kernel(FUSED)
+
     q = torch.randn(BATCH, SEQLEN_Q, NUM_Q_HEADS, HEAD_SZ, device="cuda", dtype=dtype)
     k = torch.randn(BATCH, SEQLEN_K, NUM_K_HEADS, HEAD_SZ, device="cuda", dtype=dtype)
     v = torch.randn(BATCH, SEQLEN_K, NUM_K_HEADS, HEAD_SZ, device="cuda", dtype=dtype)

diff --git a/op_tests/triton_tests/attention/test_pa_decode.py b/op_tests/triton_tests/attention/test_pa_decode.py
@@ -77,6 +77,7 @@ def input_helper(
     random_seed: int = 0,
 ):
     """Helper function to generate input tensors for paged attention testing."""
+    torch.manual_seed(random_seed)
     torch.cuda.manual_seed(random_seed)
     random.seed(random_seed)
 
@@ -188,10 +189,13 @@ def test_paged_attn(
 ):
 
     head_size = 128
-    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
+
     if SEQ_LEN >= 8192 and B >= 16:
         pytest.skip("B>={4} and SEQ_LEN>={8192} tests are too slow")
+
+    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
     torch.set_printoptions(threshold=100000)
+
     num_blocks = NUM_BLK
 
     (
@@ -277,14 +281,15 @@ def test_paged_attn_per_token_quant(
     compute_type,
     output_type,
 ):
-    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
-    torch.set_printoptions(precision=5, threshold=10000)
     if D == 128 and KV_BLK_SZ == 512:  # Causes Shared Memory out of resources on Mi300
         pytest.skip("D={128} and KV_BLK_SZ={512} causes shared memory out of resources")
 
     if SEQ_LEN >= 8192 and B >= 16:
         pytest.skip("B>={4} and SEQ_LEN>={8192} tests are too slow")
 
+    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
+    torch.set_printoptions(precision=5, threshold=10000)
+
     num_blocks = NUM_BLK
 
     (

diff --git a/op_tests/triton_tests/fusions/test_fused_bmm_rope_kv_cache.py b/op_tests/triton_tests/fusions/test_fused_bmm_rope_kv_cache.py
@@ -54,6 +54,7 @@ def test_fused_fp4_bmm_rope_cat_and_cache_mla(
     if not arch_info.is_fp4_avail():
         pytest.skip("MXFP4 is not available on this device")
 
+    torch.manual_seed(0)
     _, w_k, _, w_k_scale, _ = generate_batched_gemm_a16wfp4_inputs(
         QH_per_KH * KH, T, D_lora, D_q_nope, dtype, layout="TN", output=False
     )
@@ -220,7 +221,7 @@ def test_fused_fp8_bmm_rope_cat_and_cache_mla(
         pytest.skip("MXFP8 is not available on this device")
 
     QH = QH_per_KH * KH
-
+    torch.manual_seed(0)
     q_nope, w_k, w_k_scale, _, _ = generate_batched_gemm_a16w8_inputs(
         QH,
         T,

diff --git a/op_tests/triton_tests/fusions/test_fused_kv_cache.py b/op_tests/triton_tests/fusions/test_fused_kv_cache.py
@@ -35,6 +35,7 @@ def test_fused_qk_rope_cat_and_cache_mla(
     cache_dtype: bool,
     dtype: torch.dtype,
 ):
+    torch.manual_seed(0)
     pos = True
     _, _, _, _, freqs, positions, offsets, cos, sin = generate_rope_inputs(
         1,
@@ -207,6 +208,7 @@ def test_fused_qk_rope_reshape_and_cache(
     offs: bool,
     dtype: torch.dtype,
 ):
+    torch.manual_seed(0)
     pos = True
     q, k, _, _, freqs, positions, offsets, cos, sin = generate_rope_inputs(
         1,
@@ -231,6 +233,7 @@ def test_fused_qk_rope_reshape_and_cache(
         else:
             cache_dtype_actual = torch.float8_e4m3fnuz
             pytest.skip("Skipping FP8 dtype cases non-gfx950")
+    torch.manual_seed(0)
 
     if cache_flash:
         key_cache = torch.zeros(
@@ -441,6 +444,7 @@ def test_fused_qk_rope_reshape_and_cache_value_shuffle_layout(
     """Test fused_qk_rope_reshape_and_cache with value_cache in shuffle layout
     [num_blocks, num_kv_heads, block_size // x, head_size, x].
     """
+    torch.manual_seed(0)
     assert D % x_size == 0
     pos = True
     offs = False
@@ -584,6 +588,7 @@ def test_fused_qk_rope_reshape_and_cache_gpt_oss_120b_config_value_shuffle_preci
     pos = True
     offs = False
 
+    torch.manual_seed(0)
     q, k, _, _, freqs, positions, offsets, cos, sin = generate_rope_inputs(
         1,
         T,
@@ -731,6 +736,7 @@ def test_fused_qk_rope_cosine_cache_llama(
     offs: bool,
     dtype: torch.dtype,
 ):
+    torch.manual_seed(0)
     pos = True
     q, k, _, _, freqs, positions, offsets, cos, sin = generate_rope_inputs(
         1,
@@ -764,6 +770,7 @@ def test_fused_qk_rope_cosine_cache_llama(
         )
     else:
         pytest.skip()
+    torch.manual_seed(0)
 
     if cache_dtype == torch.uint8:
         k_scale = torch.randn(

diff --git a/op_tests/triton_tests/fusions/test_fused_qk_concat.py b/op_tests/triton_tests/fusions/test_fused_qk_concat.py
@@ -5,6 +5,7 @@
 
 
 def generate_qk_inputs(B: int, QH_PER_KH: int, KH: int, D_nope: int, D_pe: int, dtype):
+    torch.manual_seed(0)
     q_nope = torch.randn((B, QH_PER_KH * KH, D_nope), dtype=dtype, device="cuda")
     q_pe = torch.randn((B, QH_PER_KH * KH, D_pe), dtype=dtype, device="cuda")
     k_nope = torch.randn((B, KH, D_nope), dtype=dtype, device="cuda")

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py b/op_tests/triton_tests/gemm/basic/test_gemm_a16w16.py
@@ -10,6 +10,7 @@
 
 
 def generate_gemm_a16w16_inputs(M, N, K, dtype, layout="TN", output=True, bias=False):
+    torch.manual_seed(0)
     if isinstance(dtype, str):
         dtype = str_to_torch_dtype[dtype]
 

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a16w16_gated.py b/op_tests/triton_tests/gemm/basic/test_gemm_a16w16_gated.py
@@ -9,6 +9,7 @@
 
 
 def generate_gemm_a16w16_gated_inputs(M, N, K, dtype, layout="TN", output=True):
+    torch.manual_seed(0)
     if isinstance(dtype, str):
         dtype = str_to_torch_dtype[dtype]
 

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a16w8_blockscale.py
@@ -72,6 +72,7 @@ def generate_gemm_a16w8_blockscale_inputs(
     - x: (M, K) -> row-major format
     - w: (N, K) -> column-major format
     """
+    torch.manual_seed(0)
     scale_n = (N + block_shape_n - 1) // block_shape_n
     scale_k = (K + block_shape_k - 1) // block_shape_k
 

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8.py
@@ -97,6 +97,7 @@ def generate_gemm_a8w8_inputs(
     - x: (M, K) -> row-major format
     - w: (N, K) -> column-major format
     """
+    torch.manual_seed(0)
     if layout[0] == "T":
         # T (transposed) in Fortran notation equals row-major
         x = torch.randn((M, K), dtype=torch.float32, device="cuda")

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py
@@ -70,6 +70,7 @@ def generate_gemm_a8w8_blockscale_inputs(
     - x: (M, K) -> row-major format
     - w: (N, K) -> column-major format
     """
+    torch.manual_seed(0)
     scale_n = (N + block_shape_n - 1) // block_shape_n
     scale_k = (K + block_shape_k - 1) // block_shape_k
 

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py
@@ -43,6 +43,7 @@ def generate_gemm_a8w8_per_token_scale_inputs(
     layout: str = "TN",
     output=False,
 ):
+    torch.manual_seed(0)
 
     if layout[0] == "T":
         x = (torch.rand((M, K), dtype=torch.float16, device="cuda") / 10).to(e4m3_type)

diff --git a/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py b/op_tests/triton_tests/gemm/basic/test_gemm_a8wfp4.py
@@ -343,12 +343,13 @@ def test_gemm_a8wfp4(M: int, N: int, K: int, CLEAR_GPUS=True):
     a_dtype = e4m3_type
     layout = "TN"  # Kernel will occasionally crash for layouts other than TN.
     out_dtype = torch.bfloat16
-    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
 
-    torch.manual_seed(42)  # for reproducibility
     if not (arch_info.is_fp4_avail()):
         pytest.skip("MXFP4 not supported on this architecture")
 
+    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
+    torch.manual_seed(42)  # for reproducibility
+
     # clean up to avoid hangs in large tests
     if CLEAR_GPUS:
         torch.cuda.empty_cache()

diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8.py
@@ -26,6 +26,7 @@ def generate_batched_gemm_a8w8_inputs(
         - x_scale: shape (B, M, 1)
         - w_scale: shape (B, 1, N)
     """
+    torch.manual_seed(0)
     if isinstance(dtype, str):
         dtype = str_to_torch_dtype[dtype]
     if layout[0] == "T":

diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant.py
@@ -31,6 +31,7 @@ def generate_batched_gemm_a16w8_inputs(
         - x_scale: shape (B, M, 1)
         - w_scale: shape (B, 1, N)
     """
+    torch.manual_seed(0)
     if isinstance(dtype, str):
         dtype = str_to_torch_dtype[dtype]
     if layout[0] == "T":

diff --git a/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py b/op_tests/triton_tests/gemm/batched/test_batched_gemm_bf16.py
@@ -19,6 +19,7 @@ def generate_batched_gemm_a16w16_inputs(
     output: bool,
     layout: str = "TN",
 ):
+    torch.manual_seed(0)
     if isinstance(dtype, str):
         dtype = str_to_torch_dtype[dtype]
     if layout[0] == "T":

diff --git a/op_tests/triton_tests/gemm/feed_forward/test_ff_a16w16.py b/op_tests/triton_tests/gemm/feed_forward/test_ff_a16w16.py
@@ -18,6 +18,7 @@
 def test_ff_a16w16_ungated(
     batch: int, hidden_dim: int, intermediate_dim: int, dtype, output, activation
 ):
+    torch.manual_seed(0)
     ff_ungated_test(
         ff_a16w16_nogate,
         batch=batch,
@@ -37,6 +38,7 @@ def test_ff_a16w16_ungated(
 def test_ff_a16w16_gated(
     batch: int, hidden_dim: int, intermediate_dim: int, dtype, output, activation
 ):
+    torch.manual_seed(0)
     ff_gated_test(
         ff_a16w16_gated,
         batch=batch,

diff --git a/op_tests/triton_tests/gemm/feed_forward/test_ff_a16w16_fused.py b/op_tests/triton_tests/gemm/feed_forward/test_ff_a16w16_fused.py
@@ -24,6 +24,7 @@ def test_ff_a16w16_fused_ungated(
         pytest.skip(
             "Small differences in implementation between Triton & Torch activations accumulate to beyond test bounds w/large matrices."
         )
+    torch.manual_seed(0)
     ff_ungated_test(
         ff_a16w16_fused_ungated,
         batch=batch,
@@ -47,6 +48,7 @@ def test_ff_a16w16_fused_gated(
         pytest.skip(
             "Small differences in implementation between Triton & Torch activations accumulate to beyond test bounds w/large matrices."
         )
+    torch.manual_seed(0)
 
     ff_gated_test(
         ff_a16w16_fused_gated,

diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_a16w16.py
@@ -87,6 +87,7 @@ def get_x_vals():
 @pytest.mark.parametrize("output", [True, False])
 @pytest.mark.parametrize("skip_reduce", [True, False])
 def test_gemm(dtype, M, N1, N2, K, output, skip_reduce):
+    torch.manual_seed(0)
     block_shape_n, block_shape_k = block_shape
 
     x_fp8, w_fp8, _, x_fp8_scale, _, w_fp8_scale, y_fp8 = (

diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_mul_add.py
@@ -54,6 +54,7 @@ def test_fused_gemm_a8w8_blockscale_mul_add(
     b_type_is_scalar,
     fuse_type,
 ):
+    torch.manual_seed(0)
 
     (
         x,

diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_split_cat.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_a8w8_blockscale_split_cat.py
@@ -112,6 +112,7 @@ def generate_fused_gemm_a8w8_blockscale_split_cat_inputs(
     - w: (N, K) -> column-major format
     - y: (M, D, S3)
     """
+    torch.manual_seed(0)
     scale_n = (N + block_shape_n - 1) // block_shape_n
     scale_k = (K + block_shape_k - 1) // block_shape_k
 

diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_a16w16.py
@@ -91,6 +91,7 @@ def test_gemm(dtype, M, N1, N2, K, output, skip_reduce, fp4_shuffle):
 
     if not (arch_info.is_fp4_avail()):
         pytest.skip("MXFP4 not supported on this architecture")
+    torch.manual_seed(0)
 
     (
         x_fp4,

diff --git a/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py b/op_tests/triton_tests/gemm/fused/test_fused_gemm_afp4wfp4_mul_add.py
@@ -72,6 +72,7 @@ def test_fused_gemm_afp4wfp4_mul_add(
             pytest.skip(
                 f"K = {K} is not divisible by 256, skip this test for preshuffled weight/scales tests"
             )
+    torch.manual_seed(0)
 
     torch.cuda.empty_cache()  # Helps avoid hangs in large tests
 

diff --git a/op_tests/triton_tests/moe/test_moe_align_block_size.py b/op_tests/triton_tests/moe/test_moe_align_block_size.py
@@ -99,6 +99,7 @@ def torch_moe_align_block_size(
 
 
 def input_helper(M: int, E: int, top_k: int):
+    torch.manual_seed(0)
     values = torch.randn(M, E, dtype=torch.float16, device="cuda")
 
     softmax_vals = torch.softmax(values, dim=1)

diff --git a/op_tests/triton_tests/moe/test_moe_mx.py b/op_tests/triton_tests/moe/test_moe_mx.py
@@ -304,11 +304,11 @@ def test_fused_moe(
     routed_weight: bool,
     swizzle_mx_scale: bool,
 ):
-    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
-    torch.manual_seed(20)
     if not (arch_info.is_fp4_avail()):
         pytest.skip("MXFP4 not supported on this architecture")
-        pytest.skip("MXFP4 not supported on this architecture")
+
+    torch.cuda.empty_cache()  # Helps avoid hangs in large tests
+    torch.manual_seed(20)
 
     (
         a_tri,

diff --git a/op_tests/triton_tests/normalization/test_fused_add_rmsnorm_pad.py b/op_tests/triton_tests/normalization/test_fused_add_rmsnorm_pad.py
@@ -5,6 +5,7 @@
 
 
 def generate_inputs(M, N, has_res, dtype):
+    torch.manual_seed(0)
     x = torch.randn((M, N), dtype=dtype, device="cuda")
     weight = torch.randn((N,), dtype=dtype, device="cuda")
     res = torch.randn((M, N), dtype=dtype, device="cuda") if has_res else None
-Original file line number
+Diff line change
@@ Expand Up / @@ -54,6 +54,7 @@ def test_fused_gemm_a8w8_blockscale_mul_add( @@
         b_type_is_scalar,
         fuse_type,
     ):
+        torch.manual_seed(0)
         (
             x,
@@ Expand Down @@