3 changes: 3 additions & 0 deletions ci/jax.sh
@@ -71,13 +71,16 @@ run_test_config_mgpu() {
*0.4.35*)
# Workaround for distributed tests hang with xla_flag
XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_ring_attn'
XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_ring_attn'
Collaborator

This will run it with AOTriton too

Contributor Author

Updated with a guard in the JAX CI script.
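A hypothetical illustration of the kind of guard being described; the variable used here to detect the CK backend is an assumption for illustration only and is not shown in this diff:

    # Only enable the CK V3 kernels when the CK fused-attention backend is in use,
    # so the extra run is skipped when tests run with AOTriton.
    # NVTE_FUSED_ATTN_CK is an assumed variable name, not taken from this PR.
    if [ "${NVTE_FUSED_ATTN_CK:-1}" = "1" ]; then
        NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run 3 test_distributed_fused_attn.py
    fi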

Collaborator

With those changes the env variables are not seen by the run method; they are applied to the test call only.
Use run_default_fa_lbl. All V3 calls should be labelled with "v3" to distinguish them from the regular test_distributed_fused_attn calls.
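A minimal sketch of the suggested labelling, assuming run_lbl takes a label followed by the same arguments as run (as in the "parallel_ring" call below); the exact behaviour of run_default_fa_lbl is not shown in this diff:

    # Label the V3 run so it is reported separately from the regular
    # test_distributed_fused_attn run (sketch only, not the final wording of the PR).
    XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 \
        run_lbl "v3" 3 test_distributed_fused_attn.py -k 'not test_context_parallel_ring_attn'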


# Test ring attention with xla_flag --xla_experimental_ignore_channel_id only
XLA_FLAGS="--xla_experimental_ignore_channel_id" run_lbl "parallel_ring" 3 test_distributed_fused_attn.py -k test_context_parallel_ring_attn
XLA_FLAGS="--xla_experimental_ignore_channel_id" NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run_lbl "parallel_ring" 3 test_distributed_fused_attn.py -k test_context_parallel_ring_attn
;;
*)
# Workaround for distributed tests hang with xla_flag
XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py
XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run 3 test_distributed_fused_attn.py
;;
esac

1 change: 1 addition & 0 deletions ci/pytorch.sh
@@ -94,6 +94,7 @@ run_test_config_mgpu(){
run 3 distributed/test_numerics.py
run 3 distributed/test_torch_fsdp2.py
run 3 fused_attn/test_fused_attn_with_cp.py
NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run 3 fused_attn/test_fused_attn_with_cp.py
fi
}
