triton-lang · atalman · Apr 7, 2026 · Apr 7, 2026
@@ -1 +1 @@
-979132a02d146ec79e2f046e31877516d7f32d20
+ac5dc54d509169d387fcfd495d71853d81c46484
@@ -133,6 +133,7 @@ createTargetMachine(llvm::Module *module, std::string proc,
   bool disableLLVMOpt = mlir::triton::tools::getBoolEnv("DISABLE_LLVM_OPT");
   if (enable_fp_fusion)
     opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.NoInfsFPMath = false;
   opt.NoNaNsFPMath = true;
   opt.TrapUnreachable = true;
   opt.MCOptions.AsmVerbose = true;

@@ -1382,9 +1382,7 @@ def kernel(X, Z):
     # atom.add.bf16 is unsupported prior to Hopper so instead we generate an
     # atom.cas add loop on Ampere and prior
     if dst_type == 'bfloat16' and torch.cuda.get_device_capability()[0] < 9:
-        assert "atom.relaxed.gpu.global.cas" in h.asm["ptx"]
-        if sem_str != "relaxed":
-            assert "fence.acq_rel.gpu" in h.asm["ptx"]
+        assert f"atom.{sem_str}.gpu.global.cas" in h.asm["ptx"]
         return
 
     assert f"atom.global.gpu.{sem_str}" in h.asm["ptx"]
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		979132a02d146ec79e2f046e31877516d7f32d20
		ac5dc54d509169d387fcfd495d71853d81c46484