diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt index 46ef2941aa59..4faf2ea91f28 100644 --- a/cmake/llvm-hash.txt +++ b/cmake/llvm-hash.txt @@ -1 +1 @@ -979132a02d146ec79e2f046e31877516d7f32d20 +ac5dc54d509169d387fcfd495d71853d81c46484 diff --git a/python/src/llvm.cc b/python/src/llvm.cc index 2941d87f97c2..fa93102ff7ba 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -133,6 +133,7 @@ createTargetMachine(llvm::Module *module, std::string proc, bool disableLLVMOpt = mlir::triton::tools::getBoolEnv("DISABLE_LLVM_OPT"); if (enable_fp_fusion) opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.NoInfsFPMath = false; opt.NoNaNsFPMath = true; opt.TrapUnreachable = true; opt.MCOptions.AsmVerbose = true; diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 46fcb3d6a2f6..9a70966edfc6 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1382,9 +1382,7 @@ def kernel(X, Z): # atom.add.bf16 is unsupported prior to Hopper so instead we generate an # atom.cas add loop on Ampere and prior if dst_type == 'bfloat16' and torch.cuda.get_device_capability()[0] < 9: - assert "atom.relaxed.gpu.global.cas" in h.asm["ptx"] - if sem_str != "relaxed": - assert "fence.acq_rel.gpu" in h.asm["ptx"] + assert f"atom.{sem_str}.gpu.global.cas" in h.asm["ptx"] return assert f"atom.global.gpu.{sem_str}" in h.asm["ptx"]