Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions test/Conversion/amd/mbarrier_ops_to_llvm_gfx1250.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --check-prefix=GFX1250
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefix=GFX1250

// GFX1250: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">

#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
Expand All @@ -12,8 +14,9 @@ module attributes {"ttg.target" = "hip:gfx1250", "ttg.num-ctas" = 1 : i32, "ttg.
// GFX1250-NEXT: llvm.store %[[INIT_VAL1]], %[[ALLOC_PTR]] : i64, !llvm.ptr<3>
// GFX1250-NEXT: llvm.br ^[[BB1]]
// GFX1250-NEXT: ^[[BB1]]:
// GFX1250-NEXT: rocdl.s.wait.dscnt 0
// GFX1250-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
// GFX1250-NEXT: rocdl.s.barrier
// GFX1250-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
// GFX1250-NEXT: llvm.return
amdg.init_barrier %alloc, 2 : !ttg.memdesc<1xi64, #shared, #smem, mutable>
tt.return
Expand Down
50 changes: 42 additions & 8 deletions test/Conversion/amd/tritongpu_to_llvm.mlir
Original file line number Diff line number Diff line change
@@ -1,7 +1,39 @@
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=CHECK,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --check-prefixes=GFX950,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --check-prefixes=GFX1250,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --check-prefixes=GFX906,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefixes=CHECK,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --enable-var-scope --check-prefixes=GFX950,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --enable-var-scope --check-prefixes=GFX1250,COMMON
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --enable-var-scope --check-prefixes=GFX906,COMMON

// COMMON-DAG: [[$LOCAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
// COMMON-DAG: [[$GLOBAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"global">

// Exercises the lowering of `ttg.barrier` for each address-space selection.
// Every variant below is expected to become, in order: a workgroup-scoped
// release fence, `rocdl.s.barrier`, and a workgroup-scoped acquire fence. The
// variants differ only in the MMRA tag attached to the two fences.
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// COMMON-LABEL: lower_barrier
tt.func @lower_barrier() {
// Local-only barrier: both fences reference the "local" MMRA tag alias.
// COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
ttg.barrier local

// Global-read barrier: both fences reference the "global" MMRA tag alias.
// COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
ttg.barrier global_read

// Global-write barrier: expected to use the same "global" tag as global_read.
// COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
ttg.barrier global_write

// Mixed local+global barrier: the end-of-line anchor after the ordering
// keyword requires that the fences carry no attribute dictionary, i.e. no
// MMRA tag at all.
// COMMON: llvm.fence syncscope("workgroup") release{{$}}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire{{$}}
ttg.barrier local|global_read|global_write

tt.return
}
}

// -----

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// CHECK-LABEL: atomic_add_f32_scalar
Expand All @@ -14,8 +46,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// CHECK: llvm.atomicrmw
// CHECK: llvm.store
// CHECK: llvm.br
// CHECK: rocdl.s.waitcnt 49279
// CHECK: rocdl.s.barrier
// COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
// CHECK: llvm.load
// CHECK: llvm.store
%0 = tt.atomic_rmw fadd, relaxed, gpu, %arg0, %arg2, %arg1 : (!tt.ptr<f32>, f32, i1) -> f32
Expand Down Expand Up @@ -609,8 +642,9 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
// CHECK-NOT: llvm.store
%0 = ttg.local_alloc %arg0 : (tensor<32x32xf16, #blocked>) -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
%1 = ttg.memdesc_subslice %0 [16, 0] : !ttg.memdesc<32x32xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32>
// CHECK: rocdl.s.waitcnt
// CHECK-NEXT: rocdl.s.barrier
// COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
// COMMON-NEXT: rocdl.s.barrier
// COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
// CHECK: %[[AFF_I8:.+]] = llvm.mul %{{.+}}, %[[SUBSLICE_CST2]] : i32
// CHECK-NEXT: %[[AFF_SHR:.+]] = llvm.lshr %[[AFF_I8]], %[[SUBSLICE_CST6]] : i32
// CHECK-NEXT: %[[AFF_SHL:.+]] = llvm.shl %[[AFF_SHR]], %[[SUBSLICE_CST3]] : i32
Expand Down
14 changes: 9 additions & 5 deletions test/TritonGPU/amd/amd-block-pingpong-chained-dots.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s
// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s --enable-var-scope

// CHECK: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">

#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
Expand All @@ -23,8 +25,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
// CHECK: ttg.async_commit_group
// CHECK: rocdl.sched.barrier 0
// CHECK-NEXT: rocdl.s.setprio 0
// CHECK-NEXT: amdg.memory_counter_wait ds(0)
// CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: rocdl.s.barrier
// CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: rocdl.sched.barrier 0
// Compute Cluster2
// CHECK: tt.dot
Expand All @@ -38,7 +41,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
// CHECK: ttg.async_commit_group
// CHECK: rocdl.sched.barrier 0
// CHECK-NEXT: rocdl.s.setprio 0
// CHECK-NEXT: amdg.memory_counter_wait ds(0)
// CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: scf.yield

tt.func @chained_dots_async_loads(%arg0: tensor<64x16x!tt.ptr<f16>, #blocked>, %arg1: i32, %arg2: i32, %arg3: !ttg.async.token, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> {
Expand Down Expand Up @@ -95,8 +98,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
// CHECK: tt.load
// CHECK-NEXT: rocdl.sched.barrier 0
// CHECK-NEXT: rocdl.s.setprio 0
// CHECK-NEXT: amdg.memory_counter_wait ds(0)
// CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: rocdl.s.barrier
// CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: rocdl.sched.barrier 0
// Compute Cluster2
// CHECK: tt.dot
Expand All @@ -109,7 +113,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
// CHECK: tt.load
// CHECK-NEXT: rocdl.sched.barrier 0
// CHECK-NEXT: rocdl.s.setprio 0
// CHECK-NEXT: amdg.memory_counter_wait ds(0)
// CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
// CHECK-NEXT: scf.yield

tt.func @chained_dots_tt_loads(%arg0: tensor<64x16xf16, #blocked>, %arg1: tensor<64x16x!tt.ptr<f16>, #blocked>, %arg2: i32, %arg3: i32, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> {
Expand Down
43 changes: 32 additions & 11 deletions third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "AsyncUtility.h"
#include "Dialect/TritonAMDGPU/IR/Dialect.h"
#include "PatternTritonGPUOpToLLVM.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
#include "triton/Dialect/TritonGPU/IR/Attributes.h"
Expand All @@ -13,6 +14,20 @@ using mlir::triton::amdgpu::ISAFamily;
using ::mlir::triton::gpu::MemDescType;

namespace {

// Emits a workgroup-scoped `llvm.fence` with the given atomic `ordering` at the
// builder's current insertion point and returns it.
//
// When `synchronizeAddrSpace` is non-empty, the fence is additionally tagged
// with an `amdgpu-synchronize-as:<synchronizeAddrSpace>` MMRA through the LLVM
// dialect's discardable MMRA attribute. An empty string leaves the fence
// untagged.
static LLVM::FenceOp createAMDGPUMemoryFence(OpBuilder &builder, Location loc,
                                             LLVM::AtomicOrdering ordering,
                                             StringRef synchronizeAddrSpace) {
  LLVM::FenceOp fenceOp =
      LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup");

  // No address space requested: hand back the plain, untagged fence.
  if (synchronizeAddrSpace.empty())
    return fenceOp;

  Attribute mmraTag = builder.getAttr<LLVM::MMRATagAttr>(
      "amdgpu-synchronize-as", synchronizeAddrSpace);
  fenceOp->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmraTag);
  return fenceOp;
}

class TransLocalLoadOpConversion
: public ConvertOpToLLVMPattern<triton::gpu::LocalLoadOp> {
public:
Expand Down Expand Up @@ -549,21 +564,27 @@ class BarrierOpConversion
triton::gpu::AddrSpace::TensorWrite;
if ((op.getAddrSpace() & ~mask) != triton::gpu::AddrSpace::None)
return failure();
// We can lower barrier to MemoryCounterWaitOp + s_barrier
// - MemoryCounterWaitOp specifies how many operations to
// VMEM(Read)/VMEM(Write)/LDS can be outstanding when
// the instruction completes.
// - s_barrier synchronizes the execution for the CTA
IntegerAttr zero = rewriter.getI32IntegerAttr(0);
bool localBarrier = op.hasLocal();
bool globalBarrier = op.hasGlobalRead() || op.hasGlobalWrite();
if (localBarrier || globalBarrier) {
amdgpu::MemoryCounterWaitOp::create(
rewriter, op->getLoc(),
/* load= */ op.hasGlobalRead() ? zero : nullptr,
/* store= */ op.hasGlobalWrite() ? zero : nullptr,
/* ds= */ localBarrier ? zero : nullptr);
StringRef mmraAddrSpace = "";
if (localBarrier && !globalBarrier)
mmraAddrSpace = "local";
else if (!localBarrier && globalBarrier)
mmraAddrSpace = "global";

// Local/global barriers use LLVM fences so the AMDGPU memory legalizer
// selects target-specific waits. Mixed local+global barriers are left
// untagged so LLVM conservatively synchronizes every relevant space.
createAMDGPUMemoryFence(rewriter, op->getLoc(),
LLVM::AtomicOrdering::release, mmraAddrSpace);
ROCDL::SBarrierOp::create(rewriter, op->getLoc());
createAMDGPUMemoryFence(rewriter, op->getLoc(),
LLVM::AtomicOrdering::acquire, mmraAddrSpace);
rewriter.eraseOp(op);
return success();
}

rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);

return success();
Expand Down
52 changes: 31 additions & 21 deletions third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "TritonAMDGPUTransforms/Passes.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinAttributes.h"
Expand All @@ -27,6 +28,16 @@ namespace mlir {

namespace {

// Emits a workgroup-scoped `llvm.fence` with the given atomic `ordering` at the
// builder's current insertion point, tags it with the
// `amdgpu-synchronize-as:local` MMRA, and returns it.
static LLVM::FenceOp createLocalMMRAFence(OpBuilder &builder, Location loc,
                                          LLVM::AtomicOrdering ordering) {
  LLVM::FenceOp localFence =
      LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup");

  // The tag is attached as a discardable attribute under the LLVM dialect's
  // MMRA attribute name.
  Attribute localTag =
      builder.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
  localFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
                                 localTag);
  return localFence;
}

// This pass transforms a for-loop calculating a GEMM. Main purpose of the
// transform is improve the efficiency of the GPU dot instruction (mfma)
// by interleaving the execution of two warps on each SIMD. Especially it groups
Expand Down Expand Up @@ -721,28 +732,30 @@ LogicalResult Pingponger::transformTwoClusterWithAsyncAndAll(OpBuilder &builder,
// Typical `s_xxx` instructions include:
// - Control flow: `s_cbranch`
// - Priority control: `s_setprio`
// - Synchronization and dependency: `s_waitcnt`
// - Synchronization and dependency: MMRA-tagged local fences, lowered by LLVM
// to target-specific wait instructions.
//
// These are usually inserted near `s_barrier` boundaries, and the current
// implementation carefully places them to ensure they belong to the memory
// cluster, improving overall overlap and utilization.
//
//
// 3. Placement of `s_waitcnt lgkmcnt(0)`
// --------------------------------------
// We place `s_waitcnt lgkmcnt(0)` at the *end* of the memory cluster to ensure
// that all shared-memory load (`ds_read`) instructions have completed before
// entering the compute cluster.
// 3. Placement of local MMRA fences
// ---------------------------------
// We place local MMRA release fences at the *end* of the memory cluster to
// ensure that all shared-memory load (`ds_read`) instructions have completed
// before entering the compute cluster. LLVM lowers these fences to the
// appropriate target-specific wait instructions.
//
// This placement prevents the LLVM backend from inserting additional
// `s_waitcnt lgkmcnt()` instructions inside the compute cluster based on
// wait instructions inside the compute cluster based on
// inferred dependencies between `mfma` and `ds_read` operations.
//
// This approach is consistent with the previous design goal: to eliminate all
// `s_xxx` instructions from the compute cluster so it can run uninterrupted
// MFMA and VALU operations. Keeping `s_waitcnt lgkmcnt(0)` at the cluster
// boundary enforces data dependency correctness while preserving the clean
// separation between memory and compute phases.
// MFMA and VALU operations. Keeping the local fence at the cluster boundary
// enforces data dependency correctness while preserving the clean separation
// between memory and compute phases.
LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
Location loc) {
assert(dotOps.size() == 2);
Expand Down Expand Up @@ -785,30 +798,29 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
// Ideally we want the memory cluster to start with
//
// s_barrier
// s_waitcnt vmcnt(x) lgkmcnt(0)
// local wait
// s_setprio 1
//
// However, the membar pass will put s_waitcnt before s_barrier.
// However, the membar path will put the local MMRA fence before s_barrier.
// But we can at least put s_setprio in the memory cluster.
prependOp(ROCDL::SetPrioOp::create(builder, loc, highPriority), false);

// ComputeCluster 2
// We want the 2nd compute cluster to start with
//
// s_setprio 0
// s_waitcnt lgkmcnt(0)
// local MMRA release fence
// s_barrier
//
// Check note 2 and 3 for details.
updateOpInsertion(dotOps[1]);
prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);
prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false);
auto dsAttr = builder.getI32IntegerAttr(0);
prependOp(tt::amdgpu::MemoryCounterWaitOp::create(
builder, loc, /* load= */ nullptr, /* store= */ nullptr,
/* ds= */ dsAttr),
prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release),
false);
prependOp(ROCDL::SBarrierOp::create(builder, loc), false);
prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::acquire),
false);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to check that I understand this correctly: in theory, we don't need the acquire fence here because sched.barrier is used, right?
Still, it's good to have it to avoid any confusion.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sched.barrier isn't an actual barrier (and is unrelated to this change), and we do need the acquire fence.

So, having looked at and reminded myself of how LLVM defines a fence, the reason this works is that s.barrier is, conceptually, an atomic operation on something we can model as inaccessible memory. A release fence, followed by the "atomic" s.barrier that actually synchronizes the lanes, followed by the acquire fence, therefore establishes a happens-before relationship between those two fences.

That, in turn, establishes a happens-before relationship between memory operations before the fence and those after the fence across the entire workgroup (which is the syncscope).

Now, those MMRAs prevent the classic "fences fence too much" issue, where that relationship would also apply to global accesses and break software pipelining by forcing vmcnt waits. The MMRA metadata ensures that only (for example) operations on LDS get fenced.

prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);

// MemoryCluster2
Expand All @@ -828,7 +840,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
// stays in the memory cluster.
//
// s_setprio 0
// s_waitcnt lgkmcnt(0)
// local MMRA release fence
// s_cbranch
// s_barrier
//
Expand All @@ -840,9 +852,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
updateOpInsertion(lastInsertedOp->getBlock()->getTerminator());
prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);
prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false);
prependOp(tt::amdgpu::MemoryCounterWaitOp::create(
builder, loc, /* load= */ nullptr, /* store= */ nullptr,
/* ds= */ dsAttr),
prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release),
false);

return success();
Expand Down
Loading