Skip to content
34 changes: 34 additions & 0 deletions source/slang/hlsl.meta.slang
Original file line number Diff line number Diff line change
Expand Up @@ -24170,6 +24170,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
//

/// Store all elements of this CoopVec into a buffer at a specified offset.
/// Pointer accesses are 16-byte aligned.
/// @param buffer The destination buffer to store the values into.
/// @param byteOffset16ByteAligned The byte offset from the start of the buffer where the data will be stored. Must be 16-byte aligned.
[require(cooperative_vector)]
Expand Down Expand Up @@ -24216,6 +24217,18 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
}
}

/// Store all elements of this CoopVec to memory through a raw pointer.
/// The store is issued with a 16-byte `Aligned` memory operand.
/// Only available when targeting SPIR-V with the cooperative_vector capability.
/// @param buffer Pointer to the destination memory; it is viewed as an unsized array of `T`.
/// @param byteOffset16ByteAligned Offset operand passed to the store instruction. As the name states, a byte offset that must be 16-byte aligned. Defaults to 0.
[ForceInline]
[require(spirv, cooperative_vector)]
void store(T* buffer, int32_t byteOffset16ByteAligned = 0)
{
// View the raw pointer as a pointer to an unsized array of T; this is the
// pointer operand of the store instruction below.
let pointer = Ptr<T[]>(buffer);
// Value supplied to the `Aligned` memory operand below.
let alignment = 16;
return spirv_asm
{
OpCooperativeVectorStoreNV $pointer $byteOffset16ByteAligned $this Aligned !alignment;
};
}

[ForceInline]
[require(cooperative_vector)]
[require(hlsl_coopvec_poc)]
Expand Down Expand Up @@ -24269,6 +24282,7 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
}

/// Load values from a byte-addressable buffer into a cooperative vector.
/// Pointer accesses are 16-byte aligned.
/// @param buffer The source buffer to load data from.
/// @param byteOffset16ByteAligned The byte offset from the start of the buffer. Must be 16-byte aligned.
/// @return A new cooperative vector containing the loaded values.
Expand Down Expand Up @@ -24368,6 +24382,19 @@ struct CoopVec<T : __BuiltinArithmeticType, let N : int> : IArray<T>, IArithmeti
}
}

/// Load a cooperative vector from memory through a raw pointer.
/// The load is issued with a 16-byte `Aligned` memory operand.
/// Only available when targeting SPIR-V with the cooperative_vector capability.
/// @param buffer Pointer to the source memory; it is viewed as an unsized array of `T`.
/// @param byteOffset16ByteAligned Offset operand passed to the load instruction. As the name states, a byte offset that must be 16-byte aligned. Defaults to 0.
/// @return A new cooperative vector of type `CoopVec<T, N>` produced by the load instruction.
[ForceInline]
[__NoSideEffect]
[require(spirv, cooperative_vector)]
static CoopVec<T, N> load(T* buffer, int32_t byteOffset16ByteAligned = 0)
{
// View the raw pointer as a pointer to an unsized array of T; this is the
// pointer operand of the load instruction below.
let pointer = Ptr<T[]>(buffer);
// Value supplied to the `Aligned` memory operand below.
let alignment = 16;
return spirv_asm
{
result:$$CoopVec<T, N> = OpCooperativeVectorLoadNV $pointer $byteOffset16ByteAligned Aligned !alignment;
};
}

// Groupshared
[ForceInline]
[__NoSideEffect]
Expand Down Expand Up @@ -25736,6 +25763,13 @@ CoopVec<T, N> coopVecLoad<let N : int, T : __BuiltinArithmeticType>(RWStructured
return CoopVec<T, N>.load(buffer, byteOffset16ByteAligned);
}

/// Free-function form of `CoopVec<T, N>.load` for raw pointers.
/// Both arguments are forwarded unchanged to the static `load` method.
/// Only available when targeting SPIR-V with the cooperative_vector capability.
/// @param buffer Pointer to the source memory.
/// @param byteOffset16ByteAligned Offset forwarded to `load`. As the name states, a byte offset that must be 16-byte aligned. Defaults to 0.
/// @return The cooperative vector returned by `CoopVec<T, N>.load`.
[ForceInline]
[require(spirv, cooperative_vector)]
CoopVec<T, N> coopVecLoad<let N : int, T : __BuiltinArithmeticType>(T* buffer, int32_t byteOffset16ByteAligned = 0)
{
return CoopVec<T, N>.load(buffer, byteOffset16ByteAligned);
}

// Groupshared
[ForceInline]
[require(cooperative_vector)]
Expand Down
21 changes: 18 additions & 3 deletions source/slang/slang-emit-spirv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2069,10 +2069,25 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex
inst->getOp() == kIROp_ArrayType
? emitOpTypeArray(inst, elementType, irArrayType->getElementCount())
: emitOpTypeRuntimeArray(inst, elementType);
auto strideInst = irArrayType->getArrayStride();
if (strideInst && shouldEmitArrayStride(irArrayType->getElementType()))
if (shouldEmitArrayStride(irArrayType->getElementType()))
{
int stride = (int)getIntVal(strideInst);
auto stride = 0;
if (auto strideInst = irArrayType->getArrayStride())
{
stride = (int)getIntVal(strideInst);
}
else
{
// Stride may not have been calculated for basic element types. Calculate it
// here.
IRSizeAndAlignment sizeAndAlignment;
getNaturalSizeAndAlignment(
m_targetProgram->getOptionSet(),
elementType,
&sizeAndAlignment);
stride = (int)sizeAndAlignment.getStride();
}

emitOpDecorateArrayStride(
getSection(SpvLogicalSectionID::Annotations),
nullptr,
Expand Down
32 changes: 32 additions & 0 deletions tests/cooperative-vector/load-store-pointer.slang
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -render-feature cooperative-vector -emit-spirv-directly

// Source data: twelve 4-byte values (1..12), accessed by the shader through a raw pointer.
//TEST_INPUT: set inputBuffer = ubuffer(data=[1 2 3 4 5 6 7 8 9 10 11 12], stride=4);
uniform int32_t* inputBuffer;

// Destination data: eight 4-byte values, all initially zero, written by the shader.
// NOTE(review): `out` presumably marks this buffer as the one compared against the expected output - confirm.
//TEST_INPUT: set outputBuffer = out ubuffer(data=[0 0 0 0 0 0 0 0], stride=4);
uniform int32_t* outputBuffer;

// CHECK: 9
// CHECK-NEXT: A
// CHECK-NEXT: B
// CHECK-NEXT: C
// CHECK-NEXT: 1
// CHECK-NEXT: 2
// CHECK-NEXT: 3
// CHECK-NEXT: 4

// Exercises the raw-pointer overloads of coopVecLoad and CoopVec.store:
// two 4-element vectors are read from inputBuffer and written to
// outputBuffer in swapped order, using both pointer arithmetic and the
// byte-offset argument.
[shader("compute")]
[numthreads(1, 1, 1)]
void computeMain()
{
// First four elements of the input (values 1..4), read at byte offset 0.
let a = coopVecLoad<4, int32_t>(inputBuffer, 0);
// Last four elements of the 12-element input (values 9..12): the pointer is
// advanced by 4 elements and a further 16-byte offset is applied, so the
// read starts at element 8.
let b = coopVecLoad<4, int32_t>(inputBuffer + 4, 4*4);

// Store b (input elements 8..11) to the first four elements of the output buffer.
b.store(outputBuffer, 0);
// Store a (input elements 0..3) to the last four elements of the output buffer,
// i.e. at byte offset 16.
a.store(outputBuffer, 4*4);
}

Loading