Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dx11] add memory flush/invalidate & image/buffer copies #2149

Merged
merged 2 commits into from
Jun 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 193 additions & 8 deletions src/backend/dx11/shaders/copy.hlsl
Original file line number Diff line number Diff line change
@@ -1,15 +1,200 @@
// Constants for a buffer-to-buffer copy.
// NOTE(review): no shader visible in this file reads SrcDst; presumably it
// packs the source and destination offsets - confirm against the host code.
struct BufferCopy {
    uint4 SrcDst;
};

// NOTE(review): these bindings use the same t0/u0 registers as BufferCopySrc and
// the ImageCopyDst* views declared further down; they look like the
// pre-refactor names - confirm whether anything still needs them.
StructuredBuffer<uint> CopySrc : register(t0);
RWTexture2D<uint> CopyDst : register(u0);
// Constants for an image-to-image copy: one uint4 for the source and one for
// the destination.
// NOTE(review): no shader visible in this file reads these fields; presumably
// they hold texel offsets - confirm against the host code.
struct ImageCopy {
uint4 Src;
uint4 Dst;
};

// Constants shared by all buffer <-> image copy kernels in this file.
struct BufferImageCopy {
// x=offset, yz=size
// As used by the kernels here: x is added to the buffer element index as a
// base offset, and y is multiplied by the thread's y coordinate (i.e. it acts
// as the row stride). z is not read by any shader visible in this file.
uint4 BufferVars;
// Only .xy is read: it is added to the thread's xy to form the texel coordinate.
uint4 ImageOffset;
// Not read by any shader visible in this file.
uint4 ImageExtent;
};

// Single constant buffer (slot b0) holding the parameters for every copy
// variant. The kernels visible in this file only read BufferImageCopies.
cbuffer CopyConstants : register(b0) {
BufferCopy BufferCopies;
ImageCopy ImageCopies;
BufferImageCopy BufferImageCopies;
};

// Texel coordinate a thread writes to: the thread's xy shifted by
// BufferImageCopies.ImageOffset.xy. The z component of the id is ignored.
uint2 GetImageDst(uint3 dispatch_thread_id)
{
    const uint2 origin = BufferImageCopies.ImageOffset.xy;
    const uint2 within_region = dispatch_thread_id.xy;
    return origin + within_region;
}

// Texel coordinate a thread reads from: the thread's xy shifted by
// BufferImageCopies.ImageOffset.xy. The z component of the id is ignored.
uint2 GetImageSrc(uint3 dispatch_thread_id)
{
    const uint2 origin = BufferImageCopies.ImageOffset.xy;
    const uint2 within_region = dispatch_thread_id.xy;
    return origin + within_region;
}

// Buffer element index a thread writes to:
// BufferVars.x (base offset) + thread x + thread y * BufferVars.y (row stride).
uint GetBufferDst(uint3 dispatch_thread_id)
{
    const uint first_element = BufferImageCopies.BufferVars.x;
    const uint row_stride = BufferImageCopies.BufferVars.y;
    return first_element + dispatch_thread_id.x + dispatch_thread_id.y * row_stride;
}

// Buffer element index a thread reads from:
// BufferVars.x (base offset) + thread x + thread y * BufferVars.y (row stride).
uint GetBufferSrc(uint3 dispatch_thread_id)
{
    const uint first_element = BufferImageCopies.BufferVars.x;
    const uint row_stride = BufferImageCopies.BufferVars.y;
    return first_element + dispatch_thread_id.x + dispatch_thread_id.y * row_stride;
}

// Packs four components into one 32-bit word. Each component is first clamped
// to 0xff with min (values above 255 saturate, they are not masked), then
// x lands in bits 0-7, y in bits 8-15, z in bits 16-23 and w in bits 24-31.
uint Uint4ToUint(uint4 data)
{
data.x = min(data.x, 0x000000ff);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: could probably vectorize this better, e.g.

return dot(min(data, 0xFF), 1 << uint4(0, 8, 16, 24));

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Beautiful! 😄

data.y = min(data.y, 0x000000ff);
data.z = min(data.z, 0x000000ff);
data.w = min(data.w, 0x000000ff);

uint output = (data.x |
(data.y << 8) |
(data.z << 16) |
(data.w << 24));

return output;
}

// Splits a 32-bit word into its four bytes, most significant first:
// x = bits 24-31, y = bits 16-23, z = bits 8-15, w = bits 0-7.
// Note the order is the reverse of what Uint4ToUint packs (which places x in
// the lowest byte), so the two are not inverses without a swizzle.
uint4 UintToUint4(uint data)
{
return uint4((data & 0xff000000) >> 24, (data & 0xff0000) >> 16, (data & 0xff00) >> 8, data & 0xff);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe even simpler:

return ((data >> uint4(24, 16, 8, 0)) & 0xFF;

}

// Splits a 32-bit word into two 16-bit halves, most significant first:
// x = bits 16-31, y = bits 0-15.
uint2 UintToUint2(uint data)
{
    const uint hi = (data >> 16) & 0x0000ffff;
    const uint lo = data & 0x0000ffff;
    return uint2(hi, lo);
}

// Packs two components into one 32-bit word. Each component is clamped to
// 0xffff with min (saturating, not masking); x fills bits 0-15, y bits 16-31.
uint Uint2ToUint(uint2 data)
{
    const uint lo = min(data.x, 0x0000ffff);
    const uint hi = min(data.y, 0x0000ffff);
    return lo | (hi << 16);
}

// Buffers are always R32-aligned
// Buffer side of the copy kernels in this file: read as 32-bit elements
// through t0 (buffer -> image) or written as 32-bit elements through u0
// (image -> buffer).
StructuredBuffer<uint> BufferCopySrc : register(t0);
RWBuffer<uint> BufferCopyDst: register(u0);

// R32: single-channel uint image views used by the *_r32 kernels.
// The source view sits on t0 and the destination view on u0, the same
// registers every other format's views in this file use.
Texture2D<uint> ImageCopySrcR32 : register(t0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should totally share these definitions between different bit sizes (R16/R32/R8/etc)

RWTexture2D<uint> ImageCopyDstR32 : register(u0);

// TODO: correct, but slow
// Buffer -> R32 image copy. Each thread moves one 32-bit buffer element
// (index from GetBufferSrc) into one texel (coordinate from GetImageDst).
[numthreads(1, 1, 1)]
void cs_copy_buffer_image2d_r32(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint value = BufferCopySrc[GetBufferSrc(dispatch_thread_id)];
    ImageCopyDstR32[GetImageDst(dispatch_thread_id)] = value;
}

// R32 image -> buffer copy. Each thread moves one texel (coordinate from
// GetImageSrc) into one 32-bit buffer element (index from GetBufferDst).
[numthreads(1, 1, 1)]
void cs_copy_image2d_r32_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint value = ImageCopySrcR32[GetImageSrc(dispatch_thread_id)];
    BufferCopyDst[GetBufferDst(dispatch_thread_id)] = value;
}

// R16G16: two-channel uint image views used by the *_r16g16 kernels
// (source on t0, destination on u0).
Texture2D<uint2> ImageCopySrcR16G16 : register(t0);
RWTexture2D<uint2> ImageCopyDstR16G16 : register(u0);

// TODO: correct, but slow
// Buffer -> R16G16 image copy. Each thread reads one 32-bit buffer element
// (one whole texel) and stores it as a two-channel value.
[numthreads(1, 1, 1)]
void cs_copy_buffer_image2d_r16g16(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint2 dst_idx = GetImageDst(dispatch_thread_id);
    const uint src_idx = GetBufferSrc(dispatch_thread_id);

    // UintToUint2 returns (high half, low half). In an R16G16 texel the R
    // channel is the low 16 bits of the little-endian word, so swizzle to
    // (low, high) before storing; without it R and G end up swapped.
    ImageCopyDstR16G16[dst_idx] = UintToUint2(BufferCopySrc[src_idx]).yx;
}

// R16G16 image -> buffer copy. Each thread reads one two-channel texel and
// stores it as one 32-bit buffer element.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r16g16_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint dst_idx = GetBufferDst(dispatch_thread_id);
    const uint2 texel = ImageCopySrcR16G16[GetImageSrc(dispatch_thread_id)];

    // Uint2ToUint places .x in bits 0-15 and .y in bits 16-31. R is the low
    // half of an R16G16 texel in memory, so pass (R, G) unswizzled; swapping
    // them first would write G into R's bytes.
    BufferCopyDst[dst_idx] = Uint2ToUint(texel);
}

// R16: single-channel uint image views used by the *_r16 kernels
// (source on t0, destination on u0).
Texture2D<uint> ImageCopySrcR16 : register(t0);
RWTexture2D<uint> ImageCopyDstR16 : register(u0);

// Buffer -> R16 image copy. Each thread reads one 32-bit buffer element and
// writes two horizontally adjacent texels: the low 16 bits go to column
// 2 * thread.x and the high 16 bits to column 2 * thread.x + 1 (both then
// shifted by ImageOffset inside GetImageDst).
// The buffer row term is (thread.y * BufferVars.y) / 2 - `*` and `/` associate
// left to right - which is why GetBufferSrc is not reused here.
[numthreads(1, 1, 1)]
void cs_copy_buffer_image2d_r16(uint3 dispatch_thread_id : SV_DispatchThreadID) {
uint src_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 2;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why isn't GetBufferSrc used here?


uint2 data = UintToUint2(BufferCopySrc[src_idx]);

ImageCopyDstR16[GetImageDst(uint3(2, 1, 0) * dispatch_thread_id + uint3(0, 0, 0))] = data.y;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure I understand this. The image is R16, why do we need to write into 2 different texels?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

R32_UINT is the only format which has guaranteed UAV typed load support[0], so I was aiming for 1 buffer (source) access per thread, so in the case of R16 one thread would have to do 2 writes. The dispatch call is also scaled depending on the format.

0: https://msdn.microsoft.com/en-us/library/windows/desktop/mt427455(v=vs.85).aspx

ImageCopyDstR16[GetImageDst(uint3(2, 1, 0) * dispatch_thread_id + uint3(1, 0, 0))] = data.x;
}

// R16 image -> buffer copy. Each thread gathers two horizontally adjacent
// texels (columns 2 * thread.x and 2 * thread.x + 1) and writes them as one
// 32-bit buffer element, the first texel in the low 16 bits.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r16_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 first_texel = uint3(2, 1, 0) * dispatch_thread_id;
    const uint texel0 = ImageCopySrcR16[GetImageSrc(first_texel + uint3(0, 0, 0))];
    const uint texel1 = ImageCopySrcR16[GetImageSrc(first_texel + uint3(1, 0, 0))];

    // Two texels per buffer element, hence the halved row term.
    const uint row_start = dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 2;
    const uint dst_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + row_start;

    BufferCopyDst[dst_idx] = Uint2ToUint(uint2(texel0, texel1));
}

// R8G8: two-channel uint image views used by the *_r8g8 kernels
// (source on t0, destination on u0).
Texture2D<uint2> ImageCopySrcR8G8 : register(t0);
RWTexture2D<uint2> ImageCopyDstR8G8 : register(u0);

// Buffer -> R8G8 image copy. Each thread reads one 32-bit buffer element
// (two 16-bit texels) and writes two horizontally adjacent texels at columns
// 2 * thread.x and 2 * thread.x + 1.
[numthreads(1, 1, 1)]
void cs_copy_buffer_image2d_r8g8(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    // Two texels per buffer element, hence the halved row term.
    const uint src_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 2;

    // UintToUint4 yields (byte3, byte2, byte1, byte0), i.e. w is the lowest byte.
    const uint4 data = UintToUint4(BufferCopySrc[src_idx]);
    const uint3 first_texel = uint3(2, 1, 0) * dispatch_thread_id;

    // Little-endian layout of the word is R0, G0, R1, G1 from the lowest byte
    // up, so the first texel takes bytes (0, 1) and the second bytes (2, 3).
    // Using data.xy / data.zw here would reverse both the texel order and the
    // channel order.
    ImageCopyDstR8G8[GetImageDst(first_texel + uint3(0, 0, 0))] = data.wz;
    ImageCopyDstR8G8[GetImageDst(first_texel + uint3(1, 0, 0))] = data.yx;
}

// R8G8 image -> buffer copy. Each thread gathers two horizontally adjacent
// texels (columns 2 * thread.x and 2 * thread.x + 1) and writes them as one
// 32-bit buffer element.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r8g8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    // Two texels per buffer element, hence the halved row term.
    const uint dst_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 2;

    const uint3 first_texel = uint3(2, 1, 0) * dispatch_thread_id;
    const uint2 texel0 = ImageCopySrcR8G8[GetImageSrc(first_texel + uint3(0, 0, 0))];
    const uint2 texel1 = ImageCopySrcR8G8[GetImageSrc(first_texel + uint3(1, 0, 0))];

    // Uint4ToUint places x in the lowest byte and w in the highest, which
    // matches the little-endian memory order R0, G0, R1, G1. Packing the second
    // texel first (or swapping R and G) would byte-reverse the stored word.
    BufferCopyDst[dst_idx] = Uint4ToUint(uint4(texel0.x, texel0.y, texel1.x, texel1.y));
}

// R8: single-channel uint image views used by the *_r8 kernels
// (source on t0, destination on u0).
Texture2D<uint> ImageCopySrcR8 : register(t0);
RWTexture2D<uint> ImageCopyDstR8 : register(u0);

// Buffer -> R8 image copy. Each thread reads one 32-bit buffer element and
// writes four horizontally adjacent texels starting at column 4 * thread.x,
// lowest byte first.
[numthreads(1, 1, 1)]
void cs_copy_buffer_image2d_r8(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    // Four texels per buffer element, hence the quartered row term.
    const uint row_start = dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 4;
    const uint src_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + row_start;

    // UintToUint4 yields (byte3, byte2, byte1, byte0), so .w is the lowest byte.
    const uint4 bytes = UintToUint4(BufferCopySrc[src_idx]);
    const uint3 first_texel = uint3(4, 1, 0) * dispatch_thread_id;

    ImageCopyDstR8[GetImageDst(first_texel + uint3(0, 0, 0))] = bytes.w;
    ImageCopyDstR8[GetImageDst(first_texel + uint3(1, 0, 0))] = bytes.z;
    ImageCopyDstR8[GetImageDst(first_texel + uint3(2, 0, 0))] = bytes.y;
    ImageCopyDstR8[GetImageDst(first_texel + uint3(3, 0, 0))] = bytes.x;
}

// R8 image -> buffer copy. Each thread gathers four horizontally adjacent
// texels starting at column 4 * thread.x and writes them as one 32-bit buffer
// element, the first texel in the lowest byte.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    // Four texels per buffer element, hence the quartered row term.
    const uint dst_idx = BufferImageCopies.BufferVars.x + dispatch_thread_id.x + dispatch_thread_id.y * BufferImageCopies.BufferVars.y / 4;

    const uint3 first_texel = uint3(4, 1, 0) * dispatch_thread_id;
    const uint src_1 = ImageCopySrcR8[GetImageSrc(first_texel + uint3(0, 0, 0))];
    const uint src_2 = ImageCopySrcR8[GetImageSrc(first_texel + uint3(1, 0, 0))];
    const uint src_3 = ImageCopySrcR8[GetImageSrc(first_texel + uint3(2, 0, 0))];
    const uint src_4 = ImageCopySrcR8[GetImageSrc(first_texel + uint3(3, 0, 0))];

    // Uint4ToUint places its x argument in the lowest byte, so the texels land
    // in the buffer in left-to-right order.
    BufferCopyDst[dst_idx] = Uint4ToUint(uint4(src_1, src_2, src_3, src_4));
}
46 changes: 34 additions & 12 deletions src/backend/dx11/src/conv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,62 +24,84 @@ pub fn map_index_type(ty: IndexType) -> DXGI_FORMAT {
}
}

pub fn typeless_format(format: DXGI_FORMAT) -> Option<DXGI_FORMAT> {
pub fn typeless_format(format: DXGI_FORMAT) -> Option<(DXGI_FORMAT, DXGI_FORMAT)> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the return value semantics now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It now returns a pair: the `_TYPELESS` format and the fully qualified format used for the UAV (e.g. `R32_UINT`).

match format {
DXGI_FORMAT_R8G8B8A8_UNORM |
DXGI_FORMAT_R8G8B8A8_SNORM |
DXGI_FORMAT_R8G8B8A8_UINT |
DXGI_FORMAT_R8G8B8A8_SINT |
DXGI_FORMAT_R8G8B8A8_UNORM_SRGB => Some(DXGI_FORMAT_R8G8B8A8_TYPELESS),
DXGI_FORMAT_R8G8B8A8_UNORM_SRGB => Some((DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UINT)),

// ?`
DXGI_FORMAT_B8G8R8A8_UNORM |
DXGI_FORMAT_B8G8R8A8_UNORM_SRGB => Some(DXGI_FORMAT_B8G8R8A8_TYPELESS),
DXGI_FORMAT_B8G8R8A8_UNORM_SRGB => Some((DXGI_FORMAT_B8G8R8A8_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_R8_UNORM |
DXGI_FORMAT_R8_SNORM |
DXGI_FORMAT_R8_UINT |
DXGI_FORMAT_R8_SINT => Some(DXGI_FORMAT_R8_TYPELESS),
DXGI_FORMAT_R8_SINT => Some((DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UINT)),

DXGI_FORMAT_R8G8_UNORM |
DXGI_FORMAT_R8G8_SNORM |
DXGI_FORMAT_R8G8_UINT |
DXGI_FORMAT_R8G8_SINT => Some(DXGI_FORMAT_R8G8_TYPELESS),
DXGI_FORMAT_R8G8_SINT => Some((DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UINT)),

DXGI_FORMAT_R16_UNORM |
DXGI_FORMAT_R16_SNORM |
DXGI_FORMAT_R16_UINT |
DXGI_FORMAT_R16_SINT |
DXGI_FORMAT_R16_FLOAT => Some(DXGI_FORMAT_R16_TYPELESS),
DXGI_FORMAT_R16_FLOAT => Some((DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UINT)),

DXGI_FORMAT_R16G16_UNORM |
DXGI_FORMAT_R16G16_SNORM |
DXGI_FORMAT_R16G16_UINT |
DXGI_FORMAT_R16G16_SINT |
DXGI_FORMAT_R16G16_FLOAT => Some(DXGI_FORMAT_R16G16_TYPELESS),
DXGI_FORMAT_R16G16_FLOAT => Some((DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UINT)),

DXGI_FORMAT_R16G16B16A16_UNORM |
DXGI_FORMAT_R16G16B16A16_SNORM |
DXGI_FORMAT_R16G16B16A16_UINT |
DXGI_FORMAT_R16G16B16A16_SINT |
DXGI_FORMAT_R16G16B16A16_FLOAT => Some(DXGI_FORMAT_R16G16B16A16_TYPELESS),
DXGI_FORMAT_R16G16B16A16_FLOAT => Some((DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UINT)),

DXGI_FORMAT_D32_FLOAT |
DXGI_FORMAT_R32_UINT |
DXGI_FORMAT_R32_SINT |
DXGI_FORMAT_R32_FLOAT => Some(DXGI_FORMAT_R32_TYPELESS),
DXGI_FORMAT_R32_FLOAT => Some((DXGI_FORMAT_R32_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_R32G32_UINT |
DXGI_FORMAT_R32G32_SINT |
DXGI_FORMAT_R32G32_FLOAT => Some(DXGI_FORMAT_R32G32_TYPELESS),
DXGI_FORMAT_R32G32_FLOAT => Some((DXGI_FORMAT_R32G32_TYPELESS, DXGI_FORMAT_R32G32_UINT)),

DXGI_FORMAT_R32G32B32_UINT |
DXGI_FORMAT_R32G32B32_SINT |
DXGI_FORMAT_R32G32B32_FLOAT => Some(DXGI_FORMAT_R32G32B32_TYPELESS),
DXGI_FORMAT_R32G32B32_FLOAT => Some((DXGI_FORMAT_R32G32B32_TYPELESS, DXGI_FORMAT_R32G32B32_UINT)),

DXGI_FORMAT_R32G32B32A32_UINT |
DXGI_FORMAT_R32G32B32A32_SINT |
DXGI_FORMAT_R32G32B32A32_FLOAT => Some(DXGI_FORMAT_R32G32B32A32_TYPELESS),
DXGI_FORMAT_R32G32B32A32_FLOAT => Some((DXGI_FORMAT_R32G32B32A32_TYPELESS, DXGI_FORMAT_R32G32B32A32_UINT)),

DXGI_FORMAT_BC1_UNORM |
DXGI_FORMAT_BC1_UNORM_SRGB => Some((DXGI_FORMAT_BC1_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_BC2_UNORM |
DXGI_FORMAT_BC2_UNORM_SRGB => Some((DXGI_FORMAT_BC2_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_BC3_UNORM |
DXGI_FORMAT_BC3_UNORM_SRGB => Some((DXGI_FORMAT_BC3_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_BC4_UNORM |
DXGI_FORMAT_BC4_SNORM => Some((DXGI_FORMAT_BC4_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_BC5_UNORM |
DXGI_FORMAT_BC5_SNORM => Some((DXGI_FORMAT_BC5_TYPELESS, DXGI_FORMAT_R32_UINT)),

DXGI_FORMAT_BC6H_UF16 |
DXGI_FORMAT_BC6H_SF16 => Some((DXGI_FORMAT_BC6H_TYPELESS, DXGI_FORMAT_R32_UINT)),

// TODO: srgb craziness
DXGI_FORMAT_BC7_UNORM |
DXGI_FORMAT_BC7_UNORM_SRGB => Some((DXGI_FORMAT_BC7_TYPELESS, DXGI_FORMAT_BC7_UNORM)),

/*R5g6b5Unorm => DXGI_FORMAT_B5G6R5_UNORM,
R5g5b5a1Unorm => DXGI_FORMAT_B5G5R5A1_UNORM,
Expand Down
Loading