Skip to content

Commit

Permalink
merge fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Hazardu committed Oct 27, 2023
2 parents 5406474 + e9194f8 commit b67197d
Show file tree
Hide file tree
Showing 33 changed files with 3,967 additions and 1,894 deletions.
Binary file added .CMakeLists.txt.swp
Binary file not shown.
Binary file added .CMakeLists_BASE_935.txt.swp
Binary file not shown.
Binary file added .CMakeLists_LOCAL_935.txt.swp
Binary file not shown.
Binary file added .CMakeLists_REMOTE_935.txt.swp
Binary file not shown.
12 changes: 6 additions & 6 deletions 34_LRUCacheUnitTest/config.json.template
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
"buildModes": [],
"runConfiguration": "Release",
"gpuArchitectures": [],
"inputs": [
"ext/config/release.json.template"
]
"inputs": [
"ext/config/release.json.template"
]
},
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Debug",
"gpuArchitectures": [],
"inputs": [
"ext/config/debug.json.template"
]
"inputs": [
"ext/config/debug.json.template"
]
}
],
"inputs": [
Expand Down
71 changes: 48 additions & 23 deletions 48_ArithmeticUnitTest/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,10 @@ struct max_op
static inline constexpr const char* name = "max";
};
template<typename T>
struct ballot : add_op<T> {};
struct ballot : add_op<T>
{
static inline constexpr const char* name = "bitcount";
};


//subgroup method emulations on the CPU, to verify the results of the GPU methods
Expand Down Expand Up @@ -181,9 +184,9 @@ struct emulatedWorkgroupScanInclusive
static inline constexpr const char* name = "workgroup inclusive scan";
};


#include "common.glsl"
constexpr uint32_t kBufferSize = (1u+BUFFER_DWORD_COUNT)*sizeof(uint32_t);
#define HLSL
constexpr uint32_t kBufferSize = (1u + BUFFER_DWORD_COUNT) * sizeof(uint32_t);

//returns true if result matches
template<template<class> class Arithmetic, template<class> class OP>
Expand Down Expand Up @@ -216,9 +219,9 @@ bool validateResults(ILogicalDevice* device, IUtilities* utilities, IGPUQueue* t
if (tmp[localInvocationIndex]!=dataFromBuffer[workgroupOffset+localInvocationIndex])
{
logger->log(
"Failed test #%d (%s) (%s) Expected %s got %s",system::ILogger::ELL_ERROR,
"Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d",system::ILogger::ELL_ERROR,
workgroupSize,Arithmetic<OP<uint32_t>>::name,OP<uint32_t>::name,
std::to_string(tmp[localInvocationIndex]),std::to_string(dataFromBuffer[workgroupOffset+localInvocationIndex])
tmp[localInvocationIndex], dataFromBuffer[workgroupOffset + localInvocationIndex], workgroupOffset, localInvocationIndex
);
success = false;
break;
Expand Down Expand Up @@ -297,10 +300,11 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
NON_GRAPHICAL_APP_CONSTRUCTOR(ArythmeticUnitTestApp)
void onAppInitialized_impl() override
{
#pragma region Init
CommonAPI::InitParams initParams;
initParams.apiType = video::EAT_VULKAN;
initParams.appName = { "Subgroup Arithmetic Test" };
auto initOutput = CommonAPI::Init(std::move(initParams));
auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));

apiConnection = std::move(initOutput.apiConnection);
gpuPhysicalDevice = std::move(initOutput.physicalDevice);
Expand All @@ -317,7 +321,8 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
auto transferDownQueue = queues[CommonAPI::InitOutput::EQT_TRANSFER_DOWN];

nbl::video::IGPUObjectFromAssetConverter cpu2gpu;

#pragma endregion Init

inputData = new uint32_t[BUFFER_DWORD_COUNT];
{
std::mt19937 randGenerator(std::time(0));
Expand All @@ -331,7 +336,7 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
auto gpuinputDataBuffer = utilities->createFilledDeviceLocalBufferOnDedMem(queues[decltype(initOutput)::EQT_TRANSFER_UP], std::move(inputDataBufferCreationParams), inputData);

//create 8 buffers.
constexpr const auto totalBufferCount = outputBufferCount + 1u;
constexpr const auto totalBufferCount = outputBufferCount + 1u; // output buffers for all ops +1 for the input buffer

core::smart_refctd_ptr<IGPUBuffer> buffers[outputBufferCount];
for (auto i = 0; i < outputBufferCount; i++)
Expand Down Expand Up @@ -387,24 +392,43 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
assert(!bundle.getContents().empty() && bundle.getAssetType() == IAsset::ET_SPECIALIZED_SHADER);
return core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(*bundle.getContents().begin());
};
core::smart_refctd_ptr<ICPUSpecializedShader> shaderGLSL[] =

core::smart_refctd_ptr<ICPUSpecializedShader> shaders[] =
#ifdef HLSL
{
getShaderGLSL("../hlsl/testSubgroupReduce.comp.hlsl"),
getShaderGLSL("../hlsl/testSubgroupExclusive.comp.hlsl"),
getShaderGLSL("../hlsl/testSubgroupInclusive.comp.hlsl"),
//getShaderGLSL("../hlsl/testWorkgroupReduce.comp.hlsl"),
//getShaderGLSL("../hlsl/testWorkgroupInclusive.comp.hlsl"),
//getShaderGLSL("../hlsl/testWorkgroupExclusive.comp.hlsl"),
};
#else
{
getShaderGLSL("../testSubgroupReduce.comp"),
getShaderGLSL("../testSubgroupReduce.comp")/*,
getShaderGLSL("../testSubgroupExclusive.comp"),
getShaderGLSL("../testSubgroupInclusive.comp"),
getShaderGLSL("../testWorkgroupReduce.comp"),
getShaderGLSL("../testWorkgroupExclusive.comp"),
getShaderGLSL("../testWorkgroupInclusive.comp")
getShaderGLSL("../testWorkgroupInclusive.comp")*/
};
constexpr auto kTestTypeCount = sizeof(shaderGLSL) / sizeof(const void*);
#endif

auto getGPUShader = [&](const ICPUSpecializedShader* shader, uint32_t wg_count) -> auto
constexpr auto kTestTypeCount = sizeof(shaders) / sizeof(const void*);

auto getGPUShader = [&](ICPUSpecializedShader* shader, uint32_t wg_count) -> auto
{
#ifdef HLSL
auto overriddenUnspecialized = CHLSLCompiler::createOverridenCopy(shader->getUnspecialized(), "#define _NBL_HLSL_WORKGROUP_SIZE_ %d\n", wg_count);
auto cs = core::make_smart_refctd_ptr<ICPUSpecializedShader>(std::move(overriddenUnspecialized), std::move(ISpecializedShader::SInfo(nullptr, nullptr, "main")));
return cpu2gpu.getGPUObjectsFromAssets(&cs, &cs + 1, cpu2gpuParams)->front();
#else
auto overridenUnspecialized = CGLSLCompiler::createOverridenCopy(shader->getUnspecialized(), "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n", wg_count);
ISpecializedShader::SInfo specInfo = shader->getSpecializationInfo();
auto cs = core::make_smart_refctd_ptr<ICPUSpecializedShader>(std::move(overridenUnspecialized), std::move(specInfo));
return cpu2gpu.getGPUObjectsFromAssets(&cs, &cs + 1, cpu2gpuParams)->front();
// no need to wait on fences because its only a shader create, does not result in the filling of image or buffers
#endif
};

auto logTestOutcome = [this](bool passed, uint32_t workgroupSize)
Expand All @@ -426,12 +450,13 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
core::smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
logicalDevice->createCommandBuffers(cmdPools[0].get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf);
computeQueue->startCapture();
for (uint32_t workgroupSize=45u; workgroupSize<=1024u; workgroupSize++)
for (uint32_t workgroupSize=45u; workgroupSize<=1024; workgroupSize++)
{
logger->log("Testing Workgroup Size %u", system::ILogger::ELL_INFO, workgroupSize);
core::smart_refctd_ptr<IGPUComputePipeline> pipelines[kTestTypeCount];
for (uint32_t i = 0u; i < kTestTypeCount; i++)
pipelines[i] = logicalDevice->createComputePipeline(nullptr, core::smart_refctd_ptr(pipelineLayout), std::move(getGPUShader(shaderGLSL[i].get(), workgroupSize)));

for (uint32_t i = 0u; i < kTestTypeCount; i++) {
pipelines[i] = logicalDevice->createComputePipeline(nullptr, core::smart_refctd_ptr(pipelineLayout), std::move(getGPUShader(shaders[i].get(), workgroupSize)));
}
bool passed = true;

const video::IGPUDescriptorSet* ds = descriptorSet.get();
Expand All @@ -441,12 +466,12 @@ class ArythmeticUnitTestApp : public NonGraphicalApplicationBase
logTestOutcome(passed, workgroupSize);
passed = runTest<emulatedSubgroupScanInclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[2u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get()) && passed;
logTestOutcome(passed, workgroupSize);
passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[3u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
logTestOutcome(passed, workgroupSize);
passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[4u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
logTestOutcome(passed, workgroupSize);
passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[5u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
logTestOutcome(passed, workgroupSize);
//passed = runTest<emulatedWorkgroupReduction>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[3u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
//logTestOutcome(passed, workgroupSize);
//passed = runTest<emulatedWorkgroupScanInclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[4u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
//logTestOutcome(passed, workgroupSize);
//passed = runTest<emulatedWorkgroupScanExclusive>(logicalDevice.get(), utilities.get(), transferDownQueue, computeQueue, fence.get(), cmdbuf.get(), pipelines[5u].get(), descriptorSet.get(), inputData, workgroupSize, buffers, logger.get(), true) && passed;
//logTestOutcome(passed, workgroupSize);
}
computeQueue->endCapture();
}
Expand Down
48 changes: 0 additions & 48 deletions 48_ArithmeticUnitTest/shaderCommon.glsl

This file was deleted.

29 changes: 0 additions & 29 deletions 48_ArithmeticUnitTest/subgroupCommon.glsl

This file was deleted.

10 changes: 0 additions & 10 deletions 48_ArithmeticUnitTest/workgroupCommon.glsl

This file was deleted.

22 changes: 12 additions & 10 deletions 51_RadixSort/config.json.template
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
},
"profiles": [
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release",
"gpuArchitectures": []
"gpuArchitectures": [],
"inputs": []
},
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Debug",
"gpuArchitectures": [],
"inputs": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
"inputs": []
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
},
"profiles": [
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release",
"gpuArchitectures": []
"gpuArchitectures": [],
"inputs": []
},
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Debug",
"gpuArchitectures": [],
"inputs": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
"inputs": []
}
15 changes: 15 additions & 0 deletions 62_CAD/TransparencyAndAA.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
The Algorithm used here for AntiAliasing and Transparency for 2D Lines and Curves relies on `VK_EXT_fragment_shader_interlock`; enabling this capability provides a critical section for fragment shaders to avoid overlapping pixels being processed at the same time, and certain guarantees about the ordering of fragment shader invocations of fragments of overlapping pixels.

Such a guarantee is useful for applications like blending in the fragment shader, where an application requires fragment values to be composited in the framebuffer in primitive order.

One example is programmable blending in the fragment shader, where the destination buffer is read via image loads and the final value is written via image stores.

The alpha value isn't the only thing we store in our R32_UINT texture. We also store an `object id` so that we can avoid an object's self-intersections (think of polylines crossing over themselves), with 24 bits for the ID and 8 bits for alpha.

In more detail:
1. Every fragment being processed checks if its object id is the same as the one in the Read/Write R32_UINT Texture.
- if it's the same then it does a MAX operation between the currently calculated alpha and the one existing in the Texture.
- if it's not the same then it **resolves**:
- It renders the pixel using the "Previous!" object's style that was in the texture

2. There is a last fullscreen pass that resolves anything unresolved
15 changes: 15 additions & 0 deletions 62_CAD/TransparencyAndAA.md.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
The Algorithm used here for AntiAliasing and Transparency for 2D Lines and Curves relies on `VK_EXT_fragment_shader_interlock`; enabling this capability provides a critical section for fragment shaders to avoid overlapping pixels being processed at the same time, and certain guarantees about the ordering of fragment shader invocations of fragments of overlapping pixels.

Such a guarantee is useful for applications like blending in the fragment shader, where an application requires that fragment values to be composited in the framebuffer in primitive order.

For example Programmable blending operations in the fragment shader, where the destination buffer is read via image loads and the final value is written via image stores.

Alpha value isn't the only thing we store in our R32_UINT texture. We also store an `object id` so that we can avoid objects self intersections (think of polylines crossing over themselves) with 24 bits for ID and 8 bits for alpha.

In more details:
1. Every fragment being processed checks if it's object id is the same as the one in the Read/Write R32_UINT Texture.
- if it's the same then it does a MAX operation from the current calculate alpha and the one existing in the Texture.
- if it's not the same then it **resolves**:
- It renders the pixel using the "Previous!" object's style that was in the texture

2. There is a last fullscreen pass that resolves anything unresolved
Loading

0 comments on commit b67197d

Please sign in to comment.