Skip to content

Vulkan: Automatically merge render passes to the same target when possible #12242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Core/Compatibility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
CheckSetting(iniFile, gameID, "DisableAccurateDepth", &flags_.DisableAccurateDepth);
CheckSetting(iniFile, gameID, "MGS2AcidHack", &flags_.MGS2AcidHack);
CheckSetting(iniFile, gameID, "SonicRivalsHack", &flags_.SonicRivalsHack);
CheckSetting(iniFile, gameID, "RenderPassMerge", &flags_.RenderPassMerge);
CheckSetting(iniFile, gameID, "BlockTransferAllowCreateFB", &flags_.BlockTransferAllowCreateFB);
CheckSetting(iniFile, gameID, "YugiohSaveFix", &flags_.YugiohSaveFix);
CheckSetting(iniFile, gameID, "ForceUMDDelay", &flags_.ForceUMDDelay);
Expand Down
1 change: 1 addition & 0 deletions Core/Compatibility.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ struct CompatFlags {
bool DisableAccurateDepth;
bool MGS2AcidHack;
bool SonicRivalsHack;
bool RenderPassMerge;
bool BlockTransferAllowCreateFB;
bool YugiohSaveFix;
bool ForceUMDDelay;
Expand Down
1 change: 1 addition & 0 deletions GPU/Vulkan/DebugVisVulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,6 @@ void DrawProfilerVis(UIContext *ui, GPUInterface *gpu) {
Draw::DrawContext *draw = ui->GetDrawContext();
ui->SetFontScale(0.4f, 0.4f);
ui->DrawTextShadow(text.c_str(), 10, 50, 0xFFFFFFFF, FLAG_DYNAMIC_ASCII);
ui->SetFontScale(1.0f, 1.0f);
ui->Flush();
}
3 changes: 3 additions & 0 deletions GPU/Vulkan/GPU_Vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,9 @@ void GPU_Vulkan::InitDeviceObjects() {
hacks |= QUEUE_HACK_MGS2_ACID;
if (PSP_CoreParameter().compat.flags().SonicRivalsHack)
hacks |= QUEUE_HACK_SONIC;
if (PSP_CoreParameter().compat.flags().RenderPassMerge)
hacks |= QUEUE_HACK_RENDERPASS_MERGE;

if (hacks) {
rm->GetQueueRunner()->EnableHacks(hacks);
}
Expand Down
36 changes: 36 additions & 0 deletions assets/compat.ini
Original file line number Diff line number Diff line change
Expand Up @@ -633,3 +633,39 @@ NPUH10047 = true
ULAS42214 = true
ULJS19054 = true
NPJH50184 = true

[RenderPassMerge]
UCJS10114 = true
UCKS45084 = true
# God of War: Ghost of Sparta
UCUS98737 = true
UCAS40323 = true
NPHG00092 = true
NPEG00044 = true
NPEG00045 = true
NPJG00120 = true
NPUG80508 = true
UCJS10114 = true
UCES01401 = true
UCES01473 = true
# God of War: Ghost of Sparta (demo)
NPEG90035 = true
NPUG70125 = true
NPJG90095 = true
# God of War: Chains of Olympus
UCAS40198 = true
UCUS98653 = true
UCES00842 = true
ULJM05438 = true
ULJM05348 = true
UCKS45084 = true
NPUG80325 = true
NPEG00023 = true
NPHG00027 = true
NPHG00028 = true
NPJH50170 = true
UCET00844 = true
# God of War: Chains of Olympus (demo)
UCUS98705 = true
UCED00971 = true
UCUS98713 = true
93 changes: 82 additions & 11 deletions ext/native/thin3d/VulkanQueueRunner.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <map>
#include "DataFormat.h"
#include "VulkanQueueRunner.h"
#include "VulkanRenderManager.h"
Expand Down Expand Up @@ -379,7 +380,6 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &st
// Planned optimizations:
// * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes
// as early as possible in the frame (Wipeout billboards).
// * Merge subsequent render passes to the same target that are interspersed with unrelated draws to other render targets (God of War).

for (int j = 0; j < (int)steps.size(); j++) {
if (steps[j]->stepType == VKRStepType::RENDER &&
Expand Down Expand Up @@ -438,6 +438,9 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &st
if (hacksEnabled_ & QUEUE_HACK_SONIC) {
ApplySonicHack(steps);
}
if (hacksEnabled_ & QUEUE_HACK_RENDERPASS_MERGE) {
ApplyRenderPassMerge(steps);
}
}

for (size_t i = 0; i < steps.size(); i++) {
Expand Down Expand Up @@ -717,10 +720,10 @@ std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
snprintf(buffer, sizeof(buffer), "Blit (%dx%d->%dx%d)", step.blit.srcRect.extent.width, step.blit.srcRect.extent.height, step.blit.dstRect.extent.width, step.blit.dstRect.extent.height);
break;
case VKRStepType::READBACK:
snprintf(buffer, sizeof(buffer), "Readback (%dx%d)", step.readback.srcRect.extent.width, step.readback.srcRect.extent.height);
snprintf(buffer, sizeof(buffer), "Readback (%dx%d, fb: %p)", step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, step.readback.src);
break;
case VKRStepType::READBACK_IMAGE:
snprintf(buffer, sizeof(buffer), "ReadbackImage");
snprintf(buffer, sizeof(buffer), "ReadbackImage (%dx%d)", step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height);
break;
case VKRStepType::RENDER_SKIP:
snprintf(buffer, sizeof(buffer), "(SKIPPED RenderPass)");
Expand All @@ -732,6 +735,74 @@ std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
return std::string(buffer);
}

// Folds later render passes into the first render pass that targets the same framebuffer,
// as long as nothing in between reads or writes that framebuffer.
//
// In principle this is cheap enough to run for every game, and on mobile GPUs it should be
// either neutral or a win. The catch is that the dependency tracking has to be exactly right,
// otherwise draws get reordered across something that needed their result.
void VulkanQueueRunner::ApplyRenderPassMerge(std::vector<VKRStep *> &steps) {
	// Tally the render steps per framebuffer. Only framebuffers that are rendered to more
	// than once are worth scanning for. This can help God of War quite a bit.
	std::map<VKRFramebuffer *, int> renderCounts;
	for (VKRStep *step : steps) {
		if (step->stepType == VKRStepType::RENDER) {
			++renderCounts[step->render.framebuffer];
		}
	}

	const int numSteps = static_cast<int>(steps.size());
	for (int first = 0; first < numSteps; first++) {
		VKRStep *mergeTarget = steps[first];
		if (mergeTarget->stepType != VKRStepType::RENDER || renderCounts[mergeTarget->render.framebuffer] <= 1) {
			continue;
		}

		VKRFramebuffer *targetFb = mergeTarget->render.framebuffer;
		// Framebuffers written by unrelated passes seen so far in this scan.
		// Must use the same fast-size as VKRStep::dependencies, since the set-vs-set
		// contains() overload only accepts the identical TinySet instantiation.
		TinySet<VKRFramebuffer *, 8> writtenSinceTarget;

		// Walk forward from the target pass, absorbing later passes to the same framebuffer
		// until something forces us to stop.
		bool keepScanning = true;
		for (int next = first + 1; keepScanning && next < numSteps; next++) {
			VKRStep *candidate = steps[next];
			switch (candidate->stepType) {
			case VKRStepType::RENDER:
				// A pass that samples the target framebuffer needs it as it is right now.
				// Likewise, a pass that depends on a framebuffer written since the target
				// can't be crossed (Unknown's example case from
				// https://github.com/hrydgard/ppsspp/pull/12242).
				if (candidate->dependencies.contains(targetFb) || candidate->dependencies.contains(writtenSinceTarget)) {
					keepScanning = false;
					break;
				}
				if (candidate->render.framebuffer == targetFb) {
					// Same target: take over its pretransitions and commands, then
					// neutralize the now-empty step.
					mergeTarget->preTransitions.insert(mergeTarget->preTransitions.end(), candidate->preTransitions.begin(), candidate->preTransitions.end());
					mergeTarget->commands.insert(mergeTarget->commands.end(), candidate->commands.begin(), candidate->commands.end());
					candidate->stepType = VKRStepType::RENDER_SKIP;
				} else {
					// Different target: remember it, later passes depending on it block merging.
					writtenSinceTarget.insert(candidate->render.framebuffer);
				}
				break;
			case VKRStepType::COPY:
				if (candidate->copy.src == targetFb || candidate->copy.dst == targetFb) {
					keepScanning = false;
				}
				break;
			case VKRStepType::BLIT:
				if (candidate->blit.src == targetFb || candidate->blit.dst == targetFb) {
					keepScanning = false;
				}
				break;
			case VKRStepType::READBACK:
				// Probably has little effect: when executed, READBACK is always the last
				// step, since we stall the GPU and wait immediately after.
				if (candidate->readback.src == targetFb) {
					keepScanning = false;
				}
				break;
			default:
				// Other step types neither block nor take part in merging.
				break;
			}
		}
	}
}

void VulkanQueueRunner::LogSteps(const std::vector<VKRStep *> &steps) {
ILOG("=======================================");
for (size_t i = 0; i < steps.size(); i++) {
Expand Down Expand Up @@ -804,20 +875,20 @@ void VulkanQueueRunner::LogRenderPass(const VKRStep &pass) {
ILOG("RenderPass End(%x)", fb);
}

void VulkanQueueRunner::LogCopy(const VKRStep &pass) {
ILOG("Copy()");
// Logs a copy step using the shared StepToString() description.
void VulkanQueueRunner::LogCopy(const VKRStep &step) {
	const std::string description = StepToString(step);
	ILOG("%s", description.c_str());
}

void VulkanQueueRunner::LogBlit(const VKRStep &pass) {
ILOG("Blit()");
// Logs a blit step using the shared StepToString() description.
void VulkanQueueRunner::LogBlit(const VKRStep &step) {
	const std::string description = StepToString(step);
	ILOG("%s", description.c_str());
}

void VulkanQueueRunner::LogReadback(const VKRStep &pass) {
ILOG("Readback");
// Logs a readback step using the shared StepToString() description.
void VulkanQueueRunner::LogReadback(const VKRStep &step) {
	const std::string description = StepToString(step);
	ILOG("%s", description.c_str());
}

void VulkanQueueRunner::LogReadbackImage(const VKRStep &pass) {
ILOG("ReadbackImage");
// Logs an image readback step using the shared StepToString() description.
void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) {
	const std::string description = StepToString(step);
	ILOG("%s", description.c_str());
}

void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd) {
Expand Down
70 changes: 70 additions & 0 deletions ext/native/thin3d/VulkanQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,72 @@ struct VKRImage;
// Bit flags for per-game queue hacks. They are OR'ed together and handed to
// VulkanQueueRunner::EnableHacks(), so every value must be a distinct bit.
enum {
	QUEUE_HACK_MGS2_ACID = 1 << 0,
	QUEUE_HACK_SONIC = 1 << 1,
	// Killzone PR = 4 (1 << 2).
	QUEUE_HACK_RENDERPASS_MERGE = 1 << 3,
};

// Insert-only small-set implementation. Performs no allocation unless MaxFastSize is exceeded.
//
// The first MaxFastSize distinct elements live in an inline array. Anything beyond that
// spills into a std::vector that is allocated on first use. All lookups are linear scans,
// so this is only meant for small element counts.
//
// T must be default-constructible, copy-assignable and comparable with ==.
template <class T, int MaxFastSize>
struct TinySet {
	TinySet() = default;

	// The set owns slowLookup_, so a copy must duplicate the spill vector. Sharing the
	// pointer (what the implicit copy would do) leads to a double delete.
	TinySet(const TinySet &other) : fastCount(other.fastCount) {
		for (int i = 0; i < fastCount; i++)
			fastLookup_[i] = other.fastLookup_[i];
		if (other.slowLookup_)
			slowLookup_ = new std::vector<T>(*other.slowLookup_);
	}

	// Moving steals the spill vector and leaves `other` empty.
	TinySet(TinySet &&other) : fastCount(other.fastCount), slowLookup_(other.slowLookup_) {
		for (int i = 0; i < fastCount; i++)
			fastLookup_[i] = other.fastLookup_[i];
		other.fastCount = 0;
		other.slowLookup_ = nullptr;
	}

	TinySet &operator=(const TinySet &other) {
		if (this != &other) {
			// Allocate before releasing our own storage, so a failed allocation
			// doesn't leave slowLookup_ dangling.
			std::vector<T> *slowCopy = other.slowLookup_ ? new std::vector<T>(*other.slowLookup_) : nullptr;
			delete slowLookup_;
			slowLookup_ = slowCopy;
			fastCount = other.fastCount;
			for (int i = 0; i < fastCount; i++)
				fastLookup_[i] = other.fastLookup_[i];
		}
		return *this;
	}

	TinySet &operator=(TinySet &&other) {
		if (this != &other) {
			delete slowLookup_;
			slowLookup_ = other.slowLookup_;
			fastCount = other.fastCount;
			for (int i = 0; i < fastCount; i++)
				fastLookup_[i] = other.fastLookup_[i];
			other.fastCount = 0;
			other.slowLookup_ = nullptr;
		}
		return *this;
	}

	~TinySet() { delete slowLookup_; }

	// Adds t to the set. Does nothing if an equal element is already present.
	void insert(T t) {
		// Fast linear scan.
		for (int i = 0; i < fastCount; i++) {
			if (fastLookup_[i] == t)
				return;  // We already have it.
		}
		// Fast insertion
		if (fastCount < MaxFastSize) {
			fastLookup_[fastCount++] = t;
			return;
		}
		// Fall back to slow path.
		insertSlow(t);
	}

	// Returns true if an element equal to t has been inserted.
	bool contains(T t) const {
		for (int i = 0; i < fastCount; i++) {
			if (fastLookup_[i] == t)
				return true;
		}
		if (slowLookup_) {
			for (const T &x : *slowLookup_) {
				if (x == t)
					return true;
			}
		}
		return false;
	}

	// Returns true if this set and otherSet share at least one element (i.e. they
	// intersect). Note that otherSet must be the exact same TinySet instantiation,
	// including MaxFastSize.
	bool contains(const TinySet<T, MaxFastSize> &otherSet) const {
		// Awkward, kind of ruins the fun.
		for (int i = 0; i < fastCount; i++) {
			if (otherSet.contains(fastLookup_[i]))
				return true;
		}
		if (slowLookup_) {
			for (const T &x : *slowLookup_) {
				if (otherSet.contains(x))
					return true;
			}
		}
		return false;
	}

private:
	// Inserts into the spill vector, creating it on first use. The caller has already
	// checked the inline array, so only the vector needs a duplicate check here.
	void insertSlow(T t) {
		if (!slowLookup_) {
			slowLookup_ = new std::vector<T>();
		} else {
			for (size_t i = 0; i < slowLookup_->size(); i++) {
				if ((*slowLookup_)[i] == t)
					return;
			}
		}
		slowLookup_->push_back(t);
	}

	T fastLookup_[MaxFastSize];
	int fastCount = 0;
	std::vector<T> *slowLookup_ = nullptr;
};

enum class VKRRenderCommand : uint8_t {
Expand Down Expand Up @@ -109,9 +175,12 @@ struct TransitionRequest {

struct VKRStep {
VKRStep(VKRStepType _type) : stepType(_type) {}
~VKRStep() {}

VKRStepType stepType;
std::vector<VkRenderData> commands;
std::vector<TransitionRequest> preTransitions;
TinySet<VKRFramebuffer *, 8> dependencies;
union {
struct {
VKRFramebuffer *framebuffer;
Expand Down Expand Up @@ -242,6 +311,7 @@ class VulkanQueueRunner {

void ApplyMGSHack(std::vector<VKRStep *> &steps);
void ApplySonicHack(std::vector<VKRStep *> &steps);
void ApplyRenderPassMerge(std::vector<VKRStep *> &steps);

static void SetupTransitionToTransferSrc(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect);
static void SetupTransitionToTransferDst(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect);
Expand Down
6 changes: 5 additions & 1 deletion ext/native/thin3d/VulkanRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -858,7 +858,8 @@ void VulkanRenderManager::BlitFramebuffer(VKRFramebuffer *src, VkRect2D srcRect,
}

VkImageView VulkanRenderManager::BindFramebufferAsTexture(VKRFramebuffer *fb, int binding, int aspectBit, int attachment) {
// Mark the dependency and return the image.
_dbg_assert_(G3D, curRenderStep_ != nullptr);
// Mark the dependency, check for required transitions, and return the image.

for (int i = (int)steps_.size() - 1; i >= 0; i--) {
if (steps_[i]->stepType == VKRStepType::RENDER && steps_[i]->render.framebuffer == fb) {
Expand All @@ -871,6 +872,9 @@ VkImageView VulkanRenderManager::BindFramebufferAsTexture(VKRFramebuffer *fb, in
}
}

// Track dependencies fully.
curRenderStep_->dependencies.insert(fb);

if (!curRenderStep_->preTransitions.empty() &&
curRenderStep_->preTransitions.back().fb == fb &&
curRenderStep_->preTransitions.back().targetLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
Expand Down