Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle guardband clip/cull better for hardware backends #14833

Merged
merged 17 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Common/GPU/D3D11/thin3d_d3d11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,9 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
// Seems like a fair approximation...
caps_.dualSourceBlend = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
caps_.depthClampSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
// SV_ClipDistance# seems to require feature level 10.0+.
caps_.clipDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
caps_.cullDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;

caps_.depthRangeMinusOneToOne = false;
caps_.framebufferBlitSupported = false;
Expand Down Expand Up @@ -1345,7 +1348,7 @@ void D3D11DrawContext::BindSamplerStates(int start, int count, SamplerState **st
_assert_(start + count <= ARRAY_SIZE(samplers));
for (int i = 0; i < count; i++) {
D3D11SamplerState *samp = (D3D11SamplerState *)states[i];
samplers[i] = samp->ss;
samplers[i] = samp ? samp->ss : nullptr;
}
context_->PSSetSamplers(start, count, samplers);
}
Expand Down
3 changes: 2 additions & 1 deletion Common/GPU/D3D9/thin3d_d3d9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,8 @@ class D3D9Context : public DrawContext {
_assert_(start + count <= MAX_BOUND_TEXTURES);
for (int i = 0; i < count; ++i) {
D3D9SamplerState *s = static_cast<D3D9SamplerState *>(states[i]);
s->Apply(device_, start + i);
if (s)
s->Apply(device_, start + i);
}
}
void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override {
Expand Down
1 change: 1 addition & 0 deletions Common/GPU/OpenGL/GLFeatures.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ void CheckGLExtensions() {
gl_extensions.OES_texture_float = g_set_gl_extensions.count("GL_OES_texture_float") != 0;
gl_extensions.EXT_buffer_storage = g_set_gl_extensions.count("GL_EXT_buffer_storage") != 0;
gl_extensions.EXT_clip_cull_distance = g_set_gl_extensions.count("GL_EXT_clip_cull_distance") != 0;
gl_extensions.APPLE_clip_distance = g_set_gl_extensions.count("GL_APPLE_clip_distance") != 0;

#if defined(__ANDROID__)
// On Android, incredibly, this is not consistently non-zero! It does seem to have the same value though.
Expand Down
3 changes: 3 additions & 0 deletions Common/GPU/OpenGL/GLFeatures.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ struct GLExtensions {
// ARM
bool ARM_shader_framebuffer_fetch;

// APPLE
bool APPLE_clip_distance;

// EGL
bool EGL_NV_system_time;
bool EGL_NV_coverage_sample;
Expand Down
17 changes: 17 additions & 0 deletions Common/GPU/OpenGL/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
#include "GLRenderManager.h"
#include "DataFormatGL.h"

// GL_CLIP_DISTANCE0 and GL_CLIP_DISTANCE0_EXT are the same value, so alias them for simplicity.
// If the GL headers provide neither name, fall back to defining the raw value directly.
#if defined(GL_CLIP_DISTANCE0_EXT) && !defined(GL_CLIP_DISTANCE0)
#define GL_CLIP_DISTANCE0 GL_CLIP_DISTANCE0_EXT
#elif !defined(GL_CLIP_DISTANCE0)
#define GL_CLIP_DISTANCE0 0x3000
#endif

static constexpr int TEXCACHE_NAME_CACHE_SIZE = 16;

#if PPSSPP_PLATFORM(IOS)
Expand Down Expand Up @@ -798,6 +805,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
int logicOp = -1;
bool logicEnabled = false;
#endif
bool clipDistance0Enabled = false;
GLuint blendEqColor = (GLuint)-1;
GLuint blendEqAlpha = (GLuint)-1;

Expand Down Expand Up @@ -1106,6 +1114,13 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
{
if (curProgram != c.program.program) {
glUseProgram(c.program.program->program);
if (c.program.program->use_clip_distance0 != clipDistance0Enabled) {
if (c.program.program->use_clip_distance0)
glEnable(GL_CLIP_DISTANCE0);
else
glDisable(GL_CLIP_DISTANCE0);
clipDistance0Enabled = c.program.program->use_clip_distance0;
}
curProgram = c.program.program;
}
CHECK_GL_ERROR_IF_DEBUG();
Expand Down Expand Up @@ -1340,6 +1355,8 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
glDisable(GL_COLOR_LOGIC_OP);
}
#endif
if (clipDistance0Enabled)
glDisable(GL_CLIP_DISTANCE0);
if ((colorMask & 15) != 15)
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
CHECK_GL_ERROR_IF_DEBUG();
Expand Down
4 changes: 3 additions & 1 deletion Common/GPU/OpenGL/GLRenderManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ class GLRProgram {
std::vector<Semantic> semantics_;
std::vector<UniformLocQuery> queries_;
std::vector<Initializer> initialize_;
bool use_clip_distance0 = false;

struct UniformInfo {
int loc_;
Expand Down Expand Up @@ -422,13 +423,14 @@ class GLRenderManager {
// not be an active render pass.
GLRProgram *CreateProgram(
std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries,
std::vector<GLRProgram::Initializer> initalizers, bool supportDualSource) {
std::vector<GLRProgram::Initializer> initalizers, bool supportDualSource, bool useClipDistance0) {
GLRInitStep step{ GLRInitStepType::CREATE_PROGRAM };
_assert_(shaders.size() <= ARRAY_SIZE(step.create_program.shaders));
step.create_program.program = new GLRProgram();
step.create_program.program->semantics_ = semantics;
step.create_program.program->queries_ = queries;
step.create_program.program->initialize_ = initalizers;
step.create_program.program->use_clip_distance0 = useClipDistance0;
step.create_program.support_dual_source = supportDualSource;
_assert_msg_(shaders.size() > 0, "Can't create a program with zero shaders");
for (size_t i = 0; i < shaders.size(); i++) {
Expand Down
9 changes: 8 additions & 1 deletion Common/GPU/OpenGL/thin3d_gl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,13 @@ OpenGLContext::OpenGLContext() {
caps_.framebufferBlitSupported = gl_extensions.NV_framebuffer_blit || gl_extensions.ARB_framebuffer_object;
caps_.framebufferDepthBlitSupported = caps_.framebufferBlitSupported;
caps_.depthClampSupported = gl_extensions.ARB_depth_clamp;
if (gl_extensions.IsGLES) {
caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.APPLE_clip_distance;
caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance;
} else {
caps_.clipDistanceSupported = gl_extensions.VersionGEThan(3, 0);
caps_.cullDistanceSupported = gl_extensions.ARB_cull_distance;
}

// Interesting potential hack for emulating GL_DEPTH_CLAMP (use a separate varying, force depth in fragment shader):
// This will induce a performance penalty on many architectures though so a blanket enable of this
Expand Down Expand Up @@ -1162,7 +1169,7 @@ bool OpenGLPipeline::LinkShaders() {
std::vector<GLRProgram::Initializer> initialize;
for (int i = 0; i < MAX_TEXTURE_SLOTS; ++i)
initialize.push_back({ &samplerLocs_[i], 0, i });
program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false);
program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false, false);
return true;
}

Expand Down
2 changes: 2 additions & 0 deletions Common/GPU/Vulkan/VulkanContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,8 @@ void VulkanContext::ChooseDevice(int physical_device) {
deviceFeatures_.enabled.depthClamp = deviceFeatures_.available.depthClamp;
deviceFeatures_.enabled.depthBounds = deviceFeatures_.available.depthBounds;
deviceFeatures_.enabled.samplerAnisotropy = deviceFeatures_.available.samplerAnisotropy;
deviceFeatures_.enabled.shaderClipDistance = deviceFeatures_.available.shaderClipDistance;
deviceFeatures_.enabled.shaderCullDistance = deviceFeatures_.available.shaderCullDistance;
// For easy wireframe mode, someday.
deviceFeatures_.enabled.fillModeNonSolid = deviceFeatures_.available.fillModeNonSolid;

Expand Down
7 changes: 7 additions & 0 deletions Common/GPU/Vulkan/thin3d_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,8 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
caps_.multiViewport = vulkan->GetDeviceFeatures().enabled.multiViewport != 0;
caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.dualSrcBlend != 0;
caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.depthClamp != 0;
caps_.clipDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderClipDistance != 0;
caps_.cullDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderCullDistance != 0;
caps_.framebufferBlitSupported = true;
caps_.framebufferCopySupported = true;
caps_.framebufferDepthBlitSupported = false; // Can be checked for.
Expand Down Expand Up @@ -816,6 +818,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
} else if (caps_.vendor == GPUVendor::VENDOR_INTEL) {
// Workaround for Intel driver bug. TODO: Re-enable after some driver version
bugs_.Infest(Bugs::DUAL_SOURCE_BLENDING_BROKEN);
} else if (caps_.vendor == GPUVendor::VENDOR_ARM) {
// These GPUs (up to some certain hardware version?) have a bug where draws with gl_Position.w == gl_Position.z
// corrupt the depth buffer. This is easily worked around by simply scaling Z down a tiny bit when this case
// is detected. See: https://github.com/hrydgard/ppsspp/issues/11937
bugs_.Infest(Bugs::EQUAL_WZ_CORRUPTS_DEPTH);
}

caps_.deviceID = deviceProps.deviceID;
Expand Down
3 changes: 3 additions & 0 deletions Common/GPU/thin3d.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ class Bugs {
BROKEN_NAN_IN_CONDITIONAL = 4,
COLORWRITEMASK_BROKEN_WITH_DEPTHTEST = 5,
BROKEN_FLAT_IN_SHADER = 6,
EQUAL_WZ_CORRUPTS_DEPTH = 7,
};

protected:
Expand Down Expand Up @@ -520,6 +521,8 @@ struct DeviceCaps {
bool dualSourceBlend;
bool logicOpSupported;
bool depthClampSupported;
bool clipDistanceSupported;
bool cullDistanceSupported;
bool framebufferCopySupported;
bool framebufferBlitSupported;
bool framebufferDepthCopySupported;
Expand Down
6 changes: 5 additions & 1 deletion Common/UI/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ void UIContext::BeginNoTex() {

void UIContext::BeginPipeline(Draw::Pipeline *pipeline, Draw::SamplerState *samplerState) {
_assert_(pipeline != nullptr);
draw_->BindSamplerStates(0, 1, &samplerState);
// Also clear out any other textures bound.
Draw::SamplerState *samplers[3]{ samplerState };
draw_->BindSamplerStates(0, 3, samplers);
Draw::Texture *textures[2]{};
draw_->BindTextures(1, 2, textures);
RebindTexture();
UIBegin(pipeline);
}
Expand Down
40 changes: 8 additions & 32 deletions GPU/Common/ShaderUniforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,12 @@ void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bo
float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
return (pspViewport * heightScale) - yOffset;
};
auto reverseViewportZ = [hasNegZ](float z) {
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();

float scale, center;
if (gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH)) {
// These are just the reverse of the formulas in GPUStateUtils.
float halfActualZRange = vpZScale * (1.0f / gstate_c.vpDepthScale);
float minz = -((gstate_c.vpZOffset * halfActualZRange) - vpZCenter) - halfActualZRange;

// In accurate depth mode, we're comparing against a value scaled to (minz, maxz).
// And minz might be very negative, (e.g. if we're clamping in that direction.)
scale = halfActualZRange;
center = minz + halfActualZRange;
} else {
// In old-style depth mode, we're comparing against a value scaled to viewport.
// (and possibly incorrectly clipped against it.)
scale = vpZScale;
center = vpZCenter;
auto transformZ = [hasNegZ](float z) {
// Z culling ignores the viewport, so we just redo the projection matrix adjustments.
if (hasNegZ) {
return (z * gstate_c.vpDepthScale) + gstate_c.vpZOffset;
}

float realViewport = (z - center) * (1.0f / scale);
return hasNegZ ? realViewport : (realViewport * 0.5f + 0.5f);
return (z * gstate_c.vpDepthScale * 0.5f) + gstate_c.vpZOffset * 0.5f + 0.5f;
};
auto sortPair = [](float a, float b) {
return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
Expand All @@ -75,7 +58,7 @@ void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bo
// Any vertex outside this range (unless depth clamp enabled) is discarded.
auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
auto z = sortPair(transformZ(-1.000030517578125f), transformZ(1.000030517578125f));
// Since we have space in w, use it to pass the depth clamp flag. We also pass NAN for w "discard".
float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;

Expand Down Expand Up @@ -243,18 +226,11 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
float viewZScale = halfActualZRange * 2.0f;
// Account for the half pixel offset.
float viewZCenter = minz + (DepthSliceFactor() / 256.0f) * 0.5f;
float viewZInvScale;

if (viewZScale != 0.0) {
viewZInvScale = 1.0f / viewZScale;
} else {
viewZInvScale = 0.0;
}

ub->depthRange[0] = viewZScale;
ub->depthRange[1] = viewZCenter;
ub->depthRange[2] = viewZCenter;
ub->depthRange[3] = viewZInvScale;
ub->depthRange[2] = gstate_c.vpZOffset * 0.5f + 0.5f;
ub->depthRange[3] = 2.0f * (1.0f / gstate_c.vpDepthScale);
}

if (dirtyUniforms & DIRTY_CULLRANGE) {
Expand Down
35 changes: 22 additions & 13 deletions GPU/Common/SoftwareTransformCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,8 @@ static void SwapUVs(TransformedVertex &a, TransformedVertex &b) {

// Note: 0 is BR and 2 is TL.

static void RotateUV(TransformedVertex v[4], float flippedMatrix[16], bool flippedY) {
// Transform these two coordinates to figure out whether they're flipped or not.
Vec4f tl;
Vec3ByMatrix44(tl.AsArray(), v[2].pos, flippedMatrix);

Vec4f br;
Vec3ByMatrix44(br.AsArray(), v[0].pos, flippedMatrix);

static void RotateUV(TransformedVertex v[4], Vec4f tl, Vec4f br, bool flippedY) {
// We use the transformed tl/br coordinates to figure out whether they're flipped or not.
float ySign = flippedY ? -1.0 : 1.0;

const float invtlw = 1.0f / tl.w;
Expand Down Expand Up @@ -438,7 +432,7 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
// TODO: This bleeds outside the play area in non-buffered mode. Big deal? Probably not.
// TODO: Allow creating a depth clear and a color draw.
bool reallyAClear = false;
if (maxIndex > 1 && prim == GE_PRIM_RECTANGLES && gstate.isModeClear()) {
if (maxIndex > 1 && prim == GE_PRIM_RECTANGLES && gstate.isModeClear() && throughmode) {
int scissorX2 = gstate.getScissorX2() + 1;
int scissorY2 = gstate.getScissorY2() + 1;
reallyAClear = IsReallyAClear(transformed, maxIndex, scissorX2, scissorY2);
Expand All @@ -465,7 +459,7 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
}

// Detect full screen "clears" that might not be so obvious, to set the safe size if possible.
if (!result->setSafeSize && prim == GE_PRIM_RECTANGLES && maxIndex == 2) {
if (!result->setSafeSize && prim == GE_PRIM_RECTANGLES && maxIndex == 2 && throughmode) {
bool clearingColor = gstate.isModeClear() && (gstate.isClearModeColorMask() || gstate.isClearModeAlphaMask());
bool writingColor = gstate.getColorMask() != 0xFFFFFFFF;
bool startsZeroX = transformed[0].x <= 0.0f && transformed[1].x > 0.0f && transformed[1].x > transformed[0].x;
Expand Down Expand Up @@ -629,10 +623,25 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
trans[3].u = transVtxTL.u;

// That's the four corners. Now process UV rotation.
if (throughmode)
if (throughmode) {
RotateUVThrough(trans);
else
RotateUV(trans, flippedMatrix, flippedY);
} else {
Vec4f tl;
Vec3ByMatrix44(tl.AsArray(), transVtxTL.pos, flippedMatrix);
Vec4f br;
Vec3ByMatrix44(br.AsArray(), transVtxBR.pos, flippedMatrix);

// If both transformed verts are outside Z, cull this rectangle entirely.
constexpr float outsideValue = 1.000030517578125f;
bool tlOutside = fabsf(tl.z / tl.w) >= outsideValue;
bool brOutside = fabsf(br.z / br.w) >= outsideValue;
if (tlOutside && brOutside)
continue;
if (!gstate.isDepthClampEnabled() && (tlOutside || brOutside))
continue;

RotateUV(trans, tl, br, flippedY);
}

// Triangle: BR-TR-TL
indsOut[0] = i * 2 + 0;
Expand Down
Loading