From 87042169c3a2f1d0b48f7b0d81e6a2c28afcd7bf Mon Sep 17 00:00:00 2001 From: Alan Tse Date: Sat, 30 May 2026 00:02:02 -0700 Subject: [PATCH] refactor(llf): align cluster light cap with index pool Two related cleanups to the clustered light-culling pass; no observable behavior change in any realistic scene. 1. Align the per-cluster cap. MAX_CLUSTER_LIGHTS (Common.hlsli) was 256, while the C++ side sizes the index pool as clusterCount * CLUSTER_MAX_LIGHTS with CLUSTER_MAX_LIGHTS = 128. The two constants represent the same quantity; set the shader cap to 128 and cross-reference both sides. Overrun was effectively unreachable (global pool, mostly-empty clusters), so this is consistency hardening, not a user-visible fix. Also halves the per-thread visibleLightIndices[] indexable-temp array. 2. Remove dead groupshared staging. The sharedLights copy was written but never read in any commit (a name collision on the pezcode port), so the copy and the barriers guarding it were dead code. It could not have worked anyway: Light[GROUP_SIZE] is 96 KB, which exceeds the 32 KB cs_5_0 LDS limit, so a live read would not compile. fxc already dead-stripped it; the only DXBC delta is the 256->128 cap. Verified with standalone fxc cs_5_0 compiles (flat + VR) and before/after disassembly diff. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Shaders/LightLimitFix/ClusterCullingCS.hlsl | 16 ++++------------ .../Shaders/LightLimitFix/Common.hlsli | 5 ++++- src/Features/LightLimitFix.h | 3 +++ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/features/Light Limit Fix/Shaders/LightLimitFix/ClusterCullingCS.hlsl b/features/Light Limit Fix/Shaders/LightLimitFix/ClusterCullingCS.hlsl index 53ac13fc19..2132142131 100644 --- a/features/Light Limit Fix/Shaders/LightLimitFix/ClusterCullingCS.hlsl +++ b/features/Light Limit Fix/Shaders/LightLimitFix/ClusterCullingCS.hlsl @@ -18,8 +18,6 @@ RWStructuredBuffer lightIndexCounter : register(u0); RWStructuredBuffer lightIndexList : register(u1); RWStructuredBuffer lightGrid : register(u2); -groupshared Light sharedLights[GROUP_SIZE]; - bool LightIntersectsCluster(float3 position, float radiusSquared, ClusterAABB cluster) { float3 closest = max(cluster.minPoint.xyz, min(position, cluster.maxPoint.xyz)); @@ -42,14 +40,10 @@ bool LightIntersectsCluster(float3 position, float radiusSquared, ClusterAABB cl ClusterAABB cluster = clusters[clusterIndex]; - if (groupIndex < LightCount) { - uint lightIndex = groupIndex; - Light light = lights[lightIndex]; - sharedLights[groupIndex] = light; - } - - GroupMemoryBarrierWithGroupSync(); - + // Threads read the global lights buffer directly (cached); with no + // inter-thread sharing there is nothing to synchronize, so no barriers. Dead + // groupshared staging was removed here -- do not re-add: Light[GROUP_SIZE] is + // 96 KB, over the 32 KB LDS limit, so a live read would not compile. 
for (uint i = 0; i < LightCount; i++) { Light light = lights[i]; @@ -74,8 +68,6 @@ bool LightIntersectsCluster(float3 position, float radiusSquared, ClusterAABB cl } } - GroupMemoryBarrierWithGroupSync(); - uint offset = 0; InterlockedAdd(lightIndexCounter[0], visibleLightCount, offset); diff --git a/features/Light Limit Fix/Shaders/LightLimitFix/Common.hlsli b/features/Light Limit Fix/Shaders/LightLimitFix/Common.hlsli index 1884d76388..ed7bbe3ea7 100644 --- a/features/Light Limit Fix/Shaders/LightLimitFix/Common.hlsli +++ b/features/Light Limit Fix/Shaders/LightLimitFix/Common.hlsli @@ -5,7 +5,10 @@ #define NUMTHREAD_Y 16 #define NUMTHREAD_Z 4 #define GROUP_SIZE (NUMTHREAD_X * NUMTHREAD_Y * NUMTHREAD_Z) -#define MAX_CLUSTER_LIGHTS 256 +// Per-cluster light cap. MUST match LightLimitFix::CLUSTER_MAX_LIGHTS: the C++ +// side sizes the global lightIndexList pool as clusterCount * that value, so a +// larger cap here can overrun the pool. +#define MAX_CLUSTER_LIGHTS 128 namespace LightFlags { diff --git a/src/Features/LightLimitFix.h b/src/Features/LightLimitFix.h index 7583352f4f..09ec48aac1 100644 --- a/src/Features/LightLimitFix.h +++ b/src/Features/LightLimitFix.h @@ -12,6 +12,9 @@ struct LightLimitFix : OverlayFeature { private: static constexpr uint32_t MAX_LIGHTS = 1024; + // Per-cluster visible-light cap; sizes the global lightIndexList pool as + // clusterCount * CLUSTER_MAX_LIGHTS. MUST match MAX_CLUSTER_LIGHTS in the + // shader-side Common.hlsli or the cull pass can overrun the pool. static constexpr uint32_t CLUSTER_MAX_LIGHTS = 128; public: