Skip to content

Commit a2ccf52

Browse files
Add Persistent Buffers and Push Constants
This work is a heavy refactor and rewrite of TheForge's initial code. TheForge's original code had too many race conditions and was fundamentally flawed, as it was too easy to run into those data races by accident. However, they identified the proper places that needed changes, and the idea was sound. I used their work as a blueprint to design this work. This PR implements: - A refactor of some shaders to avoid using multiple sets. - A push constant emulator. - Introduction of UMA buffers used by push constants and a few buffers (most notably the ones filled by _fill_instance_data). Ironically, this change seems to benefit PC more than it does Mobile. Updates D3D12 Memory Allocator to get GPU_UPLOAD heap support. Metal implementation by Stuart Carnie. Co-authored-by: Stuart Carnie <[email protected]> Co-authored-by: TheForge team
1 parent 2043023 commit a2ccf52

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+3678
-1811
lines changed

doc/classes/RenderingDevice.xml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2149,7 +2149,17 @@
21492149
<constant name="UNIFORM_TYPE_INPUT_ATTACHMENT" value="9" enum="UniformType">
21502150
Input attachment uniform.
21512151
</constant>
2152-
<constant name="UNIFORM_TYPE_MAX" value="10" enum="UniformType">
2152+
<constant name="UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC" value="10" enum="UniformType">
2153+
Same as UNIFORM_TYPE_UNIFORM_BUFFER but for buffers created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT.
2154+
[b]Note:[/b] This flag is not available to GD users due to being too dangerous (i.e. wrong usage can result in visual glitches).
2155+
It's exposed in case GD users receive a buffer created with such a flag from Godot.
2156+
</constant>
2157+
<constant name="UNIFORM_TYPE_STORAGE_BUFFER_DYNAMIC" value="11" enum="UniformType">
2158+
Same as UNIFORM_TYPE_STORAGE_BUFFER but for buffers created with BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT.
2159+
[b]Note:[/b] This flag is not available to GD users due to being too dangerous (i.e. wrong usage can result in visual glitches).
2160+
It's exposed in case GD users receive a buffer created with such a flag from Godot.
2161+
</constant>
2162+
<constant name="UNIFORM_TYPE_MAX" value="12" enum="UniformType">
21532163
Represents the size of the [enum UniformType] enum.
21542164
</constant>
21552165
<constant name="RENDER_PRIMITIVE_POINTS" value="0" enum="RenderPrimitive">

drivers/d3d12/rendering_device_driver_d3d12.cpp

Lines changed: 228 additions & 67 deletions
Large diffs are not rendered by default.

drivers/d3d12/rendering_device_driver_d3d12.h

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
144144
MiscFeaturesSupport misc_features_support;
145145
RenderingShaderContainerFormatD3D12 shader_container_format;
146146
String pipeline_cache_id;
147+
D3D12_HEAP_TYPE dynamic_persistent_upload_heap = D3D12_HEAP_TYPE_UPLOAD;
147148

148149
class CPUDescriptorsHeapPool;
149150

@@ -323,16 +324,29 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
323324
uint64_t size = 0;
324325
struct {
325326
bool usable_as_uav : 1;
327+
bool is_dynamic : 1; // Only used for tracking (e.g. Vulkan needs these checks).
326328
} flags = {};
329+
330+
bool is_dynamic() const { return flags.is_dynamic; }
331+
};
332+
333+
// Buffer record that adds per-frame mapping state on top of BufferInfo.
// NOTE(review): presumably only used for buffers whose BufferInfo::flags.is_dynamic is set — confirm in the .cpp.
struct BufferDynamicInfo : BufferInfo {
334+
// Defaults to UINT32_MAX. NOTE(review): presumably a "no frame selected yet" sentinel that
// buffer_persistent_map_advance() moves forward — TODO confirm.
uint32_t frame_idx = UINT32_MAX;
335+
// Null by default. NOTE(review): presumably the CPU address of the persistently mapped
// allocation — confirm where it is assigned.
uint8_t *persistent_ptr = nullptr;
336+
#ifdef DEBUG_ENABLED
337+
// For tracking that a persistent buffer isn't mapped twice in the same frame.
// Only present in DEBUG_ENABLED builds, so the struct layout differs between build types.
338+
uint64_t last_frame_mapped = 0;
339+
#endif
327340
};
328341

329342
public:
330-
virtual BufferID buffer_create(uint64_t p_size, BitField<BufferUsageBits> p_usage, MemoryAllocationType p_allocation_type) override final;
343+
virtual BufferID buffer_create(uint64_t p_size, BitField<BufferUsageBits> p_usage, MemoryAllocationType p_allocation_type, uint64_t p_frames_drawn) override final;
331344
virtual bool buffer_set_texel_format(BufferID p_buffer, DataFormat p_format) override final;
332345
virtual void buffer_free(BufferID p_buffer) override final;
333346
virtual uint64_t buffer_get_allocation_size(BufferID p_buffer) override final;
334347
virtual uint8_t *buffer_map(BufferID p_buffer) override final;
335348
virtual void buffer_unmap(BufferID p_buffer) override final;
349+
virtual uint8_t *buffer_persistent_map_advance(BufferID p_buffer, uint64_t p_frames_drawn) override final;
336350
virtual uint64_t buffer_get_device_address(BufferID p_buffer) override final;
337351

338352
/*****************/
@@ -705,6 +719,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
705719

706720
struct RecentBind {
707721
uint64_t segment_serial = 0;
722+
uint32_t dynamic_state_mask = 0;
708723
uint32_t root_signature_crc = 0;
709724
struct {
710725
TightLocalVector<RootDescriptorTable> resources;
@@ -713,6 +728,8 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
713728
int uses = 0;
714729
} recent_binds[4]; // A better amount may be empirically found.
715730

731+
TightLocalVector<BufferDynamicInfo const *, uint32_t> dynamic_buffers;
732+
716733
#ifdef DEV_ENABLED
717734
// Filthy, but useful for dev.
718735
struct ResourceDescInfo {
@@ -726,15 +743,15 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
726743
public:
727744
virtual UniformSetID uniform_set_create(VectorView<BoundUniform> p_uniforms, ShaderID p_shader, uint32_t p_set_index, int p_linear_pool_index) override final;
728745
virtual void uniform_set_free(UniformSetID p_uniform_set) override final;
746+
virtual uint32_t uniform_sets_get_dynamic_offsets(VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) const override final;
729747

730748
// ----- COMMANDS -----
731749

732750
virtual void command_uniform_set_prepare_for_use(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final;
733751

734752
private:
735753
void _command_check_descriptor_sets(CommandBufferID p_cmd_buffer);
736-
void _command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, bool p_for_compute);
737-
void _command_bind_uniform_sets(CommandBufferID p_cmd_buffer, VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, bool p_for_compute);
754+
void _command_bind_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index, uint32_t p_dynamic_offsets, bool p_for_compute);
738755

739756
public:
740757
/******************/
@@ -823,8 +840,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
823840

824841
// Binding.
825842
virtual void command_bind_render_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final;
826-
virtual void command_bind_render_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final;
827-
virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final;
843+
virtual void command_bind_render_uniform_sets(CommandBufferID p_cmd_buffer, VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final;
828844

829845
// Drawing.
830846
virtual void command_render_draw(CommandBufferID p_cmd_buffer, uint32_t p_vertex_count, uint32_t p_instance_count, uint32_t p_base_vertex, uint32_t p_first_instance) override final;
@@ -871,8 +887,7 @@ class RenderingDeviceDriverD3D12 : public RenderingDeviceDriver {
871887

872888
// Binding.
873889
virtual void command_bind_compute_pipeline(CommandBufferID p_cmd_buffer, PipelineID p_pipeline) override final;
874-
virtual void command_bind_compute_uniform_set(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) override final;
875-
virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) override final;
890+
virtual void command_bind_compute_uniform_sets(CommandBufferID p_cmd_buffer, VectorView<UniformSetID> p_uniform_sets, ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets) override final;
876891

877892
// Dispatching.
878893
virtual void command_compute_dispatch(CommandBufferID p_cmd_buffer, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) override final;

drivers/metal/metal_objects.h

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ class RenderingDeviceDriverMetal;
135135
class MDUniformSet;
136136
class MDShader;
137137

138+
struct MetalBufferDynamicInfo;
139+
138140
#pragma mark - Resource Factory
139141

140142
struct ClearAttKey {
@@ -385,11 +387,12 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer {
385387
BitField<DirtyFlag> dirty = DIRTY_NONE;
386388

387389
LocalVector<MDUniformSet *> uniform_sets;
390+
uint32_t dynamic_offsets = 0;
388391
// Bit mask of the uniform sets that are dirty, to prevent redundant binding.
389392
uint64_t uniform_set_mask = 0;
390393
uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE];
391394
uint32_t push_constant_data_len = 0;
392-
uint32_t push_constant_bindings[2] = { 0 };
395+
uint32_t push_constant_bindings[2] = { ~0U, ~0U };
393396

394397
_FORCE_INLINE_ void reset();
395398
void end_encoding();
@@ -505,11 +508,12 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer {
505508
BitField<DirtyFlag> dirty = DIRTY_NONE;
506509

507510
LocalVector<MDUniformSet *> uniform_sets;
511+
uint32_t dynamic_offsets = 0;
508512
// Bit mask of the uniform sets that are dirty, to prevent redundant binding.
509513
uint64_t uniform_set_mask = 0;
510514
uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE];
511515
uint32_t push_constant_data_len = 0;
512-
uint32_t push_constant_bindings[1] = { 0 };
516+
uint32_t push_constant_bindings[1] = { ~0U };
513517

514518
_FORCE_INLINE_ void reset();
515519
void end_encoding();
@@ -559,8 +563,7 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer {
559563

560564
#pragma mark - Render Commands
561565

562-
void render_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index);
563-
void render_bind_uniform_sets(VectorView<RDD::UniformSetID> p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count);
566+
void render_bind_uniform_sets(VectorView<RDD::UniformSetID> p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets);
564567
void render_clear_attachments(VectorView<RDD::AttachmentClear> p_attachment_clears, VectorView<Rect2i> p_rects);
565568
void render_set_viewport(VectorView<Rect2i> p_viewports);
566569
void render_set_scissor(VectorView<Rect2i> p_scissors);
@@ -593,8 +596,7 @@ class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer {
593596

594597
#pragma mark - Compute Commands
595598

596-
void compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index);
597-
void compute_bind_uniform_sets(VectorView<RDD::UniformSetID> p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count);
599+
void compute_bind_uniform_sets(VectorView<RDD::UniformSetID> p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count, uint32_t p_dynamic_offsets);
598600
void compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups);
599601
void compute_dispatch_indirect(RDD::BufferID p_indirect_buffer, uint64_t p_offset);
600602

@@ -647,6 +649,7 @@ struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) UniformInfo {
647649

648650
struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) UniformSet {
649651
LocalVector<UniformInfo> uniforms;
652+
LocalVector<uint32_t> dynamic_uniforms;
650653
uint32_t buffer_size = 0;
651654
HashMap<RDC::ShaderStage, uint32_t> offsets;
652655
HashMap<RDC::ShaderStage, id<MTLArgumentEncoder>> encoders;
@@ -715,10 +718,62 @@ struct ShaderCacheEntry {
715718
~ShaderCacheEntry() = default;
716719
};
717720

721+
/// Godot limits the number of dynamic buffers to 8.
722+
///
723+
/// This is a minimum guarantee for Vulkan.
724+
constexpr uint32_t MAX_DYNAMIC_BUFFERS = 8;
725+
726+
/// Maximum number of queued frames.
727+
///
728+
/// See setting: rendering/rendering_device/vsync/frame_queue_size
729+
constexpr uint32_t MAX_FRAME_COUNT = 4;
730+
731+
/// Table of (offset, count) pairs indexed by uniform set index, with each value stored in 4 bits.
///
/// The table shares storage with a single 64-bit value (`_val`) through an anonymous union,
/// which `is_empty()` uses to test every entry at once.
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) DynamicOffsetLayout {
732+
/// One table entry. Both members are 4-bit bit-fields, so each can hold values 0-15.
struct Data {
733+
uint8_t offset : 4;
734+
uint8_t count : 4;
735+
};
736+
737+
/// `data` and `_val` alias the same bytes; `_val = 0` zero-initializes the whole table.
/// NOTE(review): assumes sizeof(Data) == 1 so that MAX_DYNAMIC_BUFFERS (8) entries exactly
/// cover the 64-bit `_val` — TODO confirm (a static_assert would pin this down).
/// NOTE(review): entries are written through `data` but emptiness is read through `_val`;
/// reading a union member other than the one last written is formally undefined in ISO C++,
/// so this relies on compiler behavior — confirm for all supported toolchains.
union {
738+
Data data[MAX_DYNAMIC_BUFFERS];
739+
uint64_t _val = 0;
740+
};
741+
742+
public:
743+
/// Returns true while the combined 64-bit value is zero (the default-constructed state).
_FORCE_INLINE_ bool is_empty() const { return _val == 0; }
744+
745+
/// Returns the `count` field stored for `p_set_index`.
/// The index is not range-checked; it must be below MAX_DYNAMIC_BUFFERS.
_FORCE_INLINE_ uint32_t get_count(uint32_t p_set_index) const {
746+
return data[p_set_index].count;
747+
}
748+
749+
/// Returns the `offset` field stored for `p_set_index`.
/// The index is not range-checked; it must be below MAX_DYNAMIC_BUFFERS.
_FORCE_INLINE_ uint32_t get_offset(uint32_t p_set_index) const {
750+
return data[p_set_index].offset;
751+
}
752+
753+
/// Stores `p_offset` and `p_count` for `p_set_index`.
/// Both values are assigned to 4-bit unsigned bit-fields, so only their low 4 bits are kept.
/// The index is not range-checked; it must be below MAX_DYNAMIC_BUFFERS.
_FORCE_INLINE_ void set_offset_count(uint32_t p_set_index, uint8_t p_offset, uint8_t p_count) {
754+
data[p_set_index].offset = p_offset;
755+
data[p_set_index].count = p_count;
756+
}
757+
758+
/// Returns `(offset of p_set_index + p_dynamic_index) * 4`.
/// NOTE(review): presumably a bit shift into a packed 32-bit value that holds one 4-bit
/// entry per dynamic buffer — confirm against the callers in the implementation file.
_FORCE_INLINE_ uint32_t get_offset_index_shift(uint32_t p_set_index, uint32_t p_dynamic_index = 0) const {
759+
return (data[p_set_index].offset + p_dynamic_index) * 4u;
760+
}
761+
};
762+
763+
/// Thin wrapper around a single 32-bit value.
/// NOTE(review): presumably meant to give typed access to the packed `uint32_t p_dynamic_offsets`
/// values passed around elsewhere in this header — confirm.
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) DynamicOffsets {
764+
/// NOTE(review): has no default member initializer, and no constructor or setter is visible
/// in this view — confirm how instances obtain a defined value before get_frame_index() is called.
uint32_t data;
765+
766+
public:
767+
/// Returns the stored value unchanged. `p_layout` is accepted but not used by this body.
_FORCE_INLINE_ uint32_t get_frame_index(const DynamicOffsetLayout &p_layout) const {
768+
return data;
769+
}
770+
};
771+
718772
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDShader {
719773
public:
720774
CharString name;
721775
Vector<UniformSet> sets;
776+
DynamicOffsetLayout dynamic_offset_layout;
722777
bool uses_argument_buffers = true;
723778

724779
MDShader(CharString p_name, Vector<UniformSet> p_sets, bool p_uses_argument_buffers) :
@@ -786,30 +841,49 @@ struct HashMapComparatorDefault<RDD::ShaderID> {
786841
/// A Metal buffer together with the resource-usage map and per-frame size that belong to it.
/// NOTE(review): presumably one instance exists per (uniform set, shader) pair, as suggested by
/// `HashMap<MDShader *, BoundUniformSet>` in MDUniformSet — confirm.
struct BoundUniformSet {
786841
/// The Metal buffer that the offsets computed below index into.
id<MTLBuffer> buffer;
787842
/// Resources grouped by usage; merged into a destination map by `merge_into()`.
ResourceUsageMap usage_to_resources;
844+
/// Size of the per-frame buffer, which is 0 when there are no dynamic uniforms.
845+
uint32_t frame_size = 0;
788846

789847
/// Perform a 2-way merge each key of `ResourceVector` resources from this set into the
790848
/// destination set.
791849
///
792850
/// Assumes the vectors of resources are sorted.
793851
void merge_into(ResourceUsageMap &p_dst) const;
852+
853+
/// Returns true if this bound uniform set contains dynamic uniforms.
854+
_FORCE_INLINE_ bool is_dynamic() const { return frame_size > 0; }
855+
856+
/// Calculate the offset in the Metal buffer for the given frame index
/// (`p_frame_index * frame_size`); always 0 when the set is not dynamic.
857+
_FORCE_INLINE_ uint32_t frame_offset(uint32_t p_frame_index) const { return p_frame_index * frame_size; }
858+
859+
/// Calculate the offset in the buffer for the given frame index and base offset.
860+
_FORCE_INLINE_ uint32_t make_offset(uint32_t p_frame_index, uint32_t p_base_offset) const {
861+
return frame_offset(p_frame_index) + p_base_offset;
862+
}
863+
864+
/// Default state: `frame_size` is 0, so `is_dynamic()` returns false.
BoundUniformSet() = default;
865+
/// Takes the buffer handle, moves the usage map in, and records the per-frame size.
BoundUniformSet(id<MTLBuffer> p_buffer, ResourceUsageMap &&p_usage_to_resources, uint32_t p_frame_size) :
866+
buffer(p_buffer), usage_to_resources(std::move(p_usage_to_resources)), frame_size(p_frame_size) {}
795867
};
796868

797869
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDUniformSet {
798870
private:
799-
void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index);
800-
void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index);
801-
void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index);
802-
void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index);
871+
void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count);
872+
void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets);
873+
void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count);
874+
void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets);
875+
876+
void update_dynamic_uniforms(MDShader *p_shader, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, BoundUniformSet &p_bound_set, uint32_t p_dynamic_offsets, uint32_t p_frame_idx);
803877

804878
public:
805-
uint32_t index;
879+
uint32_t index = 0;
806880
LocalVector<RDD::BoundUniform> uniforms;
807881
HashMap<MDShader *, BoundUniformSet> bound_uniforms;
808882

809-
void bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index);
810-
void bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index);
883+
void bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count);
884+
void bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count);
811885

812-
BoundUniformSet &bound_uniform_set(MDShader *p_shader, id<MTLDevice> p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index);
886+
BoundUniformSet &bound_uniform_set(MDShader *p_shader, id<MTLDevice> p_device, ResourceUsageMap &p_resource_usage, uint32_t p_set_index, uint32_t p_dynamic_offsets, uint32_t p_frame_idx, uint32_t p_frame_count);
813887
};
814888

815889
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDPipeline {

0 commit comments

Comments
 (0)