diff --git a/crates/bevy_core_pipeline/src/core_2d/mod.rs b/crates/bevy_core_pipeline/src/core_2d/mod.rs
index 48d2bb5a4f609..85f986c111dd4 100644
--- a/crates/bevy_core_pipeline/src/core_2d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs
@@ -38,12 +38,11 @@ use bevy_render::{
     render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner},
     render_phase::{
         sort_phase_system, CachedRenderPipelinePhaseItem, DrawFunctionId, DrawFunctions, PhaseItem,
-        SortedPhaseItem, SortedRenderPhase,
+        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
     },
     render_resource::CachedRenderPipelineId,
     Extract, ExtractSchedule, Render, RenderApp, RenderSet,
 };
-use nonmax::NonMaxU32;
 
 use crate::{tonemapping::TonemappingNode, upscaling::UpscalingNode};
 
@@ -91,7 +90,7 @@ pub struct Transparent2d {
     pub pipeline: CachedRenderPipelineId,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transparent2d {
@@ -116,13 +115,13 @@ impl PhaseItem for Transparent2d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs
index 94bc10b4f3501..e5cd6c049173f 100644
--- a/crates/bevy_core_pipeline/src/core_3d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs
@@ -58,7 +58,8 @@ use bevy_render::{
     render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner},
     render_phase::{
         sort_phase_system, BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem,
-        DrawFunctionId, DrawFunctions, PhaseItem, SortedPhaseItem, SortedRenderPhase,
+        DrawFunctionId, DrawFunctions, PhaseItem, PhaseItemExtraIndex, SortedPhaseItem,
+        SortedRenderPhase,
     },
     render_resource::{
         BindGroupId, CachedRenderPipelineId, Extent3d, FilterMode, Sampler, SamplerDescriptor,
@@ -70,7 +71,6 @@ use bevy_render::{
     Extract, ExtractSchedule, Render, RenderApp, RenderSet,
 };
 use bevy_utils::{tracing::warn, HashMap};
-use nonmax::NonMaxU32;
 
 use crate::{
     core_3d::main_transmissive_pass_3d_node::MainTransmissivePass3dNode,
@@ -183,8 +183,9 @@ pub struct Opaque3d {
     pub representative_entity: Entity,
     /// The ranges of instances.
     pub batch_range: Range<u32>,
-    /// The dynamic offset.
-    pub dynamic_offset: Option<NonMaxU32>,
+    /// An extra index, which is either a dynamic offset or an index in the
+    /// indirect parameters list.
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 /// Data that must be identical in order to batch meshes together.
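With this change, a phase item stores a single `PhaseItemExtraIndex` instead of `Option<NonMaxU32>`, and the pair of `dynamic_offset` accessors is replaced by one getter plus a combined mutable accessor, so that batching code can borrow the batch range and the extra index at the same time. A minimal sketch of what the migration looks like for a hypothetical custom sorted item (`MyPhaseItem` is illustrative, not part of this PR; the `SortedPhaseItem` and draw-function plumbing are omitted):

```rust
use std::ops::Range;

use bevy_ecs::entity::Entity;
use bevy_render::render_phase::{DrawFunctionId, PhaseItem, PhaseItemExtraIndex};

pub struct MyPhaseItem {
    pub entity: Entity,
    pub draw_function: DrawFunctionId,
    pub batch_range: Range<u32>,
    // Previously: `pub dynamic_offset: Option<NonMaxU32>`.
    pub extra_index: PhaseItemExtraIndex,
}

impl PhaseItem for MyPhaseItem {
    fn entity(&self) -> Entity {
        self.entity
    }
    fn draw_function(&self) -> DrawFunctionId {
        self.draw_function
    }
    fn batch_range(&self) -> &Range<u32> {
        &self.batch_range
    }
    fn batch_range_mut(&mut self) -> &mut Range<u32> {
        &mut self.batch_range
    }
    // `extra_index` replaces `dynamic_offset`, and the combined accessor
    // replaces `dynamic_offset_mut`.
    fn extra_index(&self) -> PhaseItemExtraIndex {
        self.extra_index
    }
    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
        (&mut self.batch_range, &mut self.extra_index)
    }
}
```

When queueing, `extra_index: PhaseItemExtraIndex::NONE` takes the place of the old `dynamic_offset: None`, as the gizmo and material changes further down show.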
@@ -229,14 +230,12 @@ impl PhaseItem for Opaque3d {
         &mut self.batch_range
     }
 
-    #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
-    #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -248,13 +247,13 @@ impl BinnedPhaseItem for Opaque3d {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Opaque3d {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -270,7 +269,7 @@ pub struct AlphaMask3d {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3d {
@@ -295,13 +294,13 @@ impl PhaseItem for AlphaMask3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -313,13 +312,13 @@ impl BinnedPhaseItem for AlphaMask3d {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -337,7 +336,7 @@ pub struct Transmissive3d {
     pub entity: Entity,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transmissive3d {
@@ -373,13 +372,13 @@ impl PhaseItem for Transmissive3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -411,7 +410,7 @@ pub struct Transparent3d {
     pub entity: Entity,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transparent3d {
@@ -436,13 +435,13 @@ impl PhaseItem for Transparent3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
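Binned phase items (`Opaque3d`, `AlphaMask3d`, and the deferred, prepass, and shadow items below) are constructed lazily by the batcher from their bin keys, which is why `BinnedPhaseItem::new` takes the batch range and extra index as parameters; this PR threads `PhaseItemExtraIndex` through that constructor. A sketch of the updated contract for a hypothetical item (`MyBinnedItem` and `MyBinKey` are illustrative; the accompanying `PhaseItem` impl, like the one sketched earlier, is assumed):

```rust
use std::ops::Range;

use bevy_ecs::entity::Entity;
use bevy_render::render_phase::{BinnedPhaseItem, PhaseItemExtraIndex};

#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
struct MyBinKey;

struct MyBinnedItem {
    key: MyBinKey,
    representative_entity: Entity,
    batch_range: Range<u32>,
    extra_index: PhaseItemExtraIndex,
}

impl BinnedPhaseItem for MyBinnedItem {
    type BinKey = MyBinKey;

    // The batcher calls this when it turns a bin into drawable items, passing
    // along whichever extra index (dynamic offset or indirect-parameters
    // index) it computed for the batch.
    fn new(
        key: Self::BinKey,
        representative_entity: Entity,
        batch_range: Range<u32>,
        extra_index: PhaseItemExtraIndex,
    ) -> Self {
        Self {
            key,
            representative_entity,
            batch_range,
            extra_index,
        }
    }
}
```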
diff --git a/crates/bevy_core_pipeline/src/deferred/mod.rs b/crates/bevy_core_pipeline/src/deferred/mod.rs
index 3ccd8caad0e12..1372731820224 100644
--- a/crates/bevy_core_pipeline/src/deferred/mod.rs
+++ b/crates/bevy_core_pipeline/src/deferred/mod.rs
@@ -5,10 +5,12 @@ use std::ops::Range;
 
 use bevy_ecs::prelude::*;
 use bevy_render::{
-    render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem},
+    render_phase::{
+        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem,
+        PhaseItemExtraIndex,
+    },
     render_resource::{CachedRenderPipelineId, TextureFormat},
 };
-use nonmax::NonMaxU32;
 
 use crate::prepass::OpaqueNoLightmap3dBinKey;
 
@@ -26,7 +28,7 @@ pub struct Opaque3dDeferred {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Opaque3dDeferred {
@@ -51,13 +53,13 @@ impl PhaseItem for Opaque3dDeferred {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -69,13 +71,13 @@ impl BinnedPhaseItem for Opaque3dDeferred {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -96,7 +98,7 @@ pub struct AlphaMask3dDeferred {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3dDeferred {
@@ -121,13 +123,13 @@ impl PhaseItem for AlphaMask3dDeferred {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -138,13 +140,13 @@ impl BinnedPhaseItem for AlphaMask3dDeferred {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs
index 670eca9e8e2d2..88a6ac3970539 100644
--- a/crates/bevy_core_pipeline/src/prepass/mod.rs
+++ b/crates/bevy_core_pipeline/src/prepass/mod.rs
@@ -34,11 +34,13 @@ use bevy_ecs::prelude::*;
 use bevy_reflect::Reflect;
 use bevy_render::{
     mesh::Mesh,
-    render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem},
+    render_phase::{
+        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem,
+        PhaseItemExtraIndex,
+    },
     render_resource::{BindGroupId, CachedRenderPipelineId, Extent3d, TextureFormat, TextureView},
     texture::ColorAttachment,
 };
-use nonmax::NonMaxU32;
 
 pub const NORMAL_PREPASS_FORMAT: TextureFormat = TextureFormat::Rgb10a2Unorm;
 pub const MOTION_VECTOR_PREPASS_FORMAT: TextureFormat = TextureFormat::Rg16Float;
@@ -119,7 +121,7 @@ pub struct Opaque3dPrepass {
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
 
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 // TODO: Try interning these.
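The surrounding hunks use three members of `PhaseItemExtraIndex` (`NONE`, `as_dynamic_offset`, and `as_indirect_parameters_index`), but the type's own definition sits outside this diff. A plausible minimal encoding that is consistent with those call sites, shown purely for orientation (the real definition lives in `bevy_render::render_phase` and may differ in representation):

```rust
use nonmax::NonMaxU32;

/// Illustrative only: one `u32` that can mean "nothing", "a dynamic uniform
/// offset", or "an index into the indirect parameters buffer".
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct PhaseItemExtraIndex(pub u32);

impl PhaseItemExtraIndex {
    /// Assumed flag bit: when set, the low bits are an indirect-parameters index.
    const INDIRECT_FLAG: u32 = 1 << 31;
    /// No extra index at all.
    pub const NONE: Self = Self(u32::MAX);

    pub fn as_dynamic_offset(self) -> Option<NonMaxU32> {
        if self == Self::NONE || self.0 & Self::INDIRECT_FLAG != 0 {
            None
        } else {
            NonMaxU32::new(self.0)
        }
    }

    pub fn as_indirect_parameters_index(self) -> Option<u32> {
        if self != Self::NONE && self.0 & Self::INDIRECT_FLAG != 0 {
            Some(self.0 & !Self::INDIRECT_FLAG)
        } else {
            None
        }
    }
}
```

The key property the rest of the diff relies on is that the two meanings are mutually exclusive, so `SetMeshBindGroup` and `DrawMesh` can each ask only for the interpretation they care about.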
@@ -163,13 +165,13 @@ impl PhaseItem for Opaque3dPrepass {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -181,13 +183,13 @@ impl BinnedPhaseItem for Opaque3dPrepass {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Opaque3dPrepass {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -208,7 +210,7 @@ pub struct AlphaMask3dPrepass {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3dPrepass {
@@ -233,13 +235,13 @@ impl PhaseItem for AlphaMask3dPrepass {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -251,13 +253,13 @@ impl BinnedPhaseItem for AlphaMask3dPrepass {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_gizmos/src/pipeline_2d.rs b/crates/bevy_gizmos/src/pipeline_2d.rs
index 4fbf9544e22b5..660cec02c92d3 100644
--- a/crates/bevy_gizmos/src/pipeline_2d.rs
+++ b/crates/bevy_gizmos/src/pipeline_2d.rs
@@ -18,7 +18,9 @@ use bevy_ecs::{
 use bevy_math::FloatOrd;
 use bevy_render::{
     render_asset::{prepare_assets, RenderAssets},
-    render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase},
+    render_phase::{
+        AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase,
+    },
     render_resource::*,
     texture::BevyDefault,
     view::{ExtractedView, Msaa, RenderLayers, ViewTarget},
@@ -293,7 +295,7 @@ fn queue_line_gizmos_2d(
                 pipeline,
                 sort_key: FloatOrd(f32::INFINITY),
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
@@ -351,7 +353,7 @@ fn queue_line_joint_gizmos_2d(
                 pipeline,
                 sort_key: FloatOrd(f32::INFINITY),
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_gizmos/src/pipeline_3d.rs b/crates/bevy_gizmos/src/pipeline_3d.rs
index ec9800330d1b7..e247220d541bc 100644
--- a/crates/bevy_gizmos/src/pipeline_3d.rs
+++ b/crates/bevy_gizmos/src/pipeline_3d.rs
@@ -22,7 +22,9 @@ use bevy_ecs::{
 use bevy_pbr::{MeshPipeline, MeshPipelineKey, SetMeshViewBindGroup};
 use bevy_render::{
     render_asset::{prepare_assets, RenderAssets},
-    render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase},
+    render_phase::{
+        AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase,
+    },
     render_resource::*,
     texture::BevyDefault,
     view::{ExtractedView, Msaa, RenderLayers, ViewTarget},
@@ -348,7 +350,7 @@ fn queue_line_gizmos_3d(
                 pipeline,
                 distance: 0.,
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
@@ -437,7 +439,7 @@ fn queue_line_joint_gizmos_3d(
                 pipeline,
                 distance: 0.,
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs
index 395cb1c6e1746..1309934a1e3b0 100644
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@@ -710,7 +710,7 @@ pub fn queue_material_meshes<M: Material>(
                         pipeline: pipeline_id,
                         distance,
                         batch_range: 0..1,
-                        dynamic_offset: None,
+                        extra_index: PhaseItemExtraIndex::NONE,
                     });
                 } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                     let bin_key = Opaque3dBinKey {
@@ -734,7 +734,7 @@ pub fn queue_material_meshes<M: Material>(
                         pipeline: pipeline_id,
                         distance,
                         batch_range: 0..1,
-                        dynamic_offset: None,
+                        extra_index: PhaseItemExtraIndex::NONE,
                     });
                 } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                     let bin_key = OpaqueNoLightmap3dBinKey {
@@ -759,7 +759,7 @@ pub fn queue_material_meshes<M: Material>(
                         pipeline: pipeline_id,
                         distance,
                         batch_range: 0..1,
-                        dynamic_offset: None,
+                        extra_index: PhaseItemExtraIndex::NONE,
                     });
                 }
             }
diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs
index 21eff19668a65..8e32e96678ad6 100644
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@@ -14,27 +14,35 @@ use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d};
 use bevy_ecs::{
     component::Component,
     entity::Entity,
-    query::QueryState,
+    query::{Has, QueryState},
     schedule::{common_conditions::resource_exists, IntoSystemConfigs as _},
     system::{lifetimeless::Read, Commands, Res, ResMut, Resource},
     world::{FromWorld, World},
 };
 use bevy_render::{
-    batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem},
+    batching::gpu_preprocessing::{
+        BatchedInstanceBuffers, GpuPreprocessingSupport, IndirectParameters,
+        IndirectParametersBuffer, PreprocessWorkItem,
+    },
     render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext},
     render_resource::{
-        binding_types::{storage_buffer, storage_buffer_read_only},
+        binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
         BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding,
         CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor,
         DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType,
         SpecializedComputePipeline, SpecializedComputePipelines,
     },
-    renderer::{RenderContext, RenderDevice},
+    renderer::{RenderContext, RenderDevice, RenderQueue},
+    view::{GpuCulling, ViewUniform, ViewUniformOffset, ViewUniforms},
     Render, RenderApp, RenderSet,
 };
 use bevy_utils::tracing::warn;
+use bitflags::bitflags;
+use smallvec::{smallvec, SmallVec};
 
-use crate::{graph::NodePbr, MeshInputUniform, MeshUniform};
+use crate::{
+    graph::NodePbr, MeshCullingData, MeshCullingDataBuffer, MeshInputUniform, MeshUniform,
+};
 
 /// The handle to the `mesh_preprocess.wgsl` compute shader.
 pub const MESH_PREPROCESS_SHADER_HANDLE: Handle<Shader> =
@@ -57,20 +65,46 @@ pub struct GpuMeshPreprocessPlugin {
 
 /// The render node for the mesh uniform building pass.
 pub struct GpuPreprocessNode {
-    view_query: QueryState<(Entity, Read<PreprocessBindGroup>)>,
+    view_query: QueryState<(
+        Entity,
+        Read<PreprocessBindGroup>,
+        Read<ViewUniformOffset>,
+        Has<GpuCulling>,
+    )>,
 }
 
-/// The compute shader pipeline for the mesh uniform building pass.
+/// The compute shader pipelines for the mesh uniform building pass.
 #[derive(Resource)]
+pub struct PreprocessPipelines {
+    /// The pipeline used for CPU culling. This pipeline doesn't populate
+    /// indirect parameters.
+    pub direct: PreprocessPipeline,
+    /// The pipeline used for GPU culling. This pipeline populates indirect
+    /// parameters.
+    pub gpu_culling: PreprocessPipeline,
+}
+
+/// The pipeline for the GPU mesh preprocessing shader.
 pub struct PreprocessPipeline {
-    /// The single bind group layout for the compute shader.
+    /// The bind group layout for the compute shader.
     pub bind_group_layout: BindGroupLayout,
     /// The pipeline ID for the compute shader.
    ///
-    /// This gets filled in in `prepare_preprocess_pipeline`.
+    /// This gets filled in in `prepare_preprocess_pipelines`.
     pub pipeline_id: Option<CachedComputePipelineId>,
 }
 
+bitflags! {
+    /// Specifies variants of the mesh preprocessing shader.
+    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
+    pub struct PreprocessPipelineKey: u8 {
+        /// Whether GPU culling is in use.
+        ///
+        /// This `#define`s `INDIRECT` and `FRUSTUM_CULLING` in the shader.
+        const GPU_CULLING = 1;
+    }
+}
+
 /// The compute shader bind group for the mesh uniform building pass.
 ///
 /// This goes on the view.
@@ -94,9 +128,9 @@ impl Plugin for GpuMeshPreprocessPlugin {
 
         // This plugin does nothing if GPU instance buffer building isn't in
         // use.
-        let render_device = render_app.world().resource::<RenderDevice>();
+        let gpu_preprocessing_support = render_app.world().resource::<GpuPreprocessingSupport>();
         if !self.use_gpu_instance_buffer_builder
-            || !gpu_preprocessing::can_preprocess_on_gpu(render_device)
+            || *gpu_preprocessing_support == GpuPreprocessingSupport::None
         {
             return;
         }
@@ -106,17 +140,18 @@ impl Plugin for GpuMeshPreprocessPlugin {
             .add_render_graph_node::<GpuPreprocessNode>(Core3d, NodePbr::GpuPreprocess)
             .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass))
             .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass))
-            .init_resource::<PreprocessPipeline>()
+            .init_resource::<PreprocessPipelines>()
             .init_resource::<SpecializedComputePipelines<PreprocessPipeline>>()
             .add_systems(
                 Render,
                 (
-                    prepare_preprocess_pipeline.in_set(RenderSet::Prepare),
+                    prepare_preprocess_pipelines.in_set(RenderSet::Prepare),
                     prepare_preprocess_bind_groups
                         .run_if(
                             resource_exists::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
                        )
                         .in_set(RenderSet::PrepareBindGroups),
+                    write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
                 ),
             );
     }
@@ -148,18 +183,7 @@ impl Node for GpuPreprocessNode {
         } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
 
         let pipeline_cache = world.resource::<PipelineCache>();
-        let preprocess_pipeline = world.resource::<PreprocessPipeline>();
-
-        let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else {
-            warn!("The build mesh uniforms pipeline wasn't created");
-            return Ok(());
-        };
-
-        let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id)
-        else {
-            // This will happen while the pipeline is being compiled and is fine.
-            return Ok(());
-        };
+        let preprocess_pipelines = world.resource::<PreprocessPipelines>();
 
         let mut compute_pass = render_context
@@ -169,13 +193,46 @@ impl Node for GpuPreprocessNode {
                 timestamp_writes: None,
             });
 
-        compute_pass.set_pipeline(preprocess_pipeline);
-
         // Run the compute passes.
-        for (view, bind_group) in self.view_query.iter_manual(world) {
-            let index_buffer = &index_buffers[&view];
-            compute_pass.set_bind_group(0, &bind_group.0, &[]);
-            let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE);
+        for (view, bind_group, view_uniform_offset, gpu_culling) in
+            self.view_query.iter_manual(world)
+        {
+            // Grab the index buffer for this view.
+            let Some(index_buffer) = index_buffers.get(&view) else {
+                warn!("The preprocessing index buffer wasn't present");
+                return Ok(());
+            };
+
+            // Select the right pipeline, depending on whether GPU culling is in
+            // use.
+            let maybe_pipeline_id = if gpu_culling {
+                preprocess_pipelines.gpu_culling.pipeline_id
+            } else {
+                preprocess_pipelines.direct.pipeline_id
+            };
+
+            // Fetch the pipeline.
+            let Some(preprocess_pipeline_id) = maybe_pipeline_id else {
+                warn!("The build mesh uniforms pipeline wasn't ready");
+                return Ok(());
+            };
+
+            let Some(preprocess_pipeline) =
+                pipeline_cache.get_compute_pipeline(preprocess_pipeline_id)
+            else {
+                // This will happen while the pipeline is being compiled and is fine.
+                return Ok(());
+            };
+
+            compute_pass.set_pipeline(preprocess_pipeline);
+
+            let mut dynamic_offsets: SmallVec<[u32; 1]> = smallvec![];
+            if gpu_culling {
+                dynamic_offsets.push(view_uniform_offset.offset);
+            }
+            compute_pass.set_bind_group(0, &bind_group.0, &dynamic_offsets);
+
+            let workgroup_count = index_buffer.buffer.len().div_ceil(WORKGROUP_SIZE);
             compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
         }
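The dispatch above rounds the work-item count up to whole workgroups, since `@workgroup_size(64)` threads run per group whether or not each one has an item; the shader's `arrayLength` bounds check (visible in `mesh_preprocess.wgsl` further down) discards the excess threads. A worked example of the arithmetic:

```rust
const WORKGROUP_SIZE: usize = 64;

/// One compute thread per `PreprocessWorkItem`, rounded up.
fn workgroup_count(num_work_items: usize) -> u32 {
    num_work_items.div_ceil(WORKGROUP_SIZE) as u32
}

#[test]
fn rounds_up_to_whole_workgroups() {
    assert_eq!(workgroup_count(1), 1); // a partial group still dispatches
    assert_eq!(workgroup_count(64), 1);
    assert_eq!(workgroup_count(65), 2); // 64 threads, plus one more group for the rest
}
```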
@@ -183,72 +240,149 @@ impl Node for GpuPreprocessNode {
     }
 }
 
+impl PreprocessPipelines {
+    pub(crate) fn pipelines_are_loaded(&self, pipeline_cache: &PipelineCache) -> bool {
+        self.direct.is_loaded(pipeline_cache) && self.gpu_culling.is_loaded(pipeline_cache)
+    }
+}
+
+impl PreprocessPipeline {
+    fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool {
+        self.pipeline_id
+            .is_some_and(|pipeline_id| pipeline_cache.get_compute_pipeline(pipeline_id).is_some())
+    }
+}
+
 impl SpecializedComputePipeline for PreprocessPipeline {
-    type Key = ();
+    type Key = PreprocessPipelineKey;
+
+    fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor {
+        let mut shader_defs = vec![];
+        if key.contains(PreprocessPipelineKey::GPU_CULLING) {
+            shader_defs.push("INDIRECT".into());
+            shader_defs.push("FRUSTUM_CULLING".into());
+        }
 
-    fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor {
         ComputePipelineDescriptor {
-            label: Some("mesh preprocessing".into()),
+            label: Some(
+                format!(
+                    "mesh preprocessing ({})",
+                    if key.contains(PreprocessPipelineKey::GPU_CULLING) {
+                        "GPU culling"
+                    } else {
+                        "direct"
+                    }
+                )
+                .into(),
+            ),
             layout: vec![self.bind_group_layout.clone()],
             push_constant_ranges: vec![],
             shader: MESH_PREPROCESS_SHADER_HANDLE,
-            shader_defs: vec![],
+            shader_defs,
             entry_point: "main".into(),
         }
     }
 }
 
-impl FromWorld for PreprocessPipeline {
+impl FromWorld for PreprocessPipelines {
     fn from_world(world: &mut World) -> Self {
         let render_device = world.resource::<RenderDevice>();
 
-        let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential(
-            ShaderStages::COMPUTE,
-            (
-                // `current_input`
-                storage_buffer_read_only::<MeshInputUniform>(false),
-                // `previous_input`
-                storage_buffer_read_only::<MeshInputUniform>(false),
-                // `indices`
-                storage_buffer_read_only::<PreprocessWorkItem>(false),
-                // `output`
-                storage_buffer::<MeshUniform>(false),
-            ),
+        // GPU culling bind group parameters are a superset of those in the CPU
+        // culling (direct) shader.
+        let direct_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries();
+        let gpu_culling_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries()
+            .extend_sequential((
+                // `indirect_parameters`
+                storage_buffer::<IndirectParameters>(/*has_dynamic_offset=*/ false),
+                // `mesh_culling_data`
+                storage_buffer_read_only::<MeshCullingData>(/*has_dynamic_offset=*/ false),
+                // `view`
+                uniform_buffer::<ViewUniform>(/*has_dynamic_offset=*/ true),
+            ));
+
+        let direct_bind_group_layout = render_device.create_bind_group_layout(
+            "build mesh uniforms direct bind group layout",
+            &direct_bind_group_layout_entries,
         );
-
-        let bind_group_layout = render_device.create_bind_group_layout(
-            "build mesh uniforms bind group layout",
-            &bind_group_layout_entries,
+        let gpu_culling_bind_group_layout = render_device.create_bind_group_layout(
+            "build mesh uniforms GPU culling bind group layout",
+            &gpu_culling_bind_group_layout_entries,
         );
 
-        PreprocessPipeline {
-            bind_group_layout,
-            pipeline_id: None,
+        PreprocessPipelines {
+            direct: PreprocessPipeline {
+                bind_group_layout: direct_bind_group_layout,
+                pipeline_id: None,
+            },
+            gpu_culling: PreprocessPipeline {
+                bind_group_layout: gpu_culling_bind_group_layout,
+                pipeline_id: None,
+            },
         }
     }
 }
 
-/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary.
-pub fn prepare_preprocess_pipeline(
+fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries {
+    DynamicBindGroupLayoutEntries::sequential(
+        ShaderStages::COMPUTE,
+        (
+            // `current_input`
+            storage_buffer_read_only::<MeshInputUniform>(false),
+            // `previous_input`
+            storage_buffer_read_only::<MeshInputUniform>(false),
+            // `indices`
+            storage_buffer_read_only::<PreprocessWorkItem>(false),
+            // `output`
+            storage_buffer::<MeshUniform>(false),
+        ),
+    )
+}
+
+/// A system that specializes the `mesh_preprocess.wgsl` pipelines if necessary.
+pub fn prepare_preprocess_pipelines(
     pipeline_cache: Res<PipelineCache>,
     mut pipelines: ResMut<SpecializedComputePipelines<PreprocessPipeline>>,
-    mut preprocess_pipeline: ResMut<PreprocessPipeline>,
+    mut preprocess_pipelines: ResMut<PreprocessPipelines>,
 ) {
-    if preprocess_pipeline.pipeline_id.is_some() {
-        return;
-    }
+    preprocess_pipelines.direct.prepare(
+        &pipeline_cache,
+        &mut pipelines,
+        PreprocessPipelineKey::empty(),
+    );
+    preprocess_pipelines.gpu_culling.prepare(
+        &pipeline_cache,
+        &mut pipelines,
+        PreprocessPipelineKey::GPU_CULLING,
+    );
+}
 
-    let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ());
-    preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id);
+impl PreprocessPipeline {
+    fn prepare(
+        &mut self,
+        pipeline_cache: &PipelineCache,
+        pipelines: &mut SpecializedComputePipelines<PreprocessPipeline>,
+        key: PreprocessPipelineKey,
+    ) {
+        if self.pipeline_id.is_some() {
+            return;
+        }
+
+        let preprocess_pipeline_id = pipelines.specialize(pipeline_cache, self, key);
+        self.pipeline_id = Some(preprocess_pipeline_id);
+    }
 }
 
-/// A system that attaches the mesh uniform buffers to the bind group for the
-/// compute shader.
+/// A system that attaches the mesh uniform buffers to the bind groups for the
+/// variants of the mesh preprocessing compute shader.
 pub fn prepare_preprocess_bind_groups(
     mut commands: Commands,
     render_device: Res<RenderDevice>,
     batched_instance_buffers: Res<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
-    pipeline: Res<PreprocessPipeline>,
+    indirect_parameters_buffer: Res<IndirectParametersBuffer>,
+    mesh_culling_data_buffer: Res<MeshCullingDataBuffer>,
+    view_uniforms: Res<ViewUniforms>,
+    pipelines: Res<PreprocessPipelines>,
 ) {
     // Grab the `BatchedInstanceBuffers`.
     let BatchedInstanceBuffers {
@@ -267,7 +401,7 @@ pub fn prepare_preprocess_bind_groups(
     };
 
     for (view, index_buffer_vec) in index_buffers {
-        let Some(index_buffer) = index_buffer_vec.buffer() else {
+        let Some(index_buffer) = index_buffer_vec.buffer.buffer() else {
             continue;
         };
 
@@ -275,15 +409,27 @@ pub fn prepare_preprocess_bind_groups(
         // length and the underlying buffer may be longer than the actual size
         // of the vector.
         let index_buffer_size = NonZeroU64::try_from(
-            index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()),
+            index_buffer_vec.buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()),
         )
         .ok();
 
-        commands
-            .entity(*view)
-            .insert(PreprocessBindGroup(render_device.create_bind_group(
-                "preprocess_bind_group",
-                &pipeline.bind_group_layout,
+        let bind_group = if index_buffer_vec.gpu_culling {
+            let (
+                Some(indirect_parameters_buffer),
+                Some(mesh_culling_data_buffer),
+                Some(view_uniforms_binding),
+            ) = (
+                indirect_parameters_buffer.buffer(),
+                mesh_culling_data_buffer.buffer(),
+                view_uniforms.uniforms.binding(),
+            )
+            else {
+                continue;
+            };
+
+            PreprocessBindGroup(render_device.create_bind_group(
+                "preprocess_gpu_culling_bind_group",
+                &pipelines.gpu_culling.bind_group_layout,
                 &BindGroupEntries::sequential((
                     current_input_buffer.as_entire_binding(),
                     previous_input_buffer.as_entire_binding(),
@@ -293,7 +439,38 @@ pub fn prepare_preprocess_bind_groups(
                         size: index_buffer_size,
                     }),
                     data_buffer.as_entire_binding(),
+                    indirect_parameters_buffer.as_entire_binding(),
+                    mesh_culling_data_buffer.as_entire_binding(),
+                    view_uniforms_binding,
                 )),
-            )));
+            ))
+        } else {
+            PreprocessBindGroup(render_device.create_bind_group(
+                "preprocess_direct_bind_group",
+                &pipelines.direct.bind_group_layout,
+                &BindGroupEntries::sequential((
+                    current_input_buffer.as_entire_binding(),
+                    previous_input_buffer.as_entire_binding(),
+                    BindingResource::Buffer(BufferBinding {
+                        buffer: index_buffer,
+                        offset: 0,
+                        size: index_buffer_size,
+                    }),
+                    data_buffer.as_entire_binding(),
+                )),
+            ))
+        };
+
+        commands.entity(*view).insert(bind_group);
     }
 }
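The bind-group code above binds the work-item buffer with an explicit `size` rather than `as_entire_binding`, because a `BufferVec`'s underlying GPU allocation can be larger than its logical length. A sketch of that idiom in isolation, using plain `wgpu` types (the function name is illustrative):

```rust
use std::num::NonZeroU64;

use wgpu::{BindingResource, Buffer, BufferBinding};

/// Bind only the live contents of a growable buffer. Passing `size: None`
/// would instead bind the whole allocation, which may extend past the last
/// valid element.
fn bind_logical_len(buffer: &Buffer, len: u64, element_size: u64) -> BindingResource<'_> {
    BindingResource::Buffer(BufferBinding {
        buffer,
        offset: 0,
        size: NonZeroU64::new(len * element_size),
    })
}
```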
+
+/// Writes the information needed to do GPU mesh culling to the GPU.
+pub fn write_mesh_culling_data_buffer(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
+) {
+    mesh_culling_data_buffer.write_buffer(&render_device, &render_queue);
+    mesh_culling_data_buffer.clear();
+}
diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs
index e09b162197ef7..74e340c01a33e 100644
--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@@ -22,7 +22,6 @@ use bevy_transform::{components::GlobalTransform, prelude::Transform};
 #[cfg(feature = "trace")]
 use bevy_utils::tracing::info_span;
 use bevy_utils::tracing::{error, warn};
-use nonmax::NonMaxU32;
 use std::{hash::Hash, num::NonZeroU64, ops::Range};
 
 use crate::*;
@@ -1734,7 +1733,7 @@ pub struct Shadow {
     pub key: ShadowBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
@@ -1771,13 +1770,13 @@ impl PhaseItem for Shadow {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -1789,13 +1788,13 @@ impl BinnedPhaseItem for Shadow {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Shadow {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 5d326e05909cc..fa3cbdee014f2 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -12,13 +12,17 @@ use bevy_ecs::{
     query::ROQueryItem,
     system::{lifetimeless::*, SystemParamItem, SystemState},
 };
-use bevy_math::{vec3, Affine3, Rect, UVec2, Vec3, Vec4};
+use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4};
 use bevy_render::{
     batching::{
-        gpu_preprocessing, no_gpu_preprocessing, GetBatchData, GetFullBatchData,
-        NoAutomaticBatching,
+        gpu_preprocessing::{
+            self, GpuPreprocessingSupport, IndirectParameters, IndirectParametersBuffer,
+        },
+        no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching,
     },
+    camera::Camera,
     mesh::*,
+    primitives::Aabb,
     render_asset::RenderAssets,
     render_phase::{
         BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult,
@@ -27,11 +31,11 @@ use bevy_render::{
     render_resource::*,
     renderer::{RenderDevice, RenderQueue},
     texture::{BevyDefault, DefaultImageSampler, ImageSampler, TextureFormatPixelInfo},
-    view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility},
+    view::{prepare_view_targets, GpuCulling, ViewTarget, ViewUniformOffset, ViewVisibility},
     Extract,
 };
 use bevy_transform::components::GlobalTransform;
-use bevy_utils::{tracing::error, Entry, HashMap, Parallel};
+use bevy_utils::{tracing::error, tracing::warn, Entry, HashMap, Parallel};
 
 #[cfg(debug_assertions)]
 use bevy_utils::warn_once;
@@ -139,6 +143,7 @@ impl Plugin for MeshRenderPlugin {
             .init_resource::<SkinIndices>()
             .init_resource::<MorphUniform>()
             .init_resource::<MorphIndices>()
+            .init_resource::<MeshCullingDataBuffer>()
             .add_systems(
                 ExtractSchedule,
                 (
@@ -167,9 +172,12 @@ impl Plugin for MeshRenderPlugin {
         let mut mesh_bindings_shader_defs = Vec::with_capacity(1);
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
-            let render_device = render_app.world().resource::<RenderDevice>();
+            render_app.init_resource::<GpuPreprocessingSupport>();
+
+            let gpu_preprocessing_support =
+                render_app.world().resource::<GpuPreprocessingSupport>();
             let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder
-                && gpu_preprocessing::can_preprocess_on_gpu(render_device);
+                && *gpu_preprocessing_support != GpuPreprocessingSupport::None;
 
             let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder);
             render_app.insert_resource(render_mesh_instances);
@@ -209,6 +217,8 @@ impl Plugin for MeshRenderPlugin {
             );
         };
 
+        let indirect_parameters_buffer = IndirectParametersBuffer::new();
+
         let render_device = render_app.world().resource::<RenderDevice>();
         if let Some(per_object_buffer_batch_size) =
             GpuArrayBuffer::<MeshUniform>::batch_size(render_device)
@@ -219,7 +229,9 @@ impl Plugin for MeshRenderPlugin {
             ));
         }
 
-        render_app.init_resource::<MeshPipeline>();
+        render_app
+            .insert_resource(indirect_parameters_buffer)
+            .init_resource::<MeshPipeline>();
     }
 
     // Load the mesh_bindings shader module here as it depends on runtime information about
@@ -295,6 +307,29 @@ pub struct MeshInputUniform {
     pub previous_input_index: u32,
 }
 
+/// Information about each mesh instance needed to cull it on GPU.
+///
+/// This consists of its axis-aligned bounding box (AABB).
+#[derive(ShaderType, Pod, Zeroable, Clone, Copy)]
+#[repr(C)]
+pub struct MeshCullingData {
+    /// The 3D center of the AABB in model space, padded with an extra unused
+    /// float value.
+    pub aabb_center: Vec4,
+    /// The 3D extents of the AABB in model space, divided by two, padded with
+    /// an extra unused float value.
+    pub aabb_half_extents: Vec4,
+}
+
+/// A GPU buffer that holds the information needed to cull meshes on GPU.
+///
+/// At the moment, this simply holds each mesh's AABB.
+///
+/// To avoid wasting CPU time in the CPU culling case, this buffer will be empty
+/// if GPU culling isn't in use.
+#[derive(Resource, Deref, DerefMut)]
+pub struct MeshCullingDataBuffer(BufferVec<MeshCullingData>);
+
 impl MeshUniform {
     pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option<Rect>) -> Self {
         let (inverse_transpose_model_a, inverse_transpose_model_b) =
@@ -403,6 +438,57 @@ pub struct RenderMeshInstanceShared {
     pub flags: RenderMeshInstanceFlags,
 }
 
+/// Information that is gathered during the parallel portion of mesh extraction
+/// when GPU mesh uniform building is enabled.
+///
+/// From this, the [`MeshInputUniform`] and [`RenderMeshInstanceGpu`] are
+/// prepared.
+pub struct RenderMeshInstanceGpuBuilder {
+    /// Data that will be placed on the [`RenderMeshInstanceGpu`].
+    pub shared: RenderMeshInstanceShared,
+    /// The current transform.
+    pub transform: Affine3,
+    /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]:
+    ///
+    /// ```text
+    ///                         <--- MSB                   LSB --->
+    ///                         +---- min v ----+ +---- min u ----+
+    ///     lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu,
+    ///                         +---- max v ----+ +---- max u ----+
+    ///     lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU,
+    ///
+    /// (MSB: most significant bit; LSB: least significant bit.)
+    /// ```
+    pub lightmap_uv_rect: UVec2,
+    /// The index of the previous mesh input.
+    pub previous_input_index: Option<NonMaxU32>,
+    /// Various flags.
+    pub mesh_flags: MeshFlags,
+}
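The `MeshCullingData` struct above stores the AABB center and half-extents as `Vec4` with an unused fourth component. That padding is deliberate: WGSL's `vec3<f32>` has 16-byte alignment, so `Vec4` on the Rust side keeps the `bytemuck`-cast bytes and the shader's view of the buffer identical. An illustrative check, not part of the PR:

```rust
use bevy_math::Vec4;

/// Mirrors the layout choice above: two `Vec4`s occupy exactly 32 bytes with
/// no hidden padding, so a `Pod` byte copy and WGSL agree on every offset.
#[repr(C)]
struct MeshCullingDataLayout {
    aabb_center: Vec4,
    aabb_half_extents: Vec4,
}

#[test]
fn layout_is_tight() {
    assert_eq!(std::mem::size_of::<MeshCullingDataLayout>(), 32);
}
```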
+
+/// The per-thread queues used during [`extract_meshes_for_gpu_building`].
+///
+/// There are two varieties of these: one for when culling happens on CPU and
+/// one for when culling happens on GPU. Having the two varieties avoids wasting
+/// space if GPU culling is disabled.
+#[derive(Default)]
+pub enum RenderMeshInstanceGpuQueue {
+    /// The default value.
+    ///
+    /// This becomes [`RenderMeshInstanceGpuQueue::CpuCulling`] or
+    /// [`RenderMeshInstanceGpuQueue::GpuCulling`] once extraction starts.
+    #[default]
+    None,
+    /// The version of [`RenderMeshInstanceGpuQueue`] that omits the
+    /// [`MeshCullingData`], so that we don't waste space when GPU
+    /// culling is disabled.
+    CpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder)>),
+    /// The version of [`RenderMeshInstanceGpuQueue`] that contains the
+    /// [`MeshCullingData`], used when any view has GPU culling
+    /// enabled.
+    GpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder, MeshCullingData)>),
+}
+
 impl RenderMeshInstanceShared {
     fn from_components(
         previous_transform: Option<&PreviousGlobalTransform>,
@@ -494,41 +580,147 @@ impl RenderMeshInstances {
     }
 }
 
-pub(crate) trait RenderMeshInstancesTable {
-    /// Returns the ID of the mesh asset attached to the given entity, if any.
-    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>>;
+impl RenderMeshInstancesCpu {
+    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
+        self.get(&entity)
+            .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id)
+    }
 
-    /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a
-    /// mesh attached.
-    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData>;
+    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
+        self.get(&entity)
+            .map(|render_mesh_instance| RenderMeshQueueData {
+                shared: &render_mesh_instance.shared,
+                translation: render_mesh_instance.transforms.transform.translation,
+            })
+    }
 }
 
-impl RenderMeshInstancesTable for RenderMeshInstancesCpu {
+impl RenderMeshInstancesGpu {
     fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
-        self.get(&entity).map(|instance| instance.mesh_asset_id)
+        self.get(&entity)
+            .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id)
     }
 
     fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
-        self.get(&entity).map(|instance| RenderMeshQueueData {
-            shared: &instance.shared,
-            translation: instance.transforms.transform.translation,
-        })
+        self.get(&entity)
+            .map(|render_mesh_instance| RenderMeshQueueData {
+                shared: &render_mesh_instance.shared,
+                translation: render_mesh_instance.translation,
+            })
     }
 }
 
-impl RenderMeshInstancesTable for RenderMeshInstancesGpu {
-    /// Returns the ID of the mesh asset attached to the given entity, if any.
-    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
-        self.get(&entity).map(|instance| instance.mesh_asset_id)
+impl RenderMeshInstanceGpuQueue {
+    /// Clears out a [`RenderMeshInstanceGpuQueue`], creating or recreating it
+    /// as necessary.
+    ///
+    /// `any_gpu_culling` should be set to true if any view has GPU culling
+    /// enabled.
+    fn init(&mut self, any_gpu_culling: bool) {
+        match (any_gpu_culling, &mut *self) {
+            (true, RenderMeshInstanceGpuQueue::GpuCulling(queue)) => queue.clear(),
+            (true, _) => *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![]),
+            (false, RenderMeshInstanceGpuQueue::CpuCulling(queue)) => queue.clear(),
+            (false, _) => *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![]),
+        }
     }
 
-    /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a
-    /// mesh attached.
-    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
-        self.get(&entity).map(|instance| RenderMeshQueueData {
-            shared: &instance.shared,
-            translation: instance.translation,
-        })
+    /// Adds a new mesh to this queue.
+    fn push(
+        &mut self,
+        entity: Entity,
+        instance_builder: RenderMeshInstanceGpuBuilder,
+        culling_data_builder: Option<MeshCullingData>,
+    ) {
+        match (&mut *self, culling_data_builder) {
+            (&mut RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue), None) => {
+                queue.push((entity, instance_builder));
+            }
+            (
+                &mut RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue),
+                Some(culling_data_builder),
+            ) => {
+                queue.push((entity, instance_builder, culling_data_builder));
+            }
+            (_, None) => {
+                *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![(entity, instance_builder)]);
+            }
+            (_, Some(culling_data_builder)) => {
+                *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![(
+                    entity,
+                    instance_builder,
+                    culling_data_builder,
+                )]);
+            }
+        }
+    }
+}
+
+impl RenderMeshInstanceGpuBuilder {
+    /// Flushes this mesh instance to the [`RenderMeshInstanceGpu`] and
+    /// [`MeshInputUniform`] tables.
+    fn add_to(
+        self,
+        entity: Entity,
+        render_mesh_instances: &mut EntityHashMap<Entity, RenderMeshInstanceGpu>,
+        current_input_buffer: &mut BufferVec<MeshInputUniform>,
+    ) -> usize {
+        // Push the mesh input uniform.
+        let current_uniform_index = current_input_buffer.push(MeshInputUniform {
+            transform: self.transform.to_transpose(),
+            lightmap_uv_rect: self.lightmap_uv_rect,
+            flags: self.mesh_flags.bits(),
+            previous_input_index: match self.previous_input_index {
+                Some(previous_input_index) => previous_input_index.into(),
+                None => u32::MAX,
+            },
+        });
+
+        // Record the [`RenderMeshInstance`].
+        render_mesh_instances.insert(
+            entity,
+            RenderMeshInstanceGpu {
+                translation: self.transform.translation,
+                shared: self.shared,
+                current_uniform_index: (current_uniform_index as u32)
+                    .try_into()
+                    .unwrap_or_default(),
+            },
+        );
+
+        current_uniform_index
+    }
+}
+
+impl MeshCullingData {
+    /// Returns a new [`MeshCullingData`] initialized with the given AABB.
+    ///
+    /// If no AABB is provided, an infinitely-large one is conservatively
+    /// chosen.
+    fn new(aabb: Option<&Aabb>) -> Self {
+        match aabb {
+            Some(aabb) => MeshCullingData {
+                aabb_center: aabb.center.extend(0.0),
+                aabb_half_extents: aabb.half_extents.extend(0.0),
+            },
+            None => MeshCullingData {
+                aabb_center: Vec3::ZERO.extend(0.0),
+                aabb_half_extents: Vec3::INFINITY.extend(0.0),
+            },
+        }
+    }
+
+    /// Flushes this mesh instance culling data to the
+    /// [`MeshCullingDataBuffer`].
+    fn add_to(&self, mesh_culling_data_buffer: &mut MeshCullingDataBuffer) -> usize {
+        mesh_culling_data_buffer.push(*self)
+    }
+}
+
+impl Default for MeshCullingDataBuffer {
+    #[inline]
+    fn default() -> Self {
+        Self(BufferVec::new(BufferUsages::STORAGE))
+    }
 }
 
@@ -625,8 +817,8 @@ pub fn extract_meshes_for_cpu_building(
     render_mesh_instances.clear();
     for queue in render_mesh_instance_queues.iter_mut() {
-        for (k, v) in queue.drain(..) {
-            render_mesh_instances.insert_unique_unchecked(k, v);
+        for (entity, render_mesh_instance) in queue.drain(..) {
+            render_mesh_instances.insert_unique_unchecked(entity, render_mesh_instance);
         }
     }
 }
@@ -641,9 +833,8 @@ pub fn extract_meshes_for_gpu_building(
     mut batched_instance_buffers: ResMut<
         gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
     >,
-    mut render_mesh_instance_queues: Local<
-        Parallel<Vec<(Entity, RenderMeshInstanceShared, MeshInputUniform)>>,
-    >,
+    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
+    mut render_mesh_instance_queues: Local<Parallel<RenderMeshInstanceGpuQueue>>,
     meshes_query: Extract<
         Query<(
            Entity,
@@ -651,6 +842,7 @@ pub fn extract_meshes_for_gpu_building(
             &GlobalTransform,
             Option<&PreviousGlobalTransform>,
             Option<&Lightmap>,
+            Option<&Aabb>,
             &Handle<Mesh>,
             Has<NotShadowReceiver>,
             Has<TransmittedShadowReceiver>,
@@ -658,25 +850,22 @@ pub fn extract_meshes_for_gpu_building(
             Has<NoAutomaticBatching>,
         )>,
     >,
+    cameras_query: Extract<Query<(), (With<Camera>, With<GpuCulling>)>>,
 ) {
+    let any_gpu_culling = !cameras_query.is_empty();
+    for render_mesh_instance_queue in render_mesh_instance_queues.iter_mut() {
+        render_mesh_instance_queue.init(any_gpu_culling);
+    }
+
     // Collect render mesh instances. Build up the uniform buffer.
     let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances
     else {
         panic!(
-            "`collect_render_mesh_instances_for_gpu_building` should only be called if we're \
-            using GPU `MeshUniform` building"
+            "`extract_meshes_for_gpu_building` should only be called if we're \
+            using GPU `MeshUniform` building"
         );
     };
 
-    let gpu_preprocessing::BatchedInstanceBuffers {
-        ref mut current_input_buffer,
-        ref mut previous_input_buffer,
-        ..
-    } = *batched_instance_buffers;
-
-    // Swap buffers.
-    mem::swap(current_input_buffer, previous_input_buffer);
-
     meshes_query.par_iter().for_each_init(
         || render_mesh_instance_queues.borrow_local_mut(),
         |queue,
@@ -686,6 +875,7 @@ pub fn extract_meshes_for_gpu_building(
             transform,
             previous_transform,
             lightmap,
+            aabb,
             handle,
             not_shadow_receiver,
             transmitted_receiver,
@@ -706,54 +896,92 @@ pub fn extract_meshes_for_gpu_building(
                 no_automatic_batching,
             );
 
-            let previous_input_index = shared
-                .flags
-                .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM)
-                .then(|| {
-                    render_mesh_instances
-                        .get(&entity)
-                        .map(|render_mesh_instance| {
-                            render_mesh_instance.current_uniform_index.into()
-                        })
-                        .unwrap_or(u32::MAX)
-                })
-                .unwrap_or(u32::MAX);
-
             let lightmap_uv_rect =
                 lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect));
 
-            let affine3: Affine3 = (&transform.affine()).into();
-            queue.push((
-                entity,
+            let gpu_mesh_culling_data = any_gpu_culling.then(|| MeshCullingData::new(aabb));
+
+            let previous_input_index = if shared
+                .flags
+                .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM)
+            {
+                render_mesh_instances
+                    .get(&entity)
+                    .map(|render_mesh_instance| render_mesh_instance.current_uniform_index)
+            } else {
+                None
+            };
+
+            let gpu_mesh_instance_builder = RenderMeshInstanceGpuBuilder {
                 shared,
-                MeshInputUniform {
-                    flags: mesh_flags.bits(),
-                    lightmap_uv_rect,
-                    transform: affine3.to_transpose(),
-                    previous_input_index,
-                },
-            ));
+                transform: (&transform.affine()).into(),
+                lightmap_uv_rect,
+                mesh_flags,
+                previous_input_index,
+            };
+
+            queue.push(entity, gpu_mesh_instance_builder, gpu_mesh_culling_data);
         },
     );
+
+    collect_meshes_for_gpu_building(
+        render_mesh_instances,
+        &mut batched_instance_buffers,
+        &mut mesh_culling_data_buffer,
+        &mut render_mesh_instance_queues,
+    );
+}
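GPU culling is opt-in per view: the extraction system above only builds `MeshCullingData` when some camera carries the `GpuCulling` marker. A usage sketch; the exact spawn ergonomics are an assumption, since the diff shows only the render-world side, where views with `GpuCulling` select the culling variant of the preprocessing pipeline:

```rust
use bevy_core_pipeline::core_3d::Camera3dBundle;
use bevy_ecs::system::Commands;
use bevy_render::view::GpuCulling;

/// Opt a camera into GPU frustum culling by adding the marker component.
fn setup(mut commands: Commands) {
    commands.spawn((Camera3dBundle::default(), GpuCulling));
}
```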
+
+/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU
+/// mesh uniforms are built.
+fn collect_meshes_for_gpu_building(
+    render_mesh_instances: &mut RenderMeshInstancesGpu,
+    batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers<
+        MeshUniform,
+        MeshInputUniform,
+    >,
+    mesh_culling_data_buffer: &mut MeshCullingDataBuffer,
+    render_mesh_instance_queues: &mut Parallel<RenderMeshInstanceGpuQueue>,
+) {
+    // Collect render mesh instances. Build up the uniform buffer.
+    let gpu_preprocessing::BatchedInstanceBuffers {
+        ref mut current_input_buffer,
+        ref mut previous_input_buffer,
+        ..
+    } = batched_instance_buffers;
+
+    // Swap buffers.
+    mem::swap(current_input_buffer, previous_input_buffer);
+
     // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s.
     render_mesh_instances.clear();
+
     for queue in render_mesh_instance_queues.iter_mut() {
-        for (entity, shared, mesh_uniform) in queue.drain(..) {
-            let buffer_index = current_input_buffer.push(mesh_uniform);
-            let translation = vec3(
-                mesh_uniform.transform[0].w,
-                mesh_uniform.transform[1].w,
-                mesh_uniform.transform[2].w,
-            );
-            render_mesh_instances.insert_unique_unchecked(
-                entity,
-                RenderMeshInstanceGpu {
-                    shared,
-                    translation,
-                    current_uniform_index: NonMaxU32::new(buffer_index as u32).unwrap_or_default(),
-                },
-            );
+        match *queue {
+            RenderMeshInstanceGpuQueue::None => {
+                // This can only happen if the queue is empty.
+            }
+            RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue) => {
+                for (entity, mesh_instance_builder) in queue.drain(..) {
+                    mesh_instance_builder.add_to(
+                        entity,
+                        render_mesh_instances,
+                        current_input_buffer,
+                    );
+                }
+            }
+            RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue) => {
+                for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) {
+                    let instance_data_index = mesh_instance_builder.add_to(
+                        entity,
+                        render_mesh_instances,
+                        current_input_buffer,
+                    );
+                    let culling_data_index =
+                        mesh_culling_builder.add_to(mesh_culling_data_buffer);
+                    debug_assert_eq!(instance_data_index, culling_data_index);
+                }
+            }
         }
     }
 }
@@ -876,7 +1104,11 @@ impl MeshPipeline {
 }
 
 impl GetBatchData for MeshPipeline {
-    type Param = (SRes<RenderMeshInstances>, SRes<RenderLightmaps>);
+    type Param = (
+        SRes<RenderMeshInstances>,
+        SRes<RenderLightmaps>,
+        SRes<RenderAssets<Mesh>>,
+    );
     // The material bind group ID, the mesh ID, and the lightmap ID,
     // respectively.
     type CompareData = (MaterialBindGroupId, AssetId<Mesh>, Option<AssetId<Image>>);
     type BufferData = MeshUniform;
 
     fn get_batch_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -915,7 +1147,7 @@ impl GetFullBatchData for MeshPipeline {
     type BufferInputData = MeshInputUniform;
 
     fn get_index_and_compare_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
         // This should only be called during GPU building.
@@ -941,7 +1173,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_batch_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<Self::BufferData> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -960,7 +1192,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_index(
-        (mesh_instances, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<NonMaxU32> {
         // This should only be called during GPU building.
@@ -976,6 +1208,70 @@ impl GetFullBatchData for MeshPipeline {
             .get(&entity)
             .map(|entity| entity.current_uniform_index)
     }
+
+    fn get_batch_indirect_parameters_index(
+        (mesh_instances, _, meshes): &SystemParamItem<Self::Param>,
+        indirect_parameters_buffer: &mut IndirectParametersBuffer,
+        entity: Entity,
+        instance_index: u32,
+    ) -> Option<NonMaxU32> {
+        get_batch_indirect_parameters_index(
+            mesh_instances,
+            meshes,
+            indirect_parameters_buffer,
+            entity,
+            instance_index,
+        )
+    }
+}
+
+/// Pushes a set of [`IndirectParameters`] onto the [`IndirectParametersBuffer`]
+/// for the given mesh instance, and returns the index of those indirect
+/// parameters.
+fn get_batch_indirect_parameters_index(
+    mesh_instances: &RenderMeshInstances,
+    meshes: &RenderAssets<Mesh>,
+    indirect_parameters_buffer: &mut IndirectParametersBuffer,
+    entity: Entity,
+    instance_index: u32,
+) -> Option<NonMaxU32> {
+    // This should only be called during GPU building.
+    let RenderMeshInstances::GpuBuilding(ref mesh_instances) = *mesh_instances else {
+        error!(
+            "`get_batch_indirect_parameters_index` should never be called in CPU mesh uniform \
+            building mode"
+        );
+        return None;
+    };
+
+    let mesh_instance = mesh_instances.get(&entity)?;
+    let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
+
+    // Note that `IndirectParameters` covers both of these structures, even
+    // though they actually have distinct layouts. See the comment above that
+    // type for more information.
+    let indirect_parameters = match mesh.buffer_info {
+        GpuBufferInfo::Indexed {
+            count: index_count, ..
+        } => IndirectParameters {
+            vertex_or_index_count: index_count,
+            instance_count: 0,
+            first_vertex: 0,
+            base_vertex_or_first_instance: 0,
+            first_instance: instance_index,
+        },
+        GpuBufferInfo::NonIndexed => IndirectParameters {
+            vertex_or_index_count: mesh.vertex_count,
+            instance_count: 0,
+            first_vertex: 0,
+            base_vertex_or_first_instance: instance_index,
+            first_instance: instance_index,
+        },
+    };
+
+    (indirect_parameters_buffer.push(indirect_parameters) as u32)
+        .try_into()
+        .ok()
+}
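The helper above fills out a single `IndirectParameters` struct that stands in for both of `wgpu`'s indirect-draw argument layouts. For reference, the two GPU-side layouts it unions, with field names per wgpu's documented indirect formats:

```rust
/// Arguments consumed by `draw_indirect` (non-indexed draws).
#[repr(C)]
struct DrawIndirectArgs {
    vertex_count: u32,
    instance_count: u32,
    first_vertex: u32,
    first_instance: u32,
}

/// Arguments consumed by `draw_indexed_indirect` (indexed draws).
#[repr(C)]
struct DrawIndexedIndirectArgs {
    index_count: u32,
    instance_count: u32,
    first_index: u32,
    base_vertex: i32,
    first_instance: u32,
}
```

Because `instance_count` occupies the same word in both layouts, the compute shader can atomically bump it without knowing which draw type will eventually consume the parameters; only the meanings of the surrounding words differ, which is why the union struct names them `vertex_or_index_count` and `base_vertex_or_first_instance`.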
 
 bitflags::bitflags! {
@@ -1666,7 +1962,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
         let mut dynamic_offsets: [u32; 3] = Default::default();
         let mut offset_count = 0;
-        if let Some(dynamic_offset) = item.dynamic_offset() {
+        if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() {
             dynamic_offsets[offset_count] = dynamic_offset.get();
             offset_count += 1;
         }
@@ -1689,8 +1985,9 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
     type Param = (
         SRes<RenderAssets<Mesh>>,
         SRes<RenderMeshInstances>,
+        SRes<IndirectParametersBuffer>,
         SRes<PipelineCache>,
-        Option<SRes<PreprocessPipeline>>,
+        Option<SRes<PreprocessPipelines>>,
     );
     type ViewQuery = Has<PreprocessBindGroup>;
     type ItemQuery = ();
@@ -1699,25 +1996,15 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
         item: &P,
         has_preprocess_bind_group: ROQueryItem<Self::ViewQuery>,
         _item_query: Option<()>,
-        (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem<
-            'w,
-            '_,
-            Self::Param,
-        >,
+        (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
         // If we're using GPU preprocessing, then we're dependent on that
         // compute shader having been run, which of course can only happen if
         // it's compiled. Otherwise, our mesh instance data won't be present.
-        if let Some(preprocess_pipeline) = preprocess_pipeline {
+        if let Some(preprocess_pipelines) = preprocess_pipelines {
             if !has_preprocess_bind_group
-                || !preprocess_pipeline
-                    .pipeline_id
-                    .is_some_and(|preprocess_pipeline_id| {
-                        pipeline_cache
-                            .get_compute_pipeline(preprocess_pipeline_id)
-                            .is_some()
-                    })
+                || !preprocess_pipelines.pipelines_are_loaded(&pipeline_cache)
             {
                 return RenderCommandResult::Failure;
             }
@@ -1725,6 +2012,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
 
         let meshes = meshes.into_inner();
         let mesh_instances = mesh_instances.into_inner();
+        let indirect_parameters_buffer = indirect_parameters_buffer.into_inner();
 
         let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
             return RenderCommandResult::Failure;
@@ -1733,9 +2021,26 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
             return RenderCommandResult::Failure;
         };
 
+        // Calculate the indirect offset, and look up the buffer.
+        let indirect_parameters = match item.extra_index().as_indirect_parameters_index() {
+            None => None,
+            Some(index) => match indirect_parameters_buffer.buffer() {
+                None => {
+                    warn!("Not rendering mesh because indirect parameters buffer wasn't present");
+                    return RenderCommandResult::Failure;
+                }
+                Some(buffer) => Some((
+                    index as u64 * mem::size_of::<IndirectParameters>() as u64,
+                    buffer,
+                )),
+            },
+        };
+
         pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..));
 
         let batch_range = item.batch_range();
+
+        // Draw either directly or indirectly, as appropriate.
         match &gpu_mesh.buffer_info {
             GpuBufferInfo::Indexed {
                 buffer,
@@ -1743,11 +2048,25 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                 count,
             } => {
                 pass.set_index_buffer(buffer.slice(..), 0, *index_format);
-                pass.draw_indexed(0..*count, 0, batch_range.clone());
-            }
-            GpuBufferInfo::NonIndexed => {
-                pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
+                match indirect_parameters {
+                    None => {
+                        pass.draw_indexed(0..*count, 0, batch_range.clone());
+                    }
+                    Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass
+                        .draw_indexed_indirect(
+                            indirect_parameters_buffer,
+                            indirect_parameters_offset,
+                        ),
+                }
             }
+            GpuBufferInfo::NonIndexed => match indirect_parameters {
+                None => {
+                    pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
+                }
+                Some((indirect_parameters_offset, indirect_parameters_buffer)) => {
+                    pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset);
+                }
+            },
         }
         RenderCommandResult::Success
     }
diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
index c4adaa5105623..614c9091a6e2a 100644
--- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -9,6 +9,7 @@
 
 #import bevy_pbr::mesh_types::Mesh
 #import bevy_render::maths
+#import bevy_render::view::View
 
 // Per-frame data that the CPU supplies to the GPU.
 struct MeshInput {
@@ -23,15 +24,45 @@ struct MeshInput {
     previous_input_index: u32,
 }
 
+// Information about each mesh instance needed to cull it on GPU.
+//
+// At the moment, this just consists of its axis-aligned bounding box (AABB).
+struct MeshCullingData {
+    // The 3D center of the AABB in model space, padded with an extra unused
+    // float value.
+    aabb_center: vec4<f32>,
+    // The 3D extents of the AABB in model space, divided by two, padded with
+    // an extra unused float value.
+    aabb_half_extents: vec4<f32>,
+}
+
 // One invocation of this compute shader: i.e. one mesh instance in a view.
 struct PreprocessWorkItem {
     // The index of the `MeshInput` in the `current_input` buffer that we read
     // from.
     input_index: u32,
-    // The index of the `Mesh` in `output` that we write to.
+    // In direct mode, the index of the `Mesh` in `output` that we write to. In
+    // indirect mode, the index of the `IndirectParameters` in
+    // `indirect_parameters` that we write to.
     output_index: u32,
 }
 
+// The `wgpu` indirect parameters structure. This is a union of two structures.
+// For more information, see the corresponding comment in
+// `gpu_preprocessing.rs`.
+struct IndirectParameters {
+    // `vertex_count` or `index_count`.
+    data0: u32,
+    // `instance_count` in both structures.
+    instance_count: atomic<u32>,
+    // `first_vertex` in both structures.
+    first_vertex: u32,
+    // `first_instance` or `base_vertex`.
+    data1: u32,
+    // A read-only copy of `instance_index`.
+    instance_index: u32,
+}
+
 // The current frame's `MeshInput`.
 @group(0) @binding(0) var<storage> current_input: array<MeshInput>;
 // The `MeshInput` values from the previous frame.
@@ -43,20 +74,82 @@ struct PreprocessWorkItem {
 // The output array of `Mesh`es.
 @group(0) @binding(3) var<storage, read_write> output: array<Mesh>;
 
+#ifdef INDIRECT
+// The array of indirect parameters for drawcalls.
+@group(0) @binding(4) var<storage, read_write> indirect_parameters: array<IndirectParameters>;
+#endif
+
+#ifdef FRUSTUM_CULLING
+// Data needed to cull the meshes.
+//
+// At the moment, this consists only of AABBs.
+@group(0) @binding(5) var<storage> mesh_culling_data: array<MeshCullingData>;
+
+// The view data, including the view matrix.
+@group(0) @binding(6) var<uniform> view: View;
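The WGSL function below tests an oriented bounding box against the frustum using the "relative radius" trick: projecting the box's half-extents onto each plane normal (via the model matrix columns) yields the radius of a sphere that conservatively bounds the box with respect to that plane, so the cheap sphere-vs-plane test suffices. A CPU mirror of one plane check, shown only to make the math legible (names are illustrative; `plane` is `(n, d)` with the normal pointing into the frustum):

```rust
use bevy_math::{Mat4, Vec3, Vec4};

/// Returns true if the OBB lies entirely outside the given frustum plane.
/// `center_world.w` must be 1.0 so the dot product picks up the plane's
/// `d` offset, matching the WGSL below.
fn obb_outside_plane(plane: Vec4, model: Mat4, center_world: Vec4, half_extents: Vec3) -> bool {
    // Half-extents projected onto the plane normal, in world space.
    let relative_radius = Vec3::new(
        plane.dot(model.col(0)).abs(),
        plane.dot(model.col(1)).abs(),
        plane.dot(model.col(2)).abs(),
    )
    .dot(half_extents);
    plane.dot(center_world) + relative_radius < 0.0
}
```

Note that the shader loops over only five of the six frustum planes, presumably matching Bevy's convention of an infinite far plane that can never reject anything.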
+fn view_frustum_intersects_obb( + model: mat4x4<f32>, + aabb_center: vec4<f32>, + aabb_half_extents: vec3<f32>, +) -> bool { + + for (var i = 0; i < 5; i += 1) { + // Calculate relative radius of the sphere associated with this plane. + let plane_normal = view.frustum[i]; + let relative_radius = dot( + abs( + vec3( + dot(plane_normal, model[0]), + dot(plane_normal, model[1]), + dot(plane_normal, model[2]), + ) + ), + aabb_half_extents + ); + + // Check the frustum plane. + if (!maths::sphere_intersects_plane_half_space( + plane_normal, aabb_center, relative_radius)) { + return false; + } + } + + return true; +} +#endif + @compute @workgroup_size(64) fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) { + // Figure out our instance index. If this thread doesn't correspond to any + // index, bail. let instance_index = global_invocation_id.x; if (instance_index >= arrayLength(&work_items)) { return; } // Unpack. - let mesh_index = work_items[instance_index].input_index; + let input_index = work_items[instance_index].input_index; let output_index = work_items[instance_index].output_index; - let model_affine_transpose = current_input[mesh_index].model; + let model_affine_transpose = current_input[input_index].model; let model = maths::affine3_to_square(model_affine_transpose); + // Cull if necessary. +#ifdef FRUSTUM_CULLING + let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; + let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz; + + // Do an OBB-based frustum cull. + let model_center = model * vec4(aabb_center, 1.0); + if (!view_frustum_intersects_obb(model, model_center, aabb_half_extents)) { + return; + } +#endif + // Calculate inverse transpose. let inverse_transpose_model = transpose(maths::inverse_affine3(transpose( model_affine_transpose))); @@ -68,7 +161,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) { let inverse_transpose_model_b = inverse_transpose_model[2].z; // Look up the previous model matrix. - let previous_input_index = current_input[mesh_index].previous_input_index; + let previous_input_index = current_input[input_index].previous_input_index; var previous_model: mat3x4<f32>; if (previous_input_index == 0xffffffff) { previous_model = model_affine_transpose; @@ -76,11 +169,21 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) { previous_model = previous_input[previous_input_index].model; } + // Figure out the output index. In indirect mode, this involves bumping the + // instance index in the indirect parameters structure. Otherwise, this + // index was directly supplied to us. +#ifdef INDIRECT + let mesh_output_index = indirect_parameters[output_index].instance_index + + atomicAdd(&indirect_parameters[output_index].instance_count, 1u); +#else + let mesh_output_index = output_index; +#endif + + // Write the output.
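To make the relative-radius test above easier to follow, here is a CPU-side sketch of the same OBB-versus-frustum check, written against `glam` (which `bevy_math` re-exports). This is only an illustration of the math; the real test runs in the compute shader above, and the five-plane array is assumed to match the layout of `view.frustum`:

```rust
use glam::{Mat4, Vec3, Vec4};

/// True if any part of a sphere lies on the positive (inside) half-space of a
/// plane. `center.w` must be 1.0 so the plane's `d` term enters the dot product.
fn sphere_intersects_plane_half_space(plane: Vec4, center: Vec4, radius: f32) -> bool {
    plane.dot(center) + radius > 0.0
}

/// CPU-side mirror of `view_frustum_intersects_obb` above (a sketch only).
fn frustum_intersects_obb(
    frustum: &[Vec4; 5],
    model: Mat4,
    world_center: Vec4, // model * vec4(aabb_center, 1.0)
    half_extents: Vec3,
) -> bool {
    frustum.iter().all(|&plane| {
        // Project the box's model-space axes onto the plane normal: the result
        // is the radius of the sphere that bounds the OBB "as seen" by this
        // plane, reducing the OBB test to a cheap sphere-plane test.
        let relative_radius = Vec3::new(
            plane.dot(model.x_axis).abs(),
            plane.dot(model.y_axis).abs(),
            plane.dot(model.z_axis).abs(),
        )
        .dot(half_extents);
        sphere_intersects_plane_half_space(plane, world_center, relative_radius)
    })
}
```

Note also how the indirect path builds its draws: each instance that survives this test reserves an output slot with the `atomicAdd` on `instance_count`, which compacts the surviving instances and accumulates the drawcall's instance count in a single step.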
- output[output_index].model = model_affine_transpose; - output[output_index].previous_model = previous_model; - output[output_index].inverse_transpose_model_a = inverse_transpose_model_a; - output[output_index].inverse_transpose_model_b = inverse_transpose_model_b; - output[output_index].flags = current_input[mesh_index].flags; - output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect; + output[mesh_output_index].model = model_affine_transpose; + output[mesh_output_index].previous_model = previous_model; + output[mesh_output_index].inverse_transpose_model_a = inverse_transpose_model_a; + output[mesh_output_index].inverse_transpose_model_b = inverse_transpose_model_b; + output[mesh_output_index].flags = current_input[input_index].flags; + output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect; } diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 22bebd8bf94e9..661ff7f301ed9 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -1,29 +1,74 @@ //! Batching functionality when GPU preprocessing is in use. -use std::marker::PhantomData; - +use bevy_app::{App, Plugin}; +use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ entity::Entity, - query::With, + query::{Has, With}, + schedule::IntoSystemConfigs as _, system::{Query, Res, ResMut, Resource, StaticSystemParam}, + world::{FromWorld, World}, }; use bevy_encase_derive::ShaderType; use bevy_utils::EntityHashMap; use bytemuck::{Pod, Zeroable}; +use nonmax::NonMaxU32; use smallvec::smallvec; -use wgpu::{BindingResource, BufferUsages}; +use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features}; use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, - SortedPhaseItem, SortedRenderPhase, + PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices, }, - render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec}, - renderer::{RenderDevice, RenderQueue}, - view::ViewTarget, + render_resource::{BufferVec, GpuArrayBufferable, UninitBufferVec}, + renderer::{RenderAdapter, RenderDevice, RenderQueue}, + view::{GpuCulling, ViewTarget}, + Render, RenderApp, RenderSet, }; -use super::GetFullBatchData; +use super::{BatchMeta, GetBatchData, GetFullBatchData}; + +pub struct BatchingPlugin; + +impl Plugin for BatchingPlugin { + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems( + Render, + write_indirect_parameters_buffer.in_set(RenderSet::PrepareResourcesFlush), + ); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.init_resource::(); + } +} + +/// Records whether GPU preprocessing and/or GPU culling are supported on the +/// device. +/// +/// No GPU preprocessing is supported on WebGL because of the lack of compute +/// shader support. GPU preprocessing is supported on DirectX 12, but due to [a +/// `wgpu` limitation] GPU culling is not. +/// +/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471 +#[derive(Clone, Copy, PartialEq, Resource)] +pub enum GpuPreprocessingSupport { + /// No GPU preprocessing support is available at all. + None, + /// GPU preprocessing is available, but GPU culling isn't. 
+ PreprocessingOnly, + /// Both GPU preprocessing and GPU culling are available. + Culling, +} /// The GPU buffers holding the data needed to render batches. /// @@ -51,7 +96,7 @@ where /// corresponds to each instance. /// /// This is keyed off each view. Each view has a separate buffer. - pub work_item_buffers: EntityHashMap<Entity, BufferVec<PreprocessWorkItem>>, + pub work_item_buffers: EntityHashMap<Entity, PreprocessWorkItemBuffer>, /// The uniform data inputs for the current frame. /// @@ -68,6 +113,14 @@ where pub previous_input_buffer: BufferVec<BDI>, } +/// The buffer of GPU preprocessing work items for a single view. +pub struct PreprocessWorkItemBuffer { + /// The buffer of work items. + pub buffer: BufferVec<PreprocessWorkItem>, + /// True if we're using GPU culling. + pub gpu_culling: bool, +} + /// One invocation of the preprocessing shader: i.e. one mesh instance in a /// view. #[derive(Clone, Copy, Pod, Zeroable, ShaderType)] @@ -76,10 +129,114 @@ pub struct PreprocessWorkItem { /// The index of the batch input data in the input buffer that the shader /// reads from. pub input_index: u32, - /// The index of the `MeshUniform` in the output buffer that we write to. + /// In direct mode, this is the index of the `MeshUniform` in the output + /// buffer that we write to. In indirect mode, this is the index of the + /// [`IndirectParameters`]. pub output_index: u32, } +/// The `wgpu` indirect parameters structure. +/// +/// This is actually a union of the two following structures: +/// +/// ``` +/// #[repr(C)] +/// struct ArrayIndirectParameters { +/// vertex_count: u32, +/// instance_count: u32, +/// first_vertex: u32, +/// first_instance: u32, +/// } +/// +/// #[repr(C)] +/// struct ElementIndirectParameters { +/// index_count: u32, +/// instance_count: u32, +/// first_vertex: u32, +/// base_vertex: u32, +/// first_instance: u32, +/// } +/// ``` +/// +/// We actually generally treat these two variants identically in code. To do +/// that, we make the following two observations: +/// +/// 1. `instance_count` is in the same place in both structures. So we can +/// access it regardless of the structure we're looking at. +/// +/// 2. The second structure is one word larger than the first. Thus we need to +/// pad out the first structure by one word in order to place both structures in +/// an array. If we pad out `ArrayIndirectParameters` by copying the +/// `first_instance` field into the padding, then the resulting union structure +/// will always have a read-only copy of `first_instance` in the final word. We +/// take advantage of this in the shader to reduce branching. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct IndirectParameters { + /// For `ArrayIndirectParameters`, `vertex_count`; for + /// `ElementIndirectParameters`, `index_count`. + pub vertex_or_index_count: u32, + + /// The number of instances we're going to draw. + /// + /// This field is in the same place in both structures. + pub instance_count: u32, + + /// The index of the first vertex we're to draw. + pub first_vertex: u32, + + /// For `ArrayIndirectParameters`, `first_instance`; for + /// `ElementIndirectParameters`, `base_vertex`. + pub base_vertex_or_first_instance: u32, + + /// For `ArrayIndirectParameters`, this is padding; for + /// `ElementIndirectParameters`, this is `first_instance`. + /// + /// Conventionally, we copy `first_instance` into this field when padding + /// out `ArrayIndirectParameters`. That way, shader code can read this value + /// at the same place, regardless of the specific structure this represents.
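Observation 2 above is easy to verify mechanically. A compile-time sketch, using the two structures exactly as written in the doc comment, shows that one padding word makes the layouts coincide:

```rust
#[repr(C)]
struct ArrayIndirectParameters {
    vertex_count: u32,
    instance_count: u32,
    first_vertex: u32,
    first_instance: u32,
}

#[repr(C)]
struct ElementIndirectParameters {
    index_count: u32,
    instance_count: u32,
    first_vertex: u32,
    base_vertex: u32,
    first_instance: u32,
}

// One extra word makes the two variants the same size, so both can live in a
// single `IndirectParameters` array with a common stride.
const _: () = assert!(
    core::mem::size_of::<ArrayIndirectParameters>() + core::mem::size_of::<u32>()
        == core::mem::size_of::<ElementIndirectParameters>()
);
```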
+ pub first_instance: u32, +} + +/// The buffer containing the list of [`IndirectParameters`], for draw commands. +#[derive(Resource, Deref, DerefMut)] +pub struct IndirectParametersBuffer(pub BufferVec); + +impl IndirectParametersBuffer { + /// Creates the indirect parameters buffer. + pub fn new() -> IndirectParametersBuffer { + IndirectParametersBuffer(BufferVec::new( + BufferUsages::STORAGE | BufferUsages::INDIRECT, + )) + } +} + +impl Default for IndirectParametersBuffer { + fn default() -> Self { + Self::new() + } +} + +impl FromWorld for GpuPreprocessingSupport { + fn from_world(world: &mut World) -> Self { + let adapter = world.resource::(); + let device = world.resource::(); + + if device.limits().max_compute_workgroup_size_x == 0 { + GpuPreprocessingSupport::None + } else if !device + .features() + .contains(Features::INDIRECT_FIRST_INSTANCE) || + !adapter.get_downlevel_capabilities().flags.contains( + DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW) + { + GpuPreprocessingSupport::PreprocessingOnly + } else { + GpuPreprocessingSupport::Culling + } + } +} + impl BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, @@ -110,7 +267,7 @@ where self.current_input_buffer.clear(); self.previous_input_buffer.clear(); for work_item_buffer in self.work_item_buffers.values_mut() { - work_item_buffer.clear(); + work_item_buffer.buffer.clear(); } } } @@ -125,6 +282,51 @@ where } } +/// Information about a render batch that we're building up during a sorted +/// render phase. +struct SortedRenderBatch +where + F: GetBatchData, +{ + /// The index of the first phase item in this batch in the list of phase + /// items. + phase_item_start_index: u32, + + /// The index of the first instance in this batch in the instance buffer. + instance_start_index: u32, + + /// The index of the indirect parameters for this batch in the + /// [`IndirectParametersBuffer`]. + /// + /// If CPU culling is being used, then this will be `None`. + indirect_parameters_index: Option, + + /// Metadata that can be used to determine whether an instance can be placed + /// into this batch. + meta: BatchMeta, +} + +impl SortedRenderBatch +where + F: GetBatchData, +{ + /// Finalizes this batch and updates the [`SortedRenderPhase`] with the + /// appropriate indices. + /// + /// `instance_end_index` is the index of the last instance in this batch + /// plus one. + fn flush(self, instance_end_index: u32, phase: &mut SortedRenderPhase) + where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + { + let (batch_range, batch_extra_index) = + phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut(); + *batch_range = self.instance_start_index..instance_end_index; + *batch_extra_index = + PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index); + } +} + /// A system that runs early in extraction and clears out all the /// [`BatchedInstanceBuffers`] for the frame. /// @@ -166,53 +368,126 @@ pub fn delete_old_work_item_buffers( /// is in use. This means comparing metadata needed to draw each phase item and /// trying to combine the draws into a batch. 
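For code elsewhere that needs to pick a path based on this detection, a minimal sketch (the system name is hypothetical; the resource and its variants are the ones defined above):

```rust
use bevy_ecs::system::Res;

// Hypothetical render-world system that branches on the detected support level.
fn choose_preprocessing_path(support: Res<GpuPreprocessingSupport>) {
    match *support {
        // No compute shaders (e.g. WebGL 2): build batch data on the CPU.
        GpuPreprocessingSupport::None => { /* CPU batching path */ }
        // Compute shaders, but no reliable first-instance indexing in indirect
        // draws (e.g. DirectX 12, per the `wgpu` limitation noted above):
        // build mesh uniforms on the GPU but keep culling on the CPU.
        GpuPreprocessingSupport::PreprocessingOnly => { /* hybrid path */ }
        // Full GPU-driven path: GPU transforms plus GPU frustum culling.
        GpuPreprocessingSupport::Culling => { /* indirect-draw path */ }
    }
}
```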
pub fn batch_and_prepare_sorted_render_phase( - gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, - mut views: Query<(Entity, &mut SortedRenderPhase)>, - param: StaticSystemParam, + gpu_array_buffer: ResMut>, + mut indirect_parameters_buffer: ResMut, + mut views: Query<(Entity, &mut SortedRenderPhase, Has)>, + system_param_item: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, GFBD: GetFullBatchData, { - let system_param_item = param.into_inner(); - // We only process GPU-built batch data in this function. let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_batched_instance_buffers.into_inner(); + } = gpu_array_buffer.into_inner(); - for (view, mut phase) in &mut views { + for (view, mut phase, gpu_culling) in &mut views { // Create the work item buffer if necessary. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + gpu_culling, + }); - super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { - let (input_index, compare_data) = - GFBD::get_index_and_compare_data(&system_param_item, item.entity())?; - let output_index = data_buffer.add() as u32; + // Walk through the list of phase items, building up batches as we go. + let mut batch: Option> = None; + for current_index in 0..phase.items.len() { + // Get the index of the input data, and comparison metadata, for + // this entity. + let current_batch_input_index = GFBD::get_index_and_compare_data( + &system_param_item, + phase.items[current_index].entity(), + ); + + // Unpack that index and metadata. Note that it's possible for index + // and/or metadata to not be present, which signifies that this + // entity is unbatchable. In that case, we break the batch here and + // otherwise ignore the phase item. + let (current_input_index, current_meta); + match current_batch_input_index { + Some((input_index, Some(current_compare_data))) => { + current_input_index = Some(input_index); + current_meta = Some(BatchMeta::new( + &phase.items[current_index], + current_compare_data, + )); + } + _ => { + current_input_index = None; + current_meta = None; + } + } - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, + // Determine if this entity can be included in the batch we're + // building up. + let can_batch = batch.as_ref().is_some_and(|batch| { + current_meta + .as_ref() + .is_some_and(|current_meta| batch.meta == *current_meta) }); - *item.batch_range_mut() = output_index..output_index + 1; + // Make space in the data buffer for this instance. + let current_entity = phase.items[current_index].entity(); + let output_index = data_buffer.add() as u32; + + // If we can't batch, break the existing batch and make a new one. + if !can_batch { + // Break a batch if we need to. + if let Some(batch) = batch.take() { + batch.flush(output_index, &mut phase); + } + + // Start a new batch. 
+ batch = current_meta.map(|meta| { + let indirect_parameters_index = if gpu_culling { + GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + current_entity, + output_index, + ) + } else { + None + }; + SortedRenderBatch { + phase_item_start_index: current_index as u32, + instance_start_index: output_index, + indirect_parameters_index, + meta, + } + }); + } - compare_data - }); + // Add a new preprocessing work item so that the preprocessing + // shader will copy the per-instance data over. + if let (Some(batch), Some(input_index)) = (batch.as_ref(), current_input_index.as_ref()) + { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: (*input_index).into(), + output_index: match batch.indirect_parameters_index { + Some(indirect_parameters_index) => indirect_parameters_index.into(), + None => output_index, + }, + }); + } + } + + // Flush the final batch if necessary. + if let Some(batch) = batch.take() { + batch.flush(data_buffer.len() as u32, &mut phase); + } } } /// Creates batches for a render phase that uses bins. pub fn batch_and_prepare_binned_render_phase( - gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, - mut views: Query<(Entity, &mut BinnedRenderPhase)>, + gpu_array_buffer: ResMut>, + mut indirect_parameters_buffer: ResMut, + mut views: Query<(Entity, &mut BinnedRenderPhase, Has)>, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, @@ -224,16 +499,20 @@ pub fn batch_and_prepare_binned_render_phase( ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_batched_instance_buffers.into_inner(); + } = gpu_array_buffer.into_inner(); - for (view, mut phase) in &mut views { + for (view, mut phase, gpu_culling) in &mut views { let phase = &mut *phase; // Borrow checker. // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + gpu_culling, + }); // Prepare batchables. 
@@ -245,19 +524,50 @@ pub fn batch_and_prepare_binned_render_phase( }; let output_index = data_buffer.add() as u32; - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, - }); - - batch - .get_or_insert(BinnedRenderPhaseBatch { - representative_entity: entity, - instance_range: output_index..output_index, - dynamic_offset: None, - }) - .instance_range - .end = output_index + 1; + match batch { + Some(ref mut batch) => { + batch.instance_range.end = output_index + 1; + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: batch + .extra_index + .as_indirect_parameters_index() + .unwrap_or(output_index), + }); + } + + None if gpu_culling => { + let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + entity, + output_index, + ); + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: indirect_parameters_index.unwrap_or_default().into(), + }); + batch = Some(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index + 1, + extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index( + indirect_parameters_index, + ), + }); + } + + None => { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + batch = Some(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index + 1, + extra_index: PhaseItemExtraIndex::NONE, + }); + } + } } if let Some(batch) = batch { @@ -274,18 +584,38 @@ pub fn batch_and_prepare_binned_render_phase( }; let output_index = data_buffer.add() as u32; - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, - }); - - unbatchables - .buffer_indices - .add(GpuArrayBufferIndex:: { - index: output_index, - dynamic_offset: None, - element_type: PhantomData, + if gpu_culling { + let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + entity, + output_index, + ) + .unwrap_or_default(); + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: indirect_parameters_index.into(), + }); + unbatchables + .buffer_indices + .add(UnbatchableBinnedEntityIndices { + instance_index: indirect_parameters_index.into(), + extra_index: PhaseItemExtraIndex::indirect_parameters_index( + indirect_parameters_index.into(), + ), + }); + } else { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, }); + unbatchables + .buffer_indices + .add(UnbatchableBinnedEntityIndices { + instance_index: output_index, + extra_index: PhaseItemExtraIndex::NONE, + }); + } } } } @@ -295,29 +625,34 @@ pub fn batch_and_prepare_binned_render_phase( pub fn write_batched_instance_buffers( render_device: Res, render_queue: Res, - mut gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, + gpu_array_buffer: ResMut>, ) where GFBD: GetFullBatchData, { - gpu_batched_instance_buffers - .data_buffer - .write_buffer(&render_device); - gpu_batched_instance_buffers - .current_input_buffer - .write_buffer(&render_device, &render_queue); + let BatchedInstanceBuffers { + ref mut data_buffer, + work_item_buffers: ref mut index_buffers, + ref mut current_input_buffer, + previous_input_buffer: _, + } = gpu_array_buffer.into_inner(); + + 
data_buffer.write_buffer(&render_device); + current_input_buffer.write_buffer(&render_device, &render_queue); // There's no need to write `previous_input_buffer`, as we wrote // that on the previous frame, and it hasn't changed. - for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { - work_item_buffer.write_buffer(&render_device, &render_queue); + for index_buffer in index_buffers.values_mut() { + index_buffer + .buffer + .write_buffer(&render_device, &render_queue); } } -/// Determines whether it's possible to run preprocessing on the GPU. -/// -/// Currently, this simply checks to see whether compute shaders are supported. -pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool { - render_device.limits().max_compute_workgroup_size_x > 0 +pub fn write_indirect_parameters_buffer( + render_device: Res, + render_queue: Res, + mut indirect_parameters_buffer: ResMut, +) { + indirect_parameters_buffer.write_buffer(&render_device, &render_queue); + indirect_parameters_buffer.clear(); } diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 3ce9aaf38bb10..0e1a6ada7ceb6 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -14,6 +14,8 @@ use crate::{ render_resource::{CachedRenderPipelineId, GpuArrayBufferable}, }; +use self::gpu_preprocessing::IndirectParametersBuffer; + pub mod gpu_preprocessing; pub mod no_gpu_preprocessing; @@ -52,7 +54,7 @@ impl BatchMeta { BatchMeta { pipeline_id: item.cached_pipeline(), draw_function_id: item.draw_function(), - dynamic_offset: item.dynamic_offset(), + dynamic_offset: item.extra_index().as_dynamic_offset(), user_data, } } @@ -133,6 +135,19 @@ pub trait GetFullBatchData: GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option; + + /// Pushes [`gpu_preprocessing::IndirectParameters`] necessary to draw this + /// batch onto the given [`IndirectParametersBuffer`], and returns its + /// index. + /// + /// This is only used if GPU culling is enabled (which requires GPU + /// preprocessing). + fn get_batch_indirect_parameters_index( + param: &SystemParamItem, + indirect_parameters_buffer: &mut IndirectParametersBuffer, + entity: Entity, + instance_index: u32, + ) -> Option; } /// Sorts a render phase that uses bins. 
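To make the new trait method concrete, here is a sketch of what an indexed-mesh implementation might look like. `lookup_index_count` is a hypothetical helper standing in for however the implementor fetches the mesh's index count; the real `bevy_pbr` implementation differs in its details:

```rust
// Inside some `impl GetFullBatchData for MyBatchSetup` (illustrative only).
fn get_batch_indirect_parameters_index(
    param: &SystemParamItem<Self::Param>,
    indirect_parameters_buffer: &mut IndirectParametersBuffer,
    entity: Entity,
    instance_index: u32,
) -> Option<NonMaxU32> {
    // Hypothetical helper; see the lead-in above.
    let index_count = lookup_index_count(param, entity)?;

    // Push one set of indirect parameters. `instance_count` starts at zero:
    // the preprocessing shader bumps it once per instance that survives
    // culling, so a fully culled batch draws nothing.
    let index = indirect_parameters_buffer.push(IndirectParameters {
        vertex_or_index_count: index_count,
        instance_count: 0,
        first_vertex: 0,
        base_vertex_or_first_instance: 0,
        // The shader adds its `atomicAdd` result to this to find each
        // instance's output slot.
        first_instance: instance_index,
    });
    NonMaxU32::new(index as u32)
}
```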
diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs index 3387243d13dbd..15dfa7842a009 100644 --- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs @@ -8,7 +8,7 @@ use wgpu::BindingResource; use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, - SortedPhaseItem, SortedRenderPhase, + PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, }, render_resource::{GpuArrayBuffer, GpuArrayBufferable}, renderer::{RenderDevice, RenderQueue}, @@ -79,8 +79,9 @@ pub fn batch_and_prepare_sorted_render_phase( let buffer_index = batched_instance_buffer.push(buffer_data); let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; + let (batch_range, extra_index) = item.batch_range_and_extra_index_mut(); + *batch_range = index..index + 1; + *extra_index = PhaseItemExtraIndex::maybe_dynamic_offset(buffer_index.dynamic_offset); compare_data }); @@ -90,13 +91,14 @@ pub fn batch_and_prepare_sorted_render_phase( /// Creates batches for a render phase that uses bins, when GPU batch data /// building isn't in use. pub fn batch_and_prepare_binned_render_phase( - mut buffer: ResMut>, + gpu_array_buffer: ResMut>, mut views: Query<&mut BinnedRenderPhase>, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, GFBD: GetFullBatchData, { + let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); for mut phase in &mut views { @@ -111,7 +113,7 @@ pub fn batch_and_prepare_binned_render_phase( else { continue; }; - let instance = buffer.push(buffer_data); + let instance = gpu_array_buffer.push(buffer_data); // If the dynamic offset has changed, flush the batch. // @@ -120,12 +122,15 @@ pub fn batch_and_prepare_binned_render_phase( // with no storage buffers. 
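The batch-extension rule used in the binned CPU path can be stated compactly: an instance may join the open batch only if it lands exactly at the end of the batch's instance range and carries the same extra index. A sketch of that predicate (a hypothetical helper; the real check is inlined just below):

```rust
use nonmax::NonMaxU32;

// Hypothetical predicate mirroring the inline check below.
fn can_extend(
    batch: &BinnedRenderPhaseBatch,
    instance_index: u32,
    dynamic_offset: Option<NonMaxU32>,
) -> bool {
    batch.instance_range.end == instance_index
        && batch.extra_index == PhaseItemExtraIndex::maybe_dynamic_offset(dynamic_offset)
}
```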
if !batch_set.last().is_some_and(|batch| { batch.instance_range.end == instance.index - && batch.dynamic_offset == instance.dynamic_offset + && batch.extra_index + == PhaseItemExtraIndex::maybe_dynamic_offset(instance.dynamic_offset) }) { batch_set.push(BinnedRenderPhaseBatch { representative_entity: entity, instance_range: instance.index..instance.index, - dynamic_offset: instance.dynamic_offset, + extra_index: PhaseItemExtraIndex::maybe_dynamic_offset( + instance.dynamic_offset, + ), }); } @@ -145,8 +150,8 @@ pub fn batch_and_prepare_binned_render_phase( else { continue; }; - let instance = buffer.push(buffer_data); - unbatchables.buffer_indices.add(instance); + let instance = gpu_array_buffer.push(buffer_data); + unbatchables.buffer_indices.add(instance.into()); } } } diff --git a/crates/bevy_render/src/camera/camera.rs b/crates/bevy_render/src/camera/camera.rs index 215aa2bb3911c..4d0f334f01d49 100644 --- a/crates/bevy_render/src/camera/camera.rs +++ b/crates/bevy_render/src/camera/camera.rs @@ -1,4 +1,5 @@ use crate::{ + batching::gpu_preprocessing::GpuPreprocessingSupport, camera::{CameraProjection, ManualTextureViewHandle, ManualTextureViews}, prelude::Image, primitives::Frustum, @@ -6,7 +7,9 @@ use crate::{ render_graph::{InternedRenderSubGraph, RenderSubGraph}, render_resource::TextureView, texture::GpuImage, - view::{ColorGrading, ExtractedView, ExtractedWindows, RenderLayers, VisibleEntities}, + view::{ + ColorGrading, ExtractedView, ExtractedWindows, GpuCulling, RenderLayers, VisibleEntities, + }, Extract, }; use bevy_asset::{AssetEvent, AssetId, Assets, Handle}; @@ -17,6 +20,7 @@ use bevy_ecs::{ entity::Entity, event::EventReader, prelude::With, + query::Has, reflect::ReflectComponent, system::{Commands, Query, Res, ResMut, Resource}, }; @@ -24,7 +28,7 @@ use bevy_math::{vec2, Dir3, Mat4, Ray3d, Rect, URect, UVec2, UVec4, Vec2, Vec3}; use bevy_reflect::prelude::*; use bevy_render_macros::ExtractComponent; use bevy_transform::components::GlobalTransform; -use bevy_utils::tracing::warn; +use bevy_utils::{tracing::warn, warn_once}; use bevy_utils::{HashMap, HashSet}; use bevy_window::{ NormalizedWindowRef, PrimaryWindow, Window, WindowCreated, WindowRef, WindowResized, @@ -827,9 +831,11 @@ pub fn extract_cameras( Option<&TemporalJitter>, Option<&RenderLayers>, Option<&Projection>, + Has, )>, >, primary_window: Extract>>, + gpu_preprocessing_support: Res, ) { let primary_window = primary_window.iter().next(); for ( @@ -844,6 +850,7 @@ pub fn extract_cameras( temporal_jitter, render_layers, projection, + gpu_culling, ) in query.iter() { let color_grading = *color_grading.unwrap_or(&ColorGrading::default()); @@ -915,6 +922,16 @@ pub fn extract_cameras( if let Some(perspective) = projection { commands.insert(perspective.clone()); } + + if gpu_culling { + if *gpu_preprocessing_support == GpuPreprocessingSupport::Culling { + commands.insert(GpuCulling); + } else { + warn_once!( + "GPU culling isn't supported on this platform; ignoring `GpuCulling`." 
+ ); + } + } } } } diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index 1ba6d9cbe371e..60d8ef648aa77 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -52,6 +52,7 @@ pub mod prelude { }; } +use batching::gpu_preprocessing::BatchingPlugin; use bevy_ecs::schedule::ScheduleBuildSettings; use bevy_utils::prelude::default; pub use extract_param::Extract; @@ -334,6 +335,7 @@ impl Plugin for RenderPlugin { MeshPlugin, GlobalsPlugin, MorphPlugin, + BatchingPlugin, )); app.init_resource::() diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl index 4070a8679a5b8..720e6bac46a08 100644 --- a/crates/bevy_render/src/maths.wgsl +++ b/crates/bevy_render/src/maths.wgsl @@ -52,6 +52,11 @@ fn inverse_affine3(affine: mat4x3) -> mat4x3 { return mat4x3(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3])); } +// Extracts the upper 3x3 portion of a 4x4 matrix. +fn mat4x4_to_mat3x3(m: mat4x4) -> mat3x3 { + return mat3x3(m[0].xyz, m[1].xyz, m[2].xyz); +} + // Creates an orthonormal basis given a Z vector and an up vector (which becomes // Y after orthonormalization). // @@ -64,3 +69,16 @@ fn orthonormalize(z_unnormalized: vec3, up: vec3) -> mat3x3 { let y_basis = cross(z_basis, x_basis); return mat3x3(x_basis, y_basis, z_basis); } + +// Returns true if any part of a sphere is on the positive side of a plane. +// +// `sphere_center.w` should be 1.0. +// +// This is used for frustum culling. +fn sphere_intersects_plane_half_space( + plane: vec4, + sphere_center: vec4, + sphere_radius: f32 +) -> bool { + return dot(plane, sphere_center) + sphere_radius > 0.0; +} diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 58e1fa550f4e2..a1125fa938e2c 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -51,7 +51,14 @@ use bevy_ecs::{ system::{lifetimeless::SRes, SystemParamItem}, }; use smallvec::SmallVec; -use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex}; +use std::{ + fmt::{self, Debug, Formatter}, + hash::Hash, + iter, + marker::PhantomData, + ops::Range, + slice::SliceIndex, +}; /// A collection of all rendering instructions, that will be executed by the GPU, for a /// single render phase for a single view. @@ -124,7 +131,7 @@ pub struct BinnedRenderPhaseBatch { /// /// Note that dynamic offsets are only used on platforms that don't support /// storage buffers. - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } /// Information about the unbatchable entities in a bin. @@ -133,7 +140,7 @@ pub(crate) struct UnbatchableBinnedEntities { pub(crate) entities: Vec, /// The GPU array buffer indices of each unbatchable binned entity. - pub(crate) buffer_indices: UnbatchableBinnedEntityBufferIndex, + pub(crate) buffer_indices: UnbatchableBinnedEntityIndexSet, } /// Stores instance indices and dynamic offsets for unbatchable entities in a @@ -145,7 +152,7 @@ pub(crate) struct UnbatchableBinnedEntities { /// platforms that aren't WebGL 2. #[derive(Default)] -pub(crate) enum UnbatchableBinnedEntityBufferIndex { +pub(crate) enum UnbatchableBinnedEntityIndexSet { /// There are no unbatchable entities in this bin (yet). #[default] NoEntities, @@ -155,26 +162,42 @@ pub(crate) enum UnbatchableBinnedEntityBufferIndex { /// /// This is the typical case on platforms other than WebGL 2. We special /// case this to avoid allocation on those platforms. 
- NoDynamicOffsets { + Sparse { /// The range of indices. instance_range: Range, + /// The index of the first indirect instance parameters. + /// + /// The other indices immediately follow these. + first_indirect_parameters_index: Option, }, /// Dynamic uniforms are present for unbatchable entities in this bin. /// /// We fall back to this on WebGL 2. - DynamicOffsets(Vec), + Dense(Vec), } /// The instance index and dynamic offset (if present) for an unbatchable entity. /// /// This is only useful on platforms that don't support storage buffers. #[derive(Clone, Copy)] -pub(crate) struct UnbatchableBinnedEntityDynamicOffset { +pub(crate) struct UnbatchableBinnedEntityIndices { /// The instance index. - instance_index: u32, - /// The dynamic offset, if present. - dynamic_offset: Option, + pub(crate) instance_index: u32, + /// The [`PhaseItemExtraIndex`], if present. + pub(crate) extra_index: PhaseItemExtraIndex, +} + +impl From> for UnbatchableBinnedEntityIndices +where + T: Clone + ShaderSize + WriteInto, +{ + fn from(value: GpuArrayBufferIndex) -> Self { + UnbatchableBinnedEntityIndices { + instance_index: value.index, + extra_index: PhaseItemExtraIndex::maybe_dynamic_offset(value.dynamic_offset), + } + } } impl BinnedRenderPhase @@ -227,7 +250,7 @@ where key.clone(), batch.representative_entity, batch.instance_range.clone(), - batch.dynamic_offset, + batch.extra_index, ); // Fetch the draw function. @@ -246,17 +269,26 @@ where let unbatchable_entities = &self.unbatchable_values[key]; for (entity_index, &entity) in unbatchable_entities.entities.iter().enumerate() { let unbatchable_dynamic_offset = match &unbatchable_entities.buffer_indices { - UnbatchableBinnedEntityBufferIndex::NoEntities => { + UnbatchableBinnedEntityIndexSet::NoEntities => { // Shouldn't happen… continue; } - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range } => { - UnbatchableBinnedEntityDynamicOffset { - instance_index: instance_range.start + entity_index as u32, - dynamic_offset: None, - } - } - UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref dynamic_offsets) => { + UnbatchableBinnedEntityIndexSet::Sparse { + instance_range, + first_indirect_parameters_index, + } => UnbatchableBinnedEntityIndices { + instance_index: instance_range.start + entity_index as u32, + extra_index: match first_indirect_parameters_index { + None => PhaseItemExtraIndex::NONE, + Some(first_indirect_parameters_index) => { + PhaseItemExtraIndex::indirect_parameters_index( + u32::from(*first_indirect_parameters_index) + + entity_index as u32, + ) + } + }, + }, + UnbatchableBinnedEntityIndexSet::Dense(ref dynamic_offsets) => { dynamic_offsets[entity_index] } }; @@ -266,7 +298,7 @@ where entity, unbatchable_dynamic_offset.instance_index ..(unbatchable_dynamic_offset.instance_index + 1), - unbatchable_dynamic_offset.dynamic_offset, + unbatchable_dynamic_offset.extra_index, ); // Fetch the draw function. @@ -300,6 +332,42 @@ where } } +impl UnbatchableBinnedEntityIndexSet { + /// Returns the [`UnbatchableBinnedEntityIndices`] for the given entity. + fn indices_for_entity_index( + &self, + entity_index: u32, + ) -> Option { + match self { + UnbatchableBinnedEntityIndexSet::NoEntities => None, + UnbatchableBinnedEntityIndexSet::Sparse { instance_range, .. 
} + if entity_index >= instance_range.len() as u32 => + { + None + } + UnbatchableBinnedEntityIndexSet::Sparse { + instance_range, + first_indirect_parameters_index: None, + } => Some(UnbatchableBinnedEntityIndices { + instance_index: instance_range.start + entity_index, + extra_index: PhaseItemExtraIndex::NONE, + }), + UnbatchableBinnedEntityIndexSet::Sparse { + instance_range, + first_indirect_parameters_index: Some(first_indirect_parameters_index), + } => Some(UnbatchableBinnedEntityIndices { + instance_index: instance_range.start + entity_index, + extra_index: PhaseItemExtraIndex::indirect_parameters_index( + u32::from(*first_indirect_parameters_index) + entity_index, + ), + }), + UnbatchableBinnedEntityIndexSet::Dense(ref indices) => { + indices.get(entity_index as usize).copied() + } + } + } +} + /// A convenient abstraction for adding all the systems necessary for a binned /// render phase to the render app. /// @@ -395,74 +463,62 @@ where } } -impl UnbatchableBinnedEntityBufferIndex { +impl UnbatchableBinnedEntityIndexSet { /// Adds a new entity to the list of unbatchable binned entities. - pub fn add(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex) - where - T: ShaderSize + WriteInto + Clone, - { - match (&mut *self, gpu_array_buffer_index.dynamic_offset) { - (UnbatchableBinnedEntityBufferIndex::NoEntities, None) => { - // This is the first entity we've seen, and we're not on WebGL - // 2. Initialize the fast path. - *self = UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { - instance_range: gpu_array_buffer_index.index - ..(gpu_array_buffer_index.index + 1), + pub fn add(&mut self, indices: UnbatchableBinnedEntityIndices) { + match self { + UnbatchableBinnedEntityIndexSet::NoEntities => { + if indices.extra_index.is_dynamic_offset() { + // This is the first entity we've seen, and we don't have + // compute shaders. Initialize an array. + *self = UnbatchableBinnedEntityIndexSet::Dense(vec![indices]); + } else { + // This is the first entity we've seen, and we have compute + // shaders. Initialize the fast path. + *self = UnbatchableBinnedEntityIndexSet::Sparse { + instance_range: indices.instance_index..indices.instance_index + 1, + first_indirect_parameters_index: indices + .extra_index + .as_indirect_parameters_index() + .and_then(|index| NonMaxU32::try_from(index).ok()), + } } } - (UnbatchableBinnedEntityBufferIndex::NoEntities, Some(dynamic_offset)) => { - // This is the first entity we've seen, and we're on WebGL 2. - // Initialize an array. - *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(vec![ - UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset: Some(dynamic_offset), - }, - ]); - } - - ( - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { - ref mut instance_range, - }, - None, - ) if instance_range.end == gpu_array_buffer_index.index => { + UnbatchableBinnedEntityIndexSet::Sparse { + ref mut instance_range, + first_indirect_parameters_index, + } if instance_range.end == indices.instance_index + && ((first_indirect_parameters_index.is_none() + && indices.extra_index == PhaseItemExtraIndex::NONE) + || first_indirect_parameters_index.is_some_and( + |first_indirect_parameters_index| { + Some( + u32::from(first_indirect_parameters_index) + instance_range.end + - instance_range.start, + ) == indices.extra_index.as_indirect_parameters_index() + }, + )) => + { // This is the normal case on non-WebGL 2. 
instance_range.end += 1; } - ( - UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref mut offsets), - dynamic_offset, - ) => { - // This is the normal case on WebGL 2. - offsets.push(UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset, - }); - } - - ( - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range }, - dynamic_offset, - ) => { + UnbatchableBinnedEntityIndexSet::Sparse { instance_range, .. } => { // We thought we were in non-WebGL 2 mode, but we got a dynamic // offset or non-contiguous index anyway. This shouldn't happen, // but let's go ahead and do the sensible thing anyhow: demote // the compressed `NoDynamicOffsets` field to the full // `DynamicOffsets` array. - let mut new_dynamic_offsets: Vec<_> = instance_range - .map(|instance_index| UnbatchableBinnedEntityDynamicOffset { - instance_index, - dynamic_offset: None, - }) + let new_dynamic_offsets = (0..instance_range.len() as u32) + .flat_map(|entity_index| self.indices_for_entity_index(entity_index)) + .chain(iter::once(indices)) .collect(); - new_dynamic_offsets.push(UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset, - }); - *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(new_dynamic_offsets); + *self = UnbatchableBinnedEntityIndexSet::Dense(new_dynamic_offsets); + } + + UnbatchableBinnedEntityIndexSet::Dense(ref mut dense_indices) => { + dense_indices.push(indices); } } } @@ -487,6 +543,7 @@ pub struct SortedRenderPhase where I: SortedPhaseItem, { + /// The items within this [`SortedRenderPhase`]. pub items: Vec, } @@ -604,8 +661,144 @@ pub trait PhaseItem: Sized + Send + Sync + 'static { fn batch_range(&self) -> &Range; fn batch_range_mut(&mut self) -> &mut Range; - fn dynamic_offset(&self) -> Option; - fn dynamic_offset_mut(&mut self) -> &mut Option; + /// Returns the [`PhaseItemExtraIndex`]. + /// + /// If present, this is either a dynamic offset or an indirect parameters + /// index. + fn extra_index(&self) -> PhaseItemExtraIndex; + + /// Returns a pair of mutable references to both the batch range and extra + /// index. + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex); +} + +/// The "extra index" associated with some [`PhaseItem`]s, alongside the +/// indirect instance index. +/// +/// Sometimes phase items require another index in addition to the range of +/// instances they already have. These can be: +/// +/// * The *dynamic offset*: a `wgpu` dynamic offset into the uniform buffer of +/// instance data. This is used on platforms that don't support storage +/// buffers, to work around uniform buffer size limitations. +/// +/// * The *indirect parameters index*: an index into the buffer that specifies +/// the indirect parameters for this [`PhaseItem`]'s drawcall. This is used when +/// indirect mode is on (as used for GPU culling). +/// +/// Note that our indirect draw functionality requires storage buffers, so it's +/// impossible to have both a dynamic offset and an indirect parameters index. +/// This convenient fact allows us to pack both indices into a single `u32`. 
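Because the packing scheme above underpins everything that follows, a quick illustration with made-up values (the equalities follow from the constants defined just below):

```rust
fn extra_index_packing_examples() {
    // Flag bit clear: the low 31 bits are a dynamic offset.
    assert_eq!(PhaseItemExtraIndex::dynamic_offset(256).0, 256);
    // Flag bit set: the low 31 bits are an indirect parameters index.
    assert_eq!(
        PhaseItemExtraIndex::indirect_parameters_index(7).0,
        (1 << 31) | 7
    );
    // All bits set is reserved to mean "no extra index at all".
    assert_eq!(PhaseItemExtraIndex::NONE.0, u32::MAX);
}
```

The same packing is what keeps the `Sparse` fast path above allocation-free: contiguous instance indices with contiguous indirect parameters indices collapse into one range. A worked example with hypothetical values (the index-set types are crate-private, so this is illustrative only):

```rust
fn sparse_fast_path_example() {
    let mut set = UnbatchableBinnedEntityIndexSet::NoEntities;
    for k in 0..3 {
        set.add(UnbatchableBinnedEntityIndices {
            instance_index: 10 + k,
            extra_index: PhaseItemExtraIndex::indirect_parameters_index(5 + k),
        });
    }
    // `set` is now `Sparse { instance_range: 10..13,
    // first_indirect_parameters_index: Some(5) }`: one range, no `Vec`.
}
```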
+#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct PhaseItemExtraIndex(pub u32); + +impl Debug for PhaseItemExtraIndex { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if self.is_dynamic_offset() { + write!(f, "DynamicOffset({})", self.offset()) + } else if self.is_indirect_parameters_index() { + write!(f, "IndirectParametersIndex({})", self.offset()) + } else { + write!(f, "None") + } + } +} + +impl PhaseItemExtraIndex { + /// The flag that indicates that this index is an indirect parameters + /// index. If not set, this is a dynamic offset. + pub const INDIRECT_PARAMETER_INDEX: u32 = 1 << 31; + /// To extract the index from a packed [`PhaseItemExtraIndex`], bitwise-and + /// the contents with this value. + pub const OFFSET_MASK: u32 = Self::INDIRECT_PARAMETER_INDEX - 1; + /// To extract the flag from a packed [`PhaseItemExtraIndex`], bitwise-and + /// the contents with this value. + pub const FLAGS_MASK: u32 = !Self::OFFSET_MASK; + + /// The special value that indicates that no extra index is present. + pub const NONE: PhaseItemExtraIndex = PhaseItemExtraIndex(u32::MAX); + + /// Returns either the indirect parameters index or the dynamic offset, + /// depending on which is in use. + #[inline] + fn offset(&self) -> u32 { + self.0 & Self::OFFSET_MASK + } + + /// Determines whether this extra index is a dynamic offset. + #[inline] + fn is_dynamic_offset(&self) -> bool { + *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) == 0 + } + + /// Determines whether this extra index is an indirect parameters index. + #[inline] + fn is_indirect_parameters_index(&self) -> bool { + *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) != 0 + } + + /// Packs an indirect parameters index into this extra index. + #[inline] + pub fn indirect_parameters_index(indirect_parameter_index: u32) -> PhaseItemExtraIndex { + // Make sure we didn't overflow. + debug_assert_eq!(indirect_parameter_index & Self::FLAGS_MASK, 0); + PhaseItemExtraIndex(indirect_parameter_index | Self::INDIRECT_PARAMETER_INDEX) + } + + /// Returns either an indirect parameters index or + /// [`PhaseItemExtraIndex::NONE`], as appropriate. + #[inline] + pub fn maybe_indirect_parameters_index( + maybe_indirect_parameters_index: Option<NonMaxU32>, + ) -> PhaseItemExtraIndex { + match maybe_indirect_parameters_index { + Some(indirect_parameters_index) => { + Self::indirect_parameters_index(indirect_parameters_index.into()) + } + None => PhaseItemExtraIndex::NONE, + } + } + + /// Packs a dynamic offset into this extra index. + #[inline] + pub fn dynamic_offset(dynamic_offset: u32) -> PhaseItemExtraIndex { + // Make sure we didn't overflow. + debug_assert_eq!(dynamic_offset & Self::FLAGS_MASK, 0); + + PhaseItemExtraIndex(dynamic_offset) + } + + /// Returns either a dynamic offset or [`PhaseItemExtraIndex::NONE`], as + /// appropriate. + #[inline] + pub fn maybe_dynamic_offset(maybe_dynamic_offset: Option<NonMaxU32>) -> PhaseItemExtraIndex { + match maybe_dynamic_offset { + Some(dynamic_offset) => Self::dynamic_offset(dynamic_offset.into()), + None => PhaseItemExtraIndex::NONE, + } + } + + /// If this extra index describes a dynamic offset, returns it; otherwise, + /// returns `None`. + #[inline] + pub fn as_dynamic_offset(&self) -> Option<NonMaxU32> { + if self.is_dynamic_offset() { + NonMaxU32::try_from(self.0 & Self::OFFSET_MASK).ok() + } else { + None + } + } + + /// If this extra index describes an indirect parameters index, returns it; + /// otherwise, returns `None`.
+ #[inline] + pub fn as_indirect_parameters_index(&self) -> Option { + if self.is_indirect_parameters_index() { + Some(self.0 & Self::OFFSET_MASK) + } else { + None + } + } } /// Represents phase items that are placed into bins. The `BinKey` specifies @@ -633,7 +826,7 @@ pub trait BinnedPhaseItem: PhaseItem { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self; } diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 8a0f77daafb3d..7ff60c1f44078 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -228,6 +228,11 @@ where self.len = 0; } + /// Returns the length of the buffer. + pub fn len(&self) -> usize { + self.len + } + /// Materializes the buffer on the GPU with space for `capacity` elements. /// /// If the buffer is already big enough, this function doesn't reallocate diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs index 1fef36c5844a0..bdeff34b0d407 100644 --- a/crates/bevy_render/src/view/mod.rs +++ b/crates/bevy_render/src/view/mod.rs @@ -208,6 +208,12 @@ pub struct PostProcessWrite<'a> { pub destination: &'a TextureView, } +#[derive(Component)] +pub struct GpuCulling; + +#[derive(Component)] +pub struct NoCpuCulling; + impl ViewTarget { pub const TEXTURE_FORMAT_HDR: TextureFormat = TextureFormat::Rgba16Float; diff --git a/crates/bevy_render/src/view/visibility/mod.rs b/crates/bevy_render/src/view/visibility/mod.rs index 72ba3c16a8681..f10d47b710ac7 100644 --- a/crates/bevy_render/src/view/visibility/mod.rs +++ b/crates/bevy_render/src/view/visibility/mod.rs @@ -20,6 +20,8 @@ use crate::{ primitives::{Aabb, Frustum, Sphere}, }; +use super::NoCpuCulling; + /// User indication of whether an entity is visible. Propagates down the entity hierarchy. 
/// /// If an entity is hidden in this way, all [`Children`] (and all of their children and so on) who @@ -397,6 +399,7 @@ pub fn check_visibility( &Frustum, Option<&RenderLayers>, &Camera, + Has, )>, mut visible_aabb_query: Query< ( @@ -413,7 +416,8 @@ pub fn check_visibility( ) where QF: QueryFilter + 'static, { - for (mut visible_entities, frustum, maybe_view_mask, camera) in &mut view_query { + for (mut visible_entities, frustum, maybe_view_mask, camera, no_cpu_culling) in &mut view_query + { if !camera.is_active { continue; } @@ -445,7 +449,7 @@ pub fn check_visibility( } // If we have an aabb, do frustum culling - if !no_frustum_culling { + if !no_frustum_culling && !no_cpu_culling { if let Some(model_aabb) = maybe_model_aabb { let model = transform.affine(); let model_sphere = Sphere { diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs index 682d2296817ff..2805840a67694 100644 --- a/crates/bevy_sprite/src/mesh2d/material.rs +++ b/crates/bevy_sprite/src/mesh2d/material.rs @@ -17,8 +17,8 @@ use bevy_render::{ prepare_assets, PrepareAssetError, RenderAsset, RenderAssetPlugin, RenderAssets, }, render_phase::{ - AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, - SetItemPipeline, SortedRenderPhase, TrackedRenderPass, + AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, + RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::{ AsBindGroup, AsBindGroupError, BindGroup, BindGroupId, BindGroupLayout, @@ -451,7 +451,7 @@ pub fn queue_material2d_meshes( sort_key: FloatOrd(mesh_z + material2d.depth_bias), // Batching is done in batch_and_prepare_render_phase batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 90a67f7517179..0dec7f0cbdac5 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -646,7 +646,7 @@ impl RenderCommand

for SetMesh2dBindGroup { ) -> RenderCommandResult { let mut dynamic_offsets: [u32; 1] = Default::default(); let mut offset_count = 0; - if let Some(dynamic_offset) = item.dynamic_offset() { + if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() { dynamic_offsets[offset_count] = dynamic_offset.get(); offset_count += 1; } diff --git a/crates/bevy_sprite/src/render/mod.rs b/crates/bevy_sprite/src/render/mod.rs index ff0a475af815c..f0f2b524a32a4 100644 --- a/crates/bevy_sprite/src/render/mod.rs +++ b/crates/bevy_sprite/src/render/mod.rs @@ -19,8 +19,8 @@ use bevy_math::{Affine3A, FloatOrd, Quat, Rect, Vec2, Vec4}; use bevy_render::{ render_asset::RenderAssets, render_phase::{ - DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, SetItemPipeline, - SortedRenderPhase, TrackedRenderPass, + DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, RenderCommandResult, + SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::{ binding_types::{sampler, texture_2d, uniform_buffer}, @@ -516,7 +516,7 @@ pub fn queue_sprites( sort_key, // batch_range and dynamic_offset will be calculated in prepare_sprites batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_ui/src/render/mod.rs b/crates/bevy_ui/src/render/mod.rs index d4c27a2141b33..881e6918fd88f 100644 --- a/crates/bevy_ui/src/render/mod.rs +++ b/crates/bevy_ui/src/render/mod.rs @@ -7,8 +7,12 @@ use bevy_core_pipeline::core_2d::graph::{Core2d, Node2d}; use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; use bevy_core_pipeline::{core_2d::Camera2d, core_3d::Camera3d}; use bevy_hierarchy::Parent; -use bevy_render::texture::GpuImage; -use bevy_render::{render_phase::PhaseItem, view::ViewVisibility, ExtractSchedule, Render}; +use bevy_render::{ + render_phase::{PhaseItem, PhaseItemExtraIndex}, + texture::GpuImage, + view::ViewVisibility, + ExtractSchedule, Render, +}; use bevy_sprite::{SpriteAssetEvents, TextureAtlas}; pub use pipeline::*; pub use render_pass::*; @@ -901,7 +905,7 @@ pub fn queue_uinodes( ), // batch_range will be calculated in prepare_uinodes batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_ui/src/render/render_pass.rs b/crates/bevy_ui/src/render/render_pass.rs index e398a46d93d24..d403a44bedc7f 100644 --- a/crates/bevy_ui/src/render/render_pass.rs +++ b/crates/bevy_ui/src/render/render_pass.rs @@ -15,7 +15,6 @@ use bevy_render::{ renderer::*, view::*, }; -use nonmax::NonMaxU32; pub struct UiPassNode { ui_view_query: QueryState< @@ -92,7 +91,7 @@ pub struct TransparentUi { pub pipeline: CachedRenderPipelineId, pub draw_function: DrawFunctionId, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for TransparentUi { @@ -117,13 +116,13 @@ impl PhaseItem for TransparentUi { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } diff --git a/crates/bevy_ui/src/render/ui_material_pipeline.rs b/crates/bevy_ui/src/render/ui_material_pipeline.rs index b4d13ac65ac13..b1beaaa5d11c6 100644 --- a/crates/bevy_ui/src/render/ui_material_pipeline.rs +++ 
b/crates/bevy_ui/src/render/ui_material_pipeline.rs @@ -671,7 +671,7 @@ pub fn queue_ui_material_nodes( entity.index(), ), batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/2d/mesh2d_manual.rs b/examples/2d/mesh2d_manual.rs index 9be4c11f8067a..343db4be1d153 100644 --- a/examples/2d/mesh2d_manual.rs +++ b/examples/2d/mesh2d_manual.rs @@ -13,7 +13,10 @@ use bevy::{ render::{ mesh::{GpuMesh, Indices, MeshVertexAttribute}, render_asset::{RenderAssetUsages, RenderAssets}, - render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase}, + render_phase::{ + AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, + SortedRenderPhase, + }, render_resource::{ BlendState, ColorTargetState, ColorWrites, Face, FragmentState, FrontFace, MultisampleState, PipelineCache, PolygonMode, PrimitiveState, PrimitiveTopology, @@ -392,7 +395,7 @@ pub fn queue_colored_mesh2d( sort_key: FloatOrd(mesh_z), // This material is not batched batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/3d/deferred_rendering.rs b/examples/3d/deferred_rendering.rs index 165d1de9098fc..6ee41a335a60b 100644 --- a/examples/3d/deferred_rendering.rs +++ b/examples/3d/deferred_rendering.rs @@ -7,9 +7,10 @@ use bevy::{ fxaa::Fxaa, prepass::{DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass}, }, - pbr::NotShadowReceiver, - pbr::{CascadeShadowConfigBuilder, DirectionalLightShadowMap}, - pbr::{DefaultOpaqueRendererMethod, NotShadowCaster, OpaqueRendererMethod}, + pbr::{ + CascadeShadowConfigBuilder, DefaultOpaqueRendererMethod, DirectionalLightShadowMap, + NotShadowCaster, NotShadowReceiver, OpaqueRendererMethod, + }, prelude::*, render::render_resource::TextureFormat, }; diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs index b1c91c8973b07..17fd7823c95ad 100644 --- a/examples/shader/shader_instancing.rs +++ b/examples/shader/shader_instancing.rs @@ -15,8 +15,8 @@ use bevy::{ mesh::{GpuBufferInfo, GpuMesh, MeshVertexBufferLayoutRef}, render_asset::RenderAssets, render_phase::{ - AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, - SetItemPipeline, SortedRenderPhase, TrackedRenderPass, + AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, + RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::*, renderer::RenderDevice, @@ -144,7 +144,7 @@ fn queue_custom( draw_function: draw_custom, distance: rangefinder.distance_translation(&mesh_instance.translation), batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/stress_tests/many_cubes.rs b/examples/stress_tests/many_cubes.rs index 8d583d0535e6a..a10ced7d0f74b 100644 --- a/examples/stress_tests/many_cubes.rs +++ b/examples/stress_tests/many_cubes.rs @@ -20,7 +20,7 @@ use bevy::{ batching::NoAutomaticBatching, render_asset::RenderAssetUsages, render_resource::{Extent3d, TextureDimension, TextureFormat}, - view::NoFrustumCulling, + view::{GpuCulling, NoCpuCulling, NoFrustumCulling}, }, window::{PresentMode, WindowResolution}, winit::{UpdateMode, WinitSettings}, @@ -51,7 +51,7 @@ struct Args { #[argh(option, default = "1")] mesh_count: usize, - /// whether to disable frustum culling. Stresses queuing and batching as all mesh material entities in the scene are always drawn. + /// whether to disable all frustum culling. 
Stresses queuing and batching as all mesh material entities in the scene are always drawn. #[argh(switch)] no_frustum_culling: bool, @@ -59,6 +59,14 @@ struct Args { #[argh(switch)] no_automatic_batching: bool, + /// whether to enable GPU culling. + #[argh(switch)] + gpu_culling: bool, + + /// whether to disable CPU culling. + #[argh(switch)] + no_cpu_culling: bool, + /// whether to enable directional light cascaded shadow mapping. #[argh(switch)] shadows: bool, @@ -172,7 +180,14 @@ fn setup( } // camera - commands.spawn(Camera3dBundle::default()); + let mut camera = commands.spawn(Camera3dBundle::default()); + if args.gpu_culling { + camera.insert(GpuCulling); + } + if args.no_cpu_culling { + camera.insert(NoCpuCulling); + } + // Inside-out box around the meshes onto which shadows are cast (though you cannot see them...) commands.spawn(( PbrBundle {
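Putting the new pieces together, here is a minimal sketch of opting a camera into the GPU-driven path, mirroring the `many_cubes` flags above (on unsupported platforms the `GpuCulling` marker is ignored, with the warning added in `extract_cameras`):

```rust
use bevy::prelude::*;
use bevy::render::view::{GpuCulling, NoCpuCulling};

fn setup(mut commands: Commands) {
    // Cull on the GPU, and skip the now-redundant CPU frustum check.
    commands
        .spawn(Camera3dBundle::default())
        .insert((GpuCulling, NoCpuCulling));
}
```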