diff --git a/src/Trixi.jl b/src/Trixi.jl
index 167e01189c8..57eb76ef2f6 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -25,11 +25,12 @@ const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", tru
 # (standard library packages first, other packages next, all of them sorted alphabetically)
 
 using Accessors: @reset
-using LinearAlgebra: LinearAlgebra, Diagonal, diag, dot, eigvals, mul!, norm, cross,
+using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, diag, dot, eigvals, mul!, norm,
+                     cross,
                      normalize, I,
                      UniformScaling, det
 using Printf: @printf, @sprintf, println
-using SparseArrays: AbstractSparseMatrix, AbstractSparseMatrixCSC, sparse, droptol!,
+using SparseArrays: SparseMatrixCSC, AbstractSparseMatrix, sparse, droptol!,
                     rowvals, nzrange, nonzeros
 
 # import @reexport now to make it available for further imports/exports
diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl
index 3d7aba9536f..d4779911cee 100644
--- a/src/solvers/dgmulti/dg.jl
+++ b/src/solvers/dgmulti/dg.jl
@@ -44,6 +44,22 @@ end
 
 @inline nelements(dg::DGMulti, cache) = size(cache.solution_container.u_values)[end]
 
+# Returns the components needed to iterate efficiently over the entries of either a
+# `SparseMatrixCSC` or `Adjoint{SparseMatrixCSC}`, for example when performing flux
+# differencing calculations. 
+# 
+# For `Adjoint{SparseMatrixCSC}` (used by `DGMultiFluxDiff`), since `parent(A)` is a 
+# `SparseMatrixCSC` stored in column-major order, iterating over its columns gives 
+# row-major access to `A`.
+# 
+# For `SparseMatrixCSC` (used by `DGMultiPeriodicFDSBP`, for example), `parent(A)` 
+# simply returns `A`. 
+@inline function sparse_operator_data(A::Union{<:SparseMatrixCSC,
+                                               <:Adjoint{<:Any, <:SparseMatrixCSC}})
+    A_base = parent(A)
+    return A_base, axes(A, 2), rowvals(A_base), nonzeros(A_base)
+end
+
 """
     eachdim(mesh)
 
@@ -370,7 +386,7 @@ function prolong2interfaces!(cache, u,
 end
 
 # CARE: This function requires that interpolation to quadrature points is performed before
-# to populate cache.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`.
+# to populate cache.solution_container.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`.
 # version for affine meshes
 @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh,
                                          have_nonconservative_terms::False, equations,
@@ -398,7 +414,7 @@ end
 end
 
 # CARE: This function requires that interpolation to quadrature points is performed before
-# to populate cache.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`.
+# to populate cache.solution_container.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`.
 # version for curved meshes
 @inline function volume_integral_kernel!(du, u, element,
                                          mesh::DGMultiMesh{NDIMS, <:NonAffine},
@@ -517,7 +533,7 @@ function calc_interface_flux!(cache, surface_integral::SurfaceIntegralWeakForm,
     return nothing
 end
 
-# assumes cache.flux_face_values is computed and filled with
+# assumes cache.solution_container.flux_face_values is computed and filled with
 # for polynomial discretizations, use dense LIFT matrix for surface contributions.
 function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations,
                                 surface_integral::SurfaceIntegralWeakForm,
diff --git a/src/solvers/dgmulti/flux_differencing.jl b/src/solvers/dgmulti/flux_differencing.jl
index 8d056fe1f36..3ba81d36ae7 100644
--- a/src/solvers/dgmulti/flux_differencing.jl
+++ b/src/solvers/dgmulti/flux_differencing.jl
@@ -5,234 +5,6 @@
 @muladd begin
 #! format: noindent
 
-#   hadamard_sum!(du, A,
-#                 flux_is_symmetric, volume_flux,
-#                 orientation_or_normal_direction, u, equations)
-#
-# Computes the flux difference ∑_j A[i, j] * f(u_i, u_j) and accumulates the result into `du`.
-# Called by `local_flux_differencing` to compute local contributions to flux differencing
-# volume integrals.
-#
-# - `du`, `u` are vectors
-# - `A` is the skew-symmetric flux differencing matrix
-# - `flux_is_symmetric` is a `Val{<:Bool}` indicating if f(u_i, u_j) = f(u_j, u_i)
-#
-# The matrix `A` can be either dense or sparse. In the latter case, you should
-# use the `adjoint` of a `SparseMatrixCSC` to mimic a `SparseMatrixCSR`, which
-# is more efficient for matrix vector products.
-
-# Version for dense operators and symmetric fluxes
-@inline function hadamard_sum!(du, A,
-                               flux_is_symmetric::True, volume_flux,
-                               orientation_or_normal_direction, u, equations)
-    row_ids, col_ids = axes(A)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for j in col_ids
-            # This routine computes only the upper-triangular part of the hadamard sum (A .* F).
-            # We avoid computing the lower-triangular part, and instead accumulate those contributions
-            # while computing the upper-triangular part (using the fact that A is skew-symmetric and F
-            # is symmetric).
-            if j > i
-                u_j = u[j]
-                AF_ij = 2 * A[i, j] *
-                        volume_flux(u_i, u_j, orientation_or_normal_direction,
-                                    equations)
-                du_i = du_i + AF_ij
-                du[j] = du[j] - AF_ij
-            end
-        end
-        du[i] = du_i
-    end
-end
-
-# Version for dense operators and non-symmetric fluxes
-@inline function hadamard_sum!(du, A,
-                               flux_is_symmetric::False, volume_flux,
-                               orientation::Integer, u, equations)
-    row_ids, col_ids = axes(A)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for j in col_ids
-            u_j = u[j]
-            f_ij = volume_flux(u_i, u_j, orientation, equations)
-            du_i = du_i + 2 * A[i, j] * f_ij
-        end
-        du[i] = du_i
-    end
-end
-
-@inline function hadamard_sum!(du, A,
-                               flux_is_symmetric::False, volume_flux,
-                               normal_direction::AbstractVector, u, equations)
-    row_ids, col_ids = axes(A)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for j in col_ids
-            u_j = u[j]
-            f_ij = volume_flux(u_i, u_j, normal_direction, equations)
-            du_i = du_i + 2 * A[i, j] * f_ij
-        end
-        du[i] = du_i
-    end
-end
-
-# Version for sparse operators and symmetric fluxes
-@inline function hadamard_sum!(du,
-                               A::LinearAlgebra.Adjoint{<:Any,
-                                                        <:AbstractSparseMatrixCSC},
-                               flux_is_symmetric::True, volume_flux,
-                               orientation_or_normal_direction, u, equations)
-    A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-    row_ids = axes(A, 2)
-    rows = rowvals(A_base)
-    vals = nonzeros(A_base)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for id in nzrange(A_base, i)
-            j = rows[id]
-            # This routine computes only the upper-triangular part of the hadamard sum (A .* F).
-            # We avoid computing the lower-triangular part, and instead accumulate those contributions
-            # while computing the upper-triangular part (using the fact that A is skew-symmetric and F
-            # is symmetric).
-            if j > i
-                u_j = u[j]
-                A_ij = vals[id]
-                AF_ij = 2 * A_ij *
-                        volume_flux(u_i, u_j, orientation_or_normal_direction,
-                                    equations)
-                du_i = du_i + AF_ij
-                du[j] = du[j] - AF_ij
-            end
-        end
-        du[i] = du_i
-    end
-end
-
-# Version for sparse operators and symmetric fluxes with curved meshes
-@inline function hadamard_sum!(du,
-                               A::LinearAlgebra.Adjoint{<:Any,
-                                                        <:AbstractSparseMatrixCSC},
-                               flux_is_symmetric::True, volume_flux,
-                               normal_directions::AbstractVector{<:AbstractVector},
-                               u, equations)
-    A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-    row_ids = axes(A, 2)
-    rows = rowvals(A_base)
-    vals = nonzeros(A_base)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for id in nzrange(A_base, i)
-            j = rows[id]
-            # This routine computes only the upper-triangular part of the hadamard sum (A .* F).
-            # We avoid computing the lower-triangular part, and instead accumulate those contributions
-            # while computing the upper-triangular part (using the fact that A is skew-symmetric and F
-            # is symmetric).
-            if j > i
-                u_j = u[j]
-                A_ij = vals[id]
-
-                # provably entropy stable de-aliasing of geometric terms
-                normal_direction = 0.5 * (getindex.(normal_directions, i) +
-                                    getindex.(normal_directions, j))
-
-                AF_ij = 2 * A_ij * volume_flux(u_i, u_j, normal_direction, equations)
-                du_i = du_i + AF_ij
-                du[j] = du[j] - AF_ij
-            end
-        end
-        du[i] = du_i
-    end
-end
-
-# TODO: DGMulti. Fix for curved meshes.
-# Version for sparse operators and non-symmetric fluxes
-@inline function hadamard_sum!(du,
-                               A::LinearAlgebra.Adjoint{<:Any,
-                                                        <:AbstractSparseMatrixCSC},
-                               flux_is_symmetric::False, volume_flux,
-                               normal_direction::AbstractVector, u, equations)
-    A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-    row_ids = axes(A, 2)
-    rows = rowvals(A_base)
-    vals = nonzeros(A_base)
-
-    for i in row_ids
-        u_i = u[i]
-        du_i = du[i]
-        for id in nzrange(A_base, i)
-            A_ij = vals[id]
-            j = rows[id]
-            u_j = u[j]
-            f_ij = volume_flux(u_i, u_j, normal_direction, equations)
-            du_i = du_i + 2 * A_ij * f_ij
-        end
-        du[i] = du_i
-    end
-end
-
-# For DGMulti implementations, we construct "physical" differentiation operators by taking linear
-# combinations of reference differentiation operators scaled by geometric change of variables terms.
-# We use a lazy evaluation of physical differentiation operators, so that we can compute linear
-# combinations of differentiation operators on-the-fly in an allocation-free manner.
-@inline function build_lazy_physical_derivative(element, orientation,
-                                                mesh::DGMultiMesh{1}, dg, cache,
-                                                operator_scaling = 1.0)
-    @unpack Qrst_skew = cache
-    @unpack rxJ = mesh.md
-    # ignore orientation
-    return LazyMatrixLinearCombo(Qrst_skew, operator_scaling .* (rxJ[1, element],))
-end
-
-@inline function build_lazy_physical_derivative(element, orientation,
-                                                mesh::DGMultiMesh{2}, dg, cache,
-                                                operator_scaling = 1.0)
-    @unpack Qrst_skew = cache
-    @unpack rxJ, sxJ, ryJ, syJ = mesh.md
-    if orientation == 1
-        return LazyMatrixLinearCombo(Qrst_skew,
-                                     operator_scaling .*
-                                     (rxJ[1, element], sxJ[1, element]))
-    else # if orientation == 2
-        return LazyMatrixLinearCombo(Qrst_skew,
-                                     operator_scaling .*
-                                     (ryJ[1, element], syJ[1, element]))
-    end
-end
-
-@inline function build_lazy_physical_derivative(element, orientation,
-                                                mesh::DGMultiMesh{3}, dg, cache,
-                                                operator_scaling = 1.0)
-    @unpack Qrst_skew = cache
-    @unpack rxJ, sxJ, txJ, ryJ, syJ, tyJ, rzJ, szJ, tzJ = mesh.md
-    if orientation == 1
-        return LazyMatrixLinearCombo(Qrst_skew,
-                                     operator_scaling .*
-                                     (rxJ[1, element], sxJ[1, element],
-                                      txJ[1, element]))
-    elseif orientation == 2
-        return LazyMatrixLinearCombo(Qrst_skew,
-                                     operator_scaling .*
-                                     (ryJ[1, element], syJ[1, element],
-                                      tyJ[1, element]))
-    else # if orientation == 3
-        return LazyMatrixLinearCombo(Qrst_skew,
-                                     operator_scaling .*
-                                     (rzJ[1, element], szJ[1, element],
-                                      tzJ[1, element]))
-    end
-end
-
 # Return the contravariant basis vector corresponding to the Cartesian
 # coordinate direction `orientation` in a given `element` of the `mesh`.
 # The contravariant basis vectors have entries `dx_i / dxhat_j` where
@@ -260,6 +32,17 @@ end
     return SVector{NDIMS}(view.(dxidxhatj[:, orientation], :, element))
 end
 
+# For Affine meshes, `get_contravariant_vector` returns an SVector of scalars (constant over the
+# element). The normal direction is the same for all node pairs.
+@inline get_normal_direction(normal_directions::AbstractVector, i, j) = normal_directions
+
+# For NonAffine meshes, `get_contravariant_vector` returns an SVector of per-node arrays.
+# We average the normals at nodes i and j for provably entropy-stable de-aliasing of geometric terms.
+@inline function get_normal_direction(normal_directions::AbstractVector{<:AbstractVector},
+                                      i, j)
+    return 0.5f0 * (getindex.(normal_directions, i) + getindex.(normal_directions, j))
+end
+
 # use hybridized SBP operators for general flux differencing schemes.
 function compute_flux_differencing_SBP_matrices(dg::DGMulti)
     return compute_flux_differencing_SBP_matrices(dg, has_sparse_operators(dg))
@@ -288,6 +71,24 @@ function compute_flux_differencing_SBP_matrices(dg::DGMultiFluxDiffSBP,
     return Qrst_skew
 end
 
+# Build element-to-element connectivity from face-to-face connectivity.
+# Used for smoothing of shock capturing blending parameters (see `apply_smoothing!`).
+function build_element_to_element_connectivity(mesh::DGMultiMesh, dg::DGMulti)
+    face_to_face_connectivity = mesh.md.FToF
+    element_to_element_connectivity = similar(face_to_face_connectivity)
+    for e in axes(face_to_face_connectivity, 2)
+        for f in axes(face_to_face_connectivity, 1)
+            neighbor_face_index = face_to_face_connectivity[f, e]
+            # Reverse-engineer element index from face index. Assumes all elements
+            # have the same number of faces.
+            neighbor_element_index = ((neighbor_face_index - 1) ÷ dg.basis.num_faces) +
+                                     1
+            element_to_element_connectivity[f, e] = neighbor_element_index
+        end
+    end
+    return element_to_element_connectivity
+end
+
 # For flux differencing SBP-type approximations, store solutions in Matrix{SVector{nvars}}.
 # This results in a slight speedup for `calc_volume_integral!`.
 function allocate_nested_array(uEltype, nvars, array_dimensions, dg::DGMultiFluxDiffSBP)
@@ -306,15 +107,15 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiffSBP,
 
     nvars = nvariables(equations)
     # Use an array of SVectors (chunks of `nvars` are contiguous in memory) to speed up flux differencing
-    fluxdiff_local_threaded = [zeros(SVector{nvars, uEltype}, rd.Nq)
-                               for _ in 1:Threads.maxthreadid()]
+    du_local_threaded = [zeros(SVector{nvars, uEltype}, rd.Nq)
+                         for _ in 1:Threads.maxthreadid()]
 
     solution_container = initialize_dgmulti_solution_container(mesh, equations, dg,
                                                                uEltype)
 
     return (; md, Qrst_skew, dxidxhatj = md.rstxyzJ,
             invJ = inv.(md.J), lift_scalings, inv_wq = inv.(rd.wq),
-            solution_container, fluxdiff_local_threaded)
+            solution_container, du_local_threaded)
 end
 
 # most general create_cache: works for `DGMultiFluxDiff{<:Polynomial}`
@@ -353,10 +154,10 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiff, RealT,
                              for _ in 1:Threads.maxthreadid()]
 
     # Use an array of SVectors (chunks of `nvars` are contiguous in memory) to speed up flux differencing
-    # The result is then transferred to rhs_local_threaded::StructArray{<:SVector} before
-    # projecting it and storing it into `du`.
-    fluxdiff_local_threaded = [zeros(SVector{nvars, uEltype}, num_quad_points_total)
-                               for _ in 1:Threads.maxthreadid()]
+    # The result is then transferred to `rhs_local`, a thread-local element of
+    # `rhs_local_threaded::StructArray{<:SVector}` before projecting it and storing it into `du`.
+    du_local_threaded = [zeros(SVector{nvars, uEltype}, num_quad_points_total)
+                         for _ in 1:Threads.maxthreadid()]
     rhs_local_threaded = [allocate_nested_array(uEltype, nvars,
                                                 (num_quad_points_total,), dg)
                           for _ in 1:Threads.maxthreadid()]
@@ -374,7 +175,7 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiff, RealT,
             invJ = inv.(J), dxidxhatj = interpolated_geometric_terms,
             entropy_var_values, projected_entropy_var_values,
             entropy_projected_u_values,
-            solution_container, fluxdiff_local_threaded, rhs_local_threaded)
+            solution_container, du_local_threaded, rhs_local_threaded)
 end
 
 # TODO: DGMulti. Address hard-coding of `entropy2cons!` and `cons2entropy!` for this function.
@@ -450,111 +251,170 @@ end
 
 function calc_volume_integral!(du, u, mesh::DGMultiMesh,
                                have_nonconservative_terms, equations,
-                               volume_integral, dg::DGMultiFluxDiff, cache)
+                               volume_integral, dg::DGMultiFluxDiff, cache,
+                               alpha = true)
     # No interpolation performed for general volume integral.
     # Instead, an element-wise entropy projection (`entropy_projection!`) is performed before, see
     # `rhs!` for `DGMultiFluxDiff`, which populates `entropy_projected_u_values`
     @threaded for element in eachelement(mesh, dg, cache)
         volume_integral_kernel!(du, u, element, mesh,
                                 have_nonconservative_terms, equations,
-                                volume_integral, dg, cache)
+                                volume_integral, dg, cache, alpha)
     end
 
     return nothing
 end
 
-# Computes flux differencing contribution from each Cartesian direction over a single element.
-# For dense operators, we do not use sum factorization.
-@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index,
+# Computes flux differencing contribution over a single element by looping over node pairs (i, j).
+# The physical normal direction for each pair is n_ij = geometric_matrix * ref_entries,
+# where ref_entries[d] = Qrst_skew[d][i,j].
+# This fuses the NDIMS per-dimension flux
+# evaluations of the old dimension-by-dimension loop into a single evaluation per pair.
+# Essentially, instead of calculating 
+#   volume_flux(u_i, u_j, 1, equations) * Qx[i, j] + volume_flux(u_i, u_j, 2, equations) * Qy[i, j] + ...
+# where Qx[i, j] = dr/dx * Qr[i, j] + ds/dx * Qs[i, j], we can expand out and evaluate
+#   volume_flux(u_i, u_j, [dr/dx, dr/dy] * Qr[i, j], equations) + 
+#   volume_flux(u_i, u_j, [ds/dx, ds/dy] * Qs[i, j], equations)
+# which is slightly faster. 
+# 
+# For dense operators (SBP on Line/Tri/Tet), we do not use this sum factorization trick.
+@inline function local_flux_differencing!(du_local, u_local, element_index,
                                           have_nonconservative_terms::False,
                                           volume_flux,
                                           has_sparse_operators::False, mesh,
                                           equations, dg, cache)
-    for dim in eachdim(mesh)
-        Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg, cache)
-        # True() indicates the volume flux is symmetric
-        hadamard_sum!(fluxdiff_local, Qi_skew,
-                      True(), volume_flux,
-                      dim, u_local, equations)
+    @unpack Qrst_skew = cache
+    NDIMS = ndims(mesh)
+    row_ids = axes(first(Qrst_skew), 1)
+    geometric_matrix = get_contravariant_matrix(element_index, mesh, cache)
+    for i in row_ids
+        u_i = u_local[i]
+        for j in row_ids
+            # We use the symmetry of the volume flux and the anti-symmetry
+            # of the derivative operator to save half of the volume flux
+            # computations.
+            if j > i
+                u_j = u_local[j]
+                ref_entries = SVector(ntuple(d -> Qrst_skew[d][i, j], Val(NDIMS)))
+                normal_direction = geometric_matrix * ref_entries
+                AF_ij = 2 * volume_flux(u_i, u_j, normal_direction, equations)
+                du_local[i] = du_local[i] + AF_ij
+                du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry
+            end
+        end
     end
 end
 
-@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index,
+@inline function local_flux_differencing!(du_local, u_local, element_index,
                                           have_nonconservative_terms::True, volume_flux,
                                           has_sparse_operators::False, mesh,
                                           equations, dg, cache)
+    @unpack Qrst_skew = cache
+    NDIMS = ndims(mesh)
     flux_conservative, flux_nonconservative = volume_flux
-    for dim in eachdim(mesh)
-        Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg, cache)
-        # True() indicates the flux is symmetric.
-        hadamard_sum!(fluxdiff_local, Qi_skew,
-                      True(), flux_conservative,
-                      dim, u_local, equations)
-
-        # The final argument .5 scales the operator by 1/2 for the nonconservative terms.
-        half_Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg,
-                                                      cache, 0.5)
-        # False() indicates the flux is non-symmetric.
-        hadamard_sum!(fluxdiff_local, half_Qi_skew,
-                      False(), flux_nonconservative,
-                      dim, u_local, equations)
+    row_ids = axes(first(Qrst_skew), 1)
+    geometric_matrix = get_contravariant_matrix(element_index, mesh, cache)
+    for i in row_ids
+        u_i = u_local[i]
+        for j in row_ids
+            ref_entries = SVector(ntuple(d -> Qrst_skew[d][i, j], Val(NDIMS)))
+            normal_direction = geometric_matrix * ref_entries
+            # We use the symmetry of the volume flux and the anti-symmetry
+            # of the derivative operator to save half of the volume flux
+            # computations.
+            if j > i
+                u_j = u_local[j]
+                AF_ij = 2 * flux_conservative(u_i, u_j, normal_direction, equations)
+                du_local[i] = du_local[i] + AF_ij
+                du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry
+            end
+            # Non-conservative terms use the full (non-symmetric) loop.
+            # The 0.5f0 factor on the normal direction is necessary for the nonconservative 
+            # fluxes based on the interpretation of global SBP operators.  
+            # See also `calc_interface_flux!` with `have_nonconservative_terms::True` 
+            # in src/solvers/dgsem_tree/dg_1d.jl
+            f_nc = flux_nonconservative(u_i, u_local[j], 0.5f0 * normal_direction,
+                                        equations)
+            du_local[i] = du_local[i] + 2 * f_nc
+        end
     end
 end
 
 # When the operators are sparse, we use the sum-factorization approach to
-# computing flux differencing.
-@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index,
+# computing flux differencing. Each dimension has its own sparse operator with
+# its own sparsity pattern (e.g., tensor-product structure on Quad/Hex elements),
+# so we loop per-dimension. For each nonzero entry A[i,j] we evaluate the flux once
+# and exploit skew-symmetry to accumulate both the (i,j) and (j,i) contributions.
+@inline function local_flux_differencing!(du_local, u_local, element_index,
                                           have_nonconservative_terms::False,
                                           volume_flux,
                                           has_sparse_operators::True, mesh,
                                           equations, dg, cache)
     @unpack Qrst_skew = cache
     for dim in eachdim(mesh)
-        # There are two ways to write this flux differencing discretization on affine meshes.
-        #
-        # 1. Use numerical fluxes in Cartesian directions and sum up the discrete derivative
-        #    operators per coordinate direction accordingly.
-        # 2. Use discrete derivative operators per coordinate direction and corresponding
-        #    numerical fluxes in arbitrary (non-Cartesian) space directions.
-        #
-        # The first option makes it necessary to sum up the individual sparsity
-        # patterns of each reference coordinate direction. On tensor-product
-        # elements such as `Quad()` or `Hex()` elements, this increases the number of
-        # potentially expensive numerical flux evaluations by a factor of `ndims(mesh)`.
-        # Thus, we use the second option below (which basically corresponds to the
-        # well-known sum factorization on tensor product elements).
-        # Note that there is basically no difference for dense derivative operators.
-        normal_direction = get_contravariant_vector(element_index, dim, mesh, cache)
+        normal_directions = get_contravariant_vector(element_index, dim, mesh, cache)
         Q_skew = Qrst_skew[dim]
-
-        # True() indicates the flux is symmetric
-        hadamard_sum!(fluxdiff_local, Q_skew,
-                      True(), volume_flux,
-                      normal_direction, u_local, equations)
+        A_base, row_ids, rows, vals = sparse_operator_data(Q_skew)
+        for i in row_ids
+            u_i = u_local[i]
+            du_i = du_local[i]
+            for id in nzrange(A_base, i)
+                j = rows[id]
+                # This routine computes only the upper-triangular part of the hadamard sum (A .* F).
+                # We avoid computing the lower-triangular part, and instead accumulate those contributions
+                # while computing the upper-triangular part (using the fact that A is skew-symmetric and F
+                # is symmetric).
+                if j > i
+                    u_j = u_local[j]
+                    A_ij = vals[id]
+                    normal_direction_ij = get_normal_direction(normal_directions, i, j)
+                    AF_ij = 2 * A_ij *
+                            volume_flux(u_i, u_j, normal_direction_ij, equations)
+                    du_i = du_i + AF_ij
+                    du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry
+                end
+            end
+            du_local[i] = du_i
+        end
     end
 end
 
-@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index,
+@inline function local_flux_differencing!(du_local, u_local, element_index,
                                           have_nonconservative_terms::True, volume_flux,
                                           has_sparse_operators::True, mesh,
                                           equations, dg, cache)
     @unpack Qrst_skew = cache
     flux_conservative, flux_nonconservative = volume_flux
     for dim in eachdim(mesh)
-        normal_direction = get_contravariant_vector(element_index, dim, mesh, cache)
+        normal_directions = get_contravariant_vector(element_index, dim, mesh, cache)
         Q_skew = Qrst_skew[dim]
-
-        # True() indicates the flux is symmetric
-        hadamard_sum!(fluxdiff_local, Q_skew,
-                      True(), flux_conservative,
-                      normal_direction, u_local, equations)
-
-        # We scale the operator by 1/2 for the nonconservative terms.
-        half_Q_skew = LazyMatrixLinearCombo((Q_skew,), (0.5,))
-        # False() indicates the flux is non-symmetric
-        hadamard_sum!(fluxdiff_local, half_Q_skew,
-                      False(), flux_nonconservative,
-                      normal_direction, u_local, equations)
+        A_base, row_ids, rows, vals = sparse_operator_data(Q_skew)
+        for i in row_ids
+            u_i = u_local[i]
+            du_i = du_local[i]
+            for id in nzrange(A_base, i)
+                j = rows[id]
+                A_ij = vals[id]
+                u_j = u_local[j]
+                normal_direction_ij = get_normal_direction(normal_directions, i, j)
+                # Conservative part: exploit skew-symmetry (calculate upper triangular part only).
+                if j > i
+                    AF_ij = 2 * A_ij *
+                            flux_conservative(u_i, u_j, normal_direction_ij, equations)
+                    du_i = du_i + AF_ij
+                    du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry
+                end
+                # Non-conservative terms use the full (non-symmetric) loop.
+                # The 0.5f0 factor on the normal direction is necessary for the nonconservative 
+                # fluxes based on the interpretation of global SBP operators.  
+                # See also `calc_interface_flux!` with `have_nonconservative_terms::True` 
+                # in src/solvers/dgsem_tree/dg_1d.jl
+                f_nc = flux_nonconservative(u_i, u_j, 0.5f0 * normal_direction_ij,
+                                            equations)
+                du_i = du_i + 2 * A_ij * f_nc
+            end
+            du_local[i] = du_i
+        end
     end
 end
 
@@ -564,27 +424,27 @@ end
 @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh,
                                          have_nonconservative_terms, equations,
                                          volume_integral::VolumeIntegralFluxDifferencing,
-                                         dg::DGMultiFluxDiff, cache)
+                                         dg::DGMultiFluxDiff, cache, alpha = true)
     @unpack entropy_projected_u_values, Ph = cache
-    @unpack fluxdiff_local_threaded, rhs_local_threaded = cache
+    @unpack du_local_threaded, rhs_local_threaded = cache
 
-    fluxdiff_local = fluxdiff_local_threaded[Threads.threadid()]
-    fill!(fluxdiff_local, zero(eltype(fluxdiff_local)))
+    du_local = du_local_threaded[Threads.threadid()]
+    fill!(du_local, zero(eltype(du_local)))
     u_local = view(entropy_projected_u_values, :, element)
 
-    local_flux_differencing!(fluxdiff_local, u_local, element,
+    local_flux_differencing!(du_local, u_local, element,
                              have_nonconservative_terms,
                              volume_integral.volume_flux,
                              has_sparse_operators(dg),
                              mesh, equations, dg, cache)
 
-    # convert fluxdiff_local::Vector{<:SVector} to StructArray{<:SVector} for faster
+    # convert du_local::Vector{<:SVector} to StructArray{<:SVector} for faster
     # apply_to_each_field performance.
     rhs_local = rhs_local_threaded[Threads.threadid()]
-    for i in Base.OneTo(length(fluxdiff_local))
-        rhs_local[i] = fluxdiff_local[i]
+    for i in Base.OneTo(length(du_local))
+        rhs_local[i] = du_local[i]
     end
-    apply_to_each_field(mul_by_accum!(Ph), view(du, :, element), rhs_local)
+    apply_to_each_field(mul_by_accum!(Ph, alpha), view(du, :, element), rhs_local)
 
     return nothing
 end
@@ -592,21 +452,22 @@ end
 @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh,
                                          have_nonconservative_terms, equations,
                                          volume_integral::VolumeIntegralFluxDifferencing,
-                                         dg::DGMultiFluxDiffSBP, cache)
-    @unpack fluxdiff_local_threaded, inv_wq = cache
+                                         dg::DGMultiFluxDiffSBP, cache,
+                                         alpha = true)
+    @unpack du_local_threaded, inv_wq = cache
 
-    fluxdiff_local = fluxdiff_local_threaded[Threads.threadid()]
-    fill!(fluxdiff_local, zero(eltype(fluxdiff_local)))
+    du_local = du_local_threaded[Threads.threadid()]
+    fill!(du_local, zero(eltype(du_local)))
     u_local = view(u, :, element)
 
-    local_flux_differencing!(fluxdiff_local, u_local, element,
+    local_flux_differencing!(du_local, u_local, element,
                              have_nonconservative_terms,
                              volume_integral.volume_flux,
                              has_sparse_operators(dg),
                              mesh, equations, dg, cache)
 
     for i in each_quad_node(mesh, dg, cache)
-        du[i, element] = du[i, element] + fluxdiff_local[i] * inv_wq[i]
+        du[i, element] = du[i, element] + alpha * du_local[i] * inv_wq[i]
     end
 
     return nothing
diff --git a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl
index 48d75938d86..1b8a257d0a2 100644
--- a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl
+++ b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl
@@ -490,7 +490,7 @@ function entropy_projection!(cache, u, mesh::DGMultiMesh, equations,
     return nothing
 end
 
-# Assumes cache.flux_face_values is already computed.
+# Assumes cache.solution_container.flux_face_values is already computed.
 # Enables tensor product evaluation of `LIFT isa TensorProductGaussFaceOperator`.
 function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations,
                                 surface_integral::SurfaceIntegralWeakForm,
@@ -512,29 +512,6 @@ function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations,
     return nothing
 end
 
-@inline function flux_differencing_kernel!(du, u, element, mesh::DGMultiMesh,
-                                           have_nonconservative_terms, equations,
-                                           volume_flux, dg::DGMultiFluxDiff{<:GaussSBP},
-                                           cache, alpha = true)
-    fluxdiff_local = cache.fluxdiff_local_threaded[Threads.threadid()]
-    fill!(fluxdiff_local, zero(eltype(fluxdiff_local)))
-    u_local = view(cache.entropy_projected_u_values, :, element)
-
-    local_flux_differencing!(fluxdiff_local, u_local, element,
-                             have_nonconservative_terms,
-                             volume_flux, has_sparse_operators(dg),
-                             mesh, equations, dg, cache)
-
-    # convert `fluxdiff_local::Vector{<:SVector}` to `rhs_local::StructArray{<:SVector}`
-    # for faster performance when using `apply_to_each_field`.
-    rhs_local = cache.rhs_local_threaded[Threads.threadid()]
-    for i in Base.OneTo(length(fluxdiff_local))
-        rhs_local[i] = fluxdiff_local[i]
-    end
-
-    return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha)
-end
-
 function project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh::DGMultiMesh,
                                      dg::DGMulti, cache, alpha = true)
 
@@ -564,14 +541,26 @@ end
 function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh,
                                  have_nonconservative_terms, equations,
                                  volume_integral::VolumeIntegralFluxDifferencing,
-                                 dg::DGMultiFluxDiff{<:GaussSBP}, cache)
+                                 dg::DGMultiFluxDiff{<:GaussSBP}, cache, alpha = true)
     (; volume_flux) = volume_integral
 
-    flux_differencing_kernel!(du, u, element, mesh,
-                              have_nonconservative_terms, equations,
-                              volume_flux, dg, cache)
+    du_local = cache.du_local_threaded[Threads.threadid()]
+    fill!(du_local, zero(eltype(du_local)))
+    u_local = view(cache.entropy_projected_u_values, :, element)
 
-    return nothing
+    local_flux_differencing!(du_local, u_local, element,
+                             have_nonconservative_terms,
+                             volume_flux, has_sparse_operators(dg),
+                             mesh, equations, dg, cache)
+
+    # convert `du_local::Vector{<:SVector}` to `rhs_local::StructArray{<:SVector}`
+    # for faster performance when using `apply_to_each_field`.
+    rhs_local = cache.rhs_local_threaded[Threads.threadid()]
+    for i in Base.OneTo(length(du_local))
+        rhs_local[i] = du_local[i]
+    end
+
+    return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha)
 end
 
 # interpolate back to Lobatto nodes after applying the inverse Jacobian at Gauss points
diff --git a/src/solvers/dgmulti/sbp.jl b/src/solvers/dgmulti/sbp.jl
index bb0aea56b28..2d75b18049f 100644
--- a/src/solvers/dgmulti/sbp.jl
+++ b/src/solvers/dgmulti/sbp.jl
@@ -207,11 +207,7 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh,
             # This would have to be changed if `have_nonconservative_terms = False()`
             # because then `volume_flux` is non-symmetric.
             A = dg.basis.Drst[dim]
-
-            A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-            row_ids = axes(A, 2)
-            rows = rowvals(A_base)
-            vals = nonzeros(A_base)
+            A_base, row_ids, rows, vals = sparse_operator_data(A)
 
             @threaded for i in row_ids
                 u_i = u[i]
@@ -233,19 +229,35 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh,
 
     else # if using two threads or fewer
 
-        # Calls `hadamard_sum!``, which uses symmetry to reduce flux evaluations. Symmetry
-        # is expected to yield about a 2x speedup, so we default to the symmetry-exploiting
-        # volume integral unless we have >2 threads (which should yield >2 speedup).
+        # Exploit skew-symmetry to halve the number of flux evaluations (≈2x speedup).
+        # A = Drst[dim] is skew-symmetric for periodic FD-SBP on uniform grids, so
+        # A[i,j] = -A[j,i]. The stored CSC value vals[id] = A[j,i] = -A[i,j], hence
+        # we use -vals[id] to recover A[i,j], matching the multithreaded branch above.
         for dim in eachdim(mesh)
             normal_direction = get_contravariant_vector(1, dim, mesh, cache)
 
             A = dg.basis.Drst[dim]
+            A_base, row_ids, rows, vals = sparse_operator_data(A)
 
-            # since have_nonconservative_terms::False,
-            # the volume flux is symmetric.
-            flux_is_symmetric = True()
-            hadamard_sum!(du, A, flux_is_symmetric, volume_flux,
-                          normal_direction, u, equations)
+            for i in row_ids
+                u_i = u[i]
+                du_i = du[i]
+                for id in nzrange(A_base, i)
+                    j = rows[id]
+                    # We use the symmetry of the volume flux and the anti-symmetry
+                    # of the derivative operator to save half of the volume flux
+                    # computations.
+                    if j > i
+                        A_ij = -vals[id]  # A[j,i] stored; skew-symmetry: -A[j,i] = A[i,j]
+                        u_j = u[j]
+                        AF_ij = 2 * A_ij *
+                                volume_flux(u_i, u_j, normal_direction, equations)
+                        du_i = du_i + AF_ij
+                        du[j] = du[j] - AF_ij # Due to skew-symmetry
+                    end
+                end
+                du[i] = du_i
+            end
         end
     end
 
diff --git a/src/solvers/dgmulti/shock_capturing.jl b/src/solvers/dgmulti/shock_capturing.jl
index d1c9cc3e9d2..b018c7f5894 100644
--- a/src/solvers/dgmulti/shock_capturing.jl
+++ b/src/solvers/dgmulti/shock_capturing.jl
@@ -7,20 +7,7 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations,
     @assert volume_integral_blend_high_order isa VolumeIntegralFluxDifferencing "DGMulti is currently only compatible with `VolumeIntegralFluxDifferencing` as `volume_integral_blend_high_order`"
     # `volume_integral_blend_low_order` limited to finite-volume on Gauss-node subcells
 
-    # build element to element (element_to_element_connectivity) connectivity for smoothing of
-    # shock capturing parameters.
-    face_to_face_connectivity = mesh.md.FToF # num_faces x num_elements matrix
-    element_to_element_connectivity = similar(face_to_face_connectivity)
-    for e in axes(face_to_face_connectivity, 2)
-        for f in axes(face_to_face_connectivity, 1)
-            neighbor_face_index = face_to_face_connectivity[f, e]
-
-            # reverse-engineer element index from face. Assumes all elements
-            # have the same number of faces.
-            neighbor_element_index = ((neighbor_face_index - 1) ÷ dg.basis.num_faces) + 1
-            element_to_element_connectivity[f, e] = neighbor_element_index
-        end
-    end
+    element_to_element_connectivity = build_element_to_element_connectivity(mesh, dg)
 
     # create sparse hybridized operators for low order scheme
     Qrst, E = StartUpDG.sparse_low_order_SBP_operators(dg.basis)
@@ -196,10 +183,10 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh,
                                     dg, cache, 1 - alpha_element)
 
             # Calculate "FV" low order volume integral contribution
-            low_order_flux_differencing_kernel(du, u, element, mesh,
-                                               have_nonconservative_terms, equations,
-                                               volume_integral_blend_low_order,
-                                               dg, cache, alpha_element)
+            volume_integral_kernel!(du, u, element, mesh,
+                                    have_nonconservative_terms, equations,
+                                    volume_integral_blend_low_order,
+                                    dg, cache, alpha_element)
         end
     end
 
@@ -261,57 +248,30 @@ function get_avg_contravariant_matrix(i, j, element, mesh::DGMultiMesh, cache)
             get_contravariant_matrix(j, element, mesh, cache))
 end
 
-# computes an algebraic low order method with internal dissipation.
-# This method is for affine/Cartesian meshes
-function low_order_flux_differencing_kernel(du, u, element,
-                                            mesh::DGMultiMesh,
-                                            have_nonconservative_terms::False, equations,
-                                            volume_integral,
-                                            dg::DGMultiFluxDiff{<:GaussSBP},
-                                            cache, alpha = true)
-    (; volume_flux_fv) = volume_integral
-
-    # accumulates output from flux differencing
-    rhs_local = cache.rhs_local_threaded[Threads.threadid()]
-    fill!(rhs_local, zero(eltype(rhs_local)))
-
-    u_local = view(cache.entropy_projected_u_values, :, element)
-
-    # constant over each element
-    geometric_matrix = get_contravariant_matrix(element, mesh, cache)
-
-    (; sparsity_pattern) = cache
-    A_base = parent(sparsity_pattern) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-    row_ids, rows = axes(sparsity_pattern, 2), rowvals(A_base)
-    for i in row_ids
-        u_i = u_local[i]
-        du_i = zero(u_i)
-        for id in nzrange(A_base, i)
-            j = rows[id]
-            u_j = u_local[j]
-
-            # compute (Q_1[i,j], Q_2[i,j], ...) where Q_i = ∑_j dxidxhatj * Q̂_j
-            reference_operator_entries = get_sparse_operator_entries(i, j, mesh, cache)
-            normal_direction_ij = geometric_matrix * reference_operator_entries
-
-            # note that we do not need to normalize `normal_direction_ij` since
-            # it is typically normalized within the flux computation.
-            f_ij = volume_flux_fv(u_i, u_j, normal_direction_ij, equations)
-            du_i = du_i + 2 * f_ij
-        end
-        rhs_local[i] = du_i
-    end
+# On affine meshes, the geometric matrix is constant over the element, so we compute it
+# once and reuse it for all node pairs (i, j). The compiler is expected to hoist this
+# out of the inner loop after inlining.
+@inline function get_low_order_geometric_matrix(i, j, element,
+                                                mesh::DGMultiMesh{NDIMS, <:Affine},
+                                                cache) where {NDIMS}
+    return get_contravariant_matrix(element, mesh, cache)
+end
 
-    # TODO: factor this out to avoid calling it twice during calc_volume_integral!
-    return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha)
+# On non-affine meshes, we use the average of the geometric matrices at nodes i and j
+# for provably entropy-stable de-aliasing of the geometric terms.
+@inline function get_low_order_geometric_matrix(i, j, element,
+                                                mesh::DGMultiMesh,
+                                                cache)
+    return get_avg_contravariant_matrix(i, j, element, mesh, cache)
 end
 
-function low_order_flux_differencing_kernel(du, u, element,
-                                            mesh::DGMultiMesh{NDIMS, <:NonAffine},
-                                            have_nonconservative_terms::False, equations,
-                                            volume_integral,
-                                            dg::DGMultiFluxDiff{<:GaussSBP},
-                                            cache, alpha = true) where {NDIMS}
+# Calculates the volume integral corresponding to an algebraic low order method.
+# This is used, for example, in shock capturing.
+function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh,
+                                 have_nonconservative_terms::False, equations,
+                                 volume_integral::VolumeIntegralPureLGLFiniteVolume,
+                                 dg::DGMultiFluxDiff{<:GaussSBP}, cache,
+                                 alpha = true)
     (; volume_flux_fv) = volume_integral
 
     # accumulates output from flux differencing
@@ -321,8 +281,7 @@ function low_order_flux_differencing_kernel(du, u, element,
     u_local = view(cache.entropy_projected_u_values, :, element)
 
     (; sparsity_pattern) = cache
-    A_base = parent(sparsity_pattern) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR
-    row_ids, rows = axes(sparsity_pattern, 2), rowvals(A_base)
+    A_base, row_ids, rows, _ = sparse_operator_data(sparsity_pattern)
     for i in row_ids
         u_i = u_local[i]
         du_i = zero(u_i)
@@ -331,7 +290,7 @@ function low_order_flux_differencing_kernel(du, u, element,
             u_j = u_local[j]
 
             # compute (Q_1[i,j], Q_2[i,j], ...) where Q_i = ∑_j dxidxhatj * Q̂_j
-            geometric_matrix = get_avg_contravariant_matrix(i, j, element, mesh, cache)
+            geometric_matrix = get_low_order_geometric_matrix(i, j, element, mesh, cache)
             reference_operator_entries = get_sparse_operator_entries(i, j, mesh, cache)
             normal_direction_ij = geometric_matrix * reference_operator_entries
 
diff --git a/src/solvers/dgmulti/types.jl b/src/solvers/dgmulti/types.jl
index 627ef809dfe..1b408ad57f6 100644
--- a/src/solvers/dgmulti/types.jl
+++ b/src/solvers/dgmulti/types.jl
@@ -341,30 +341,6 @@ function DGMultiMesh(dg::DGMulti{NDIMS}, filename::String;
     return DGMultiMesh(dg, GeometricTermsType(Curved(), dg), md, boundary_faces)
 end
 
-# Matrix type for lazy construction of physical differentiation matrices
-# Constructs a lazy linear combination of B = ∑_i coeffs[i] * A[i]
-struct LazyMatrixLinearCombo{Tcoeffs, N, Tv, TA <: AbstractMatrix{Tv}} <:
-       AbstractMatrix{Tv}
-    matrices::NTuple{N, TA}
-    coeffs::NTuple{N, Tcoeffs}
-    function LazyMatrixLinearCombo(matrices, coeffs)
-        @assert all(matrix -> size(matrix) == size(first(matrices)), matrices)
-        return new{typeof(first(coeffs)), length(matrices), eltype(first(matrices)),
-                   typeof(first(matrices))}(matrices, coeffs)
-    end
-end
-Base.eltype(A::LazyMatrixLinearCombo) = eltype(first(A.matrices))
-Base.IndexStyle(A::LazyMatrixLinearCombo) = IndexCartesian()
-Base.size(A::LazyMatrixLinearCombo) = size(first(A.matrices))
-
-@inline function Base.getindex(A::LazyMatrixLinearCombo{<:Real, N}, i, j) where {N}
-    val = zero(eltype(A))
-    for k in Base.OneTo(N)
-        val = val + A.coeffs[k] * getindex(A.matrices[k], i, j)
-    end
-    return val
-end
-
 # `SimpleKronecker` lazily stores a Kronecker product `kron(ntuple(A, NDIMS)...)`.
 # This object also allocates some temporary storage to enable the fast computation
 # of matrix-vector products.