From db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Tue, 13 Aug 2024 23:30:20 +0100 Subject: [PATCH] Add CeedBasisApplyAdd (#1644) * basis - add CeedBasisApplyAdd + CPU impl * basis - add ref GPU ApplyAdd * basis - add shared GPU ApplyAdd * basis - add MAGMA ApplyAdd * basis - add CeedBasisApplyAddAtPoints + default impl * basis - add GPU ApplyAddAtPoints * tidy - add extra assert to fix clang-tidy * Apply suggestions from code review style - consistently use indexing over pointer arithmetic Co-authored-by: Zach Atkins * style - more pointer fixes --------- Co-authored-by: Zach Atkins --- backends/cuda-ref/ceed-cuda-ref-basis.c | 67 +++++-- backends/cuda-shared/ceed-cuda-shared-basis.c | 63 ++++-- backends/cuda-shared/ceed-cuda-shared.h | 2 + backends/hip-ref/ceed-hip-ref-basis.c | 66 +++++-- backends/hip-shared/ceed-hip-shared-basis.c | 64 ++++-- backends/hip-shared/ceed-hip-shared.h | 2 + backends/magma/ceed-magma-basis.c | 59 +++++- backends/magma/ceed-magma.h | 4 + backends/ref/ceed-ref-basis.c | 49 +++-- include/ceed-impl.h | 2 + include/ceed/ceed.h | 3 + .../cuda/cuda-ref-basis-nontensor-templates.h | 4 +- .../cuda/cuda-ref-basis-tensor-at-points.h | 26 +-- .../jit-source/cuda/cuda-ref-basis-tensor.h | 23 +-- .../cuda/cuda-ref-operator-assemble.h | 2 +- .../cuda/cuda-ref-restriction-at-points.h | 2 +- .../cuda/cuda-ref-restriction-curl-oriented.h | 4 +- .../cuda/cuda-ref-restriction-offset.h | 2 +- .../cuda/cuda-ref-restriction-oriented.h | 2 +- .../cuda-shared-basis-read-write-templates.h | 41 ++++ .../cuda/cuda-shared-basis-tensor-templates.h | 80 ++++---- .../cuda/cuda-shared-basis-tensor.h | 68 +++++++ .../hip/hip-ref-basis-nontensor-templates.h | 6 +- .../hip/hip-ref-basis-tensor-at-points.h | 30 +-- .../jit-source/hip/hip-ref-basis-tensor.h | 23 +-- .../hip/hip-ref-operator-assemble.h | 2 +- .../hip/hip-ref-restriction-at-points.h | 2 +- .../hip/hip-ref-restriction-curl-oriented.h | 4 +- 
.../hip/hip-ref-restriction-offset.h | 2 +- .../hip/hip-ref-restriction-oriented.h | 2 +- .../hip-shared-basis-read-write-templates.h | 41 ++++ .../hip/hip-shared-basis-tensor-templates.h | 80 ++++---- .../jit-source/hip/hip-shared-basis-tensor.h | 81 ++++++++ .../jit-source/magma/magma-basis-grad-1d.h | 45 +++++ .../jit-source/magma/magma-basis-grad-2d.h | 51 +++++ .../jit-source/magma/magma-basis-grad-3d.h | 58 ++++++ .../jit-source/magma/magma-basis-interp-1d.h | 45 +++++ .../jit-source/magma/magma-basis-interp-2d.h | 41 ++++ .../jit-source/magma/magma-basis-interp-3d.h | 41 ++++ .../magma-basis-interp-deriv-nontensor.h | 106 ++++++++++ .../jit-source/magma/magma-common-nontensor.h | 19 ++ .../jit-source/magma/magma-common-tensor.h | 46 +++++ interface/ceed-basis.c | 186 ++++++++++++++++-- interface/ceed.c | 2 + tests/README.md | 3 +- tests/t360-basis.c | 56 ++++++ tests/t361-basis.c | 116 +++++++++++ tests/t362-basis.c | 59 ++++++ tests/t363-basis.c | 54 +++++ tests/t364-basis.c | 98 +++++++++ tests/t365-basis.c | 123 ++++++++++++ 51 files changed, 1815 insertions(+), 242 deletions(-) create mode 100644 tests/t360-basis.c create mode 100644 tests/t361-basis.c create mode 100644 tests/t362-basis.c create mode 100644 tests/t363-basis.c create mode 100644 tests/t364-basis.c create mode 100644 tests/t365-basis.c diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index 91958b6b25..8cf285cbc8 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -18,7 +18,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -33,10 +34,11 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -83,11 +85,23 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { 
+static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -158,10 +172,11 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -200,11 +215,23 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + 
//------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -222,10 +249,11 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -291,6 +319,18 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + 
CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -374,7 +414,9 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); return CEED_ERROR_SUCCESS; } @@ -434,6 +476,7 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } @@ -493,6 +536,7 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } @@ -552,6 +596,7 @@ int 
CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 118d156a84..4f0901484d 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -27,8 +27,8 @@ int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Cuda *ceed_Cuda; CeedInt dim, num_comp; @@ -45,7 +45,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch 
(eval_mode) { @@ -66,7 +67,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -78,8 +80,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -89,8 +91,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -116,7 +118,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -128,7 +131,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -138,7 +142,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -186,11 +191,23 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, 
CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -261,10 +278,11 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -303,6 +321,18 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + 
CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ @@ -374,8 +404,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -384,6 +416,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index 40bbaed6a0..96800a0a86 
100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -14,8 +14,10 @@ typedef struct { CUmodule module; CUfunction Interp; CUfunction InterpTranspose; + CUfunction InterpTransposeAdd; CUfunction Grad; CUfunction GradTranspose; + CUfunction GradTransposeAdd; CUfunction Weight; CUmodule moduleAtPoints; CeedInt num_points; diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index fd1a9fabde..78e27a1a5f 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -17,7 +17,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -32,10 +33,11 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -82,11 +84,22 @@ int CeedBasisApply_Hip(CeedBasis basis, const 
CeedInt num_elem, CeedTransposeMod return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -157,10 +170,11 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for 
transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -199,11 +213,23 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -221,10 +247,11 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - 
CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -290,6 +317,18 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -373,7 +412,9 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); return CEED_ERROR_SUCCESS; } @@ -433,6 +474,7 @@ int 
CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } @@ -492,6 +534,7 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } @@ -551,6 +594,7 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index ff41f9efe6..bda080ed2d 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -87,8 +87,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Hip *ceed_Hip; CeedInt dim, num_comp; @@ -105,7 +105,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { @@ -125,7 +126,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -136,8 +138,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -147,8 +149,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -174,7 +176,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -185,7 +188,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -195,7 +199,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -245,11 +250,23 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee return CEED_ERROR_SUCCESS; } +int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode 
t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -320,10 +337,11 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -362,6 +380,18 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, 
true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ @@ -438,8 +468,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, has_collocated_grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -448,7 +480,9 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index 8bc9d041a2..fe3384f55d 100644 --- 
a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -14,8 +14,10 @@ typedef struct { hipModule_t module; hipFunction_t Interp; hipFunction_t InterpTranspose; + hipFunction_t InterpTransposeAdd; hipFunction_t Grad; hipFunction_t GradTranspose; + hipFunction_t GradTransposeAdd; hipFunction_t Weight; hipModule_t moduleAtPoints; CeedInt num_points; diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index de18a1a2fc..71a86d5b8d 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -26,7 +26,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q; @@ -52,7 +53,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (e_mode) { @@ -115,7 +117,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, 
&num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, grid, num_threads, num_t_col, + 1, shared_mem, args)); } else { CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args)); } @@ -192,7 +195,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, grid, num_threads, num_t_col, 1, + shared_mem, args)); } else { CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args)); } @@ -248,6 +252,16 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ @@ -259,8 +273,8 @@ int 
CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, + CeedVector u, CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt num_comp, num_nodes, num_qpts, P, Q, N; @@ -281,7 +295,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Compile kernels for N as needed CeedInt iN = 0; @@ -344,8 +359,10 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed impl->NB_deriv_t[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); + 
CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN])); if (!impl->Weight) { CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); CeedCallBackend(CeedFree(&weight_kernel_path)); @@ -388,7 +405,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { if (e_mode == CEED_EVAL_INTERP) { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->InterpTranspose[iN]; + Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN]; NB = impl->NB_interp_t[iN]; } else { Kernel = impl->Interp[iN]; @@ -396,7 +413,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } } else { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->DerivTranspose[iN]; + Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN]; NB = impl->NB_deriv_t[iN]; } else { Kernel = impl->Deriv[iN]; @@ -414,7 +431,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } else { for (CeedInt d = 0; d < q_comp; d++) { if (t_mode == CEED_TRANSPOSE) { - const CeedScalar beta = (d > 0) ? 1.0 : 0.0; + const CeedScalar beta = (apply_add || (d > 0)) ? 
1.0 : 0.0; magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue); } else { magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue); @@ -443,6 +460,18 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -559,22 +588,28 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const case 1: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", 
&impl->Weight)); break; case 2: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight)); break; case 3: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight)); break; } @@ -588,6 +623,7 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedBasisSetData(basis, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); return CEED_ERROR_SUCCESS; @@ -650,6 +686,7 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } @@ -711,6 +748,7 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } @@ -772,6 +810,7 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index aa60b37b40..22dd4264b6 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -47,8 +47,10 @@ typedef struct { CeedMagmaModule module; CeedMagmaFunction Interp; CeedMagmaFunction InterpTranspose; + CeedMagmaFunction InterpTransposeAdd; CeedMagmaFunction Grad; CeedMagmaFunction GradTranspose; + CeedMagmaFunction GradTransposeAdd; 
CeedMagmaFunction Weight; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; @@ -59,8 +61,10 @@ typedef struct { CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction InterpTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction DerivTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Weight; CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedInt NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c index b82e8bb278..121669012a 100644 --- a/backends/ref/ceed-ref-basis.c +++ b/backends/ref/ceed-ref-basis.c @@ -16,11 +16,11 @@ //------------------------------------------------------------------------------ // Basis Apply //------------------------------------------------------------------------------ -static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { +static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, + CeedVector V) { Ceed ceed; - bool is_tensor_basis; + bool is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE); CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; - const CeedInt add = (t_mode == CEED_TRANSPOSE); const CeedScalar *u; CeedScalar *v; CeedTensorContract contract; @@ -36,13 +36,15 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedBasisGetTensorContract(basis, &contract)); if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, 
CEED_MEM_HOST, &u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); - // Clear v if operating in transpose - if (t_mode == CEED_TRANSPOSE) { - const CeedInt v_size = num_elem * num_comp * num_nodes; + if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v)); + else CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); + + if (t_mode == CEED_TRANSPOSE && !apply_add) { + CeedSize len; - for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0; + CeedCallBackend(CeedVectorGetLength(V, &len)); + for (CeedInt i = 0; i < len; i++) v[i] = 0.0; } CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis)); @@ -101,8 +103,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // or Grad to quadrature points (Transpose) for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode, - add && (d > 0), - (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem), + (t_mode == CEED_TRANSPOSE) && (d > 0), + (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : &u[d * num_qpts * num_comp * num_elem]), (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp))); pre /= P; post *= Q; @@ -117,9 +119,10 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( - contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1), + contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? 
impl->collo_grad_1d : interp_1d), t_mode, + (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)), (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])), - (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); + (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem] : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); pre /= P; post *= Q; } @@ -133,8 +136,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0), - t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem, - t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem)); + t_mode == CEED_NOTRANSPOSE ? u : &u[d * num_comp * num_qpts * num_elem], + t_mode == CEED_TRANSPOSE ? v : &v[d * num_comp * num_qpts * num_elem])); pre /= P; post *= Q; } @@ -156,8 +159,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1), - (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]), - (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2]))); + (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : &u[p * num_comp * num_qpts * num_elem]) : tmp[d % 2]), + (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? 
v : &v[p * num_comp * num_qpts * num_elem]) : tmp[(d + 1) % 2]))); pre /= P; post *= Q; } @@ -249,6 +252,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, false, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, true, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis Destroy Tensor //------------------------------------------------------------------------------ @@ -297,6 +310,7 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref)); return CEED_ERROR_SUCCESS; } @@ -316,6 +330,7 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } @@ -334,6 +349,7 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } @@ -352,6 +368,7 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index de206e9e59..0b76071a08 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -186,7 +186,9 @@ struct CeedElemRestriction_private { struct CeedBasis_private { Ceed ceed; int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); + int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); + int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); int (*Destroy)(CeedBasis); int ref_count; bool is_tensor_basis; /* flag for tensor basis */ diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index f55aa9c6f3..d847525da8 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -305,8 +305,11 @@ CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_ CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy); CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream); CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v); CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed); CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis); CEED_EXTERN int CeedBasisGetDimension(CeedBasis basis, CeedInt *dim); diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h index 64b57d0d68..6b19ad448d 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h @@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h index fc65a10912..263d29338e 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt if (is_transpose) { for (CeedInt elem 
= blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt for (CeedInt d = 0; d < BASIS_DIM; d++) { // Update buffers used pre /= 1; - const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Update buffers used pre /= Q; const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? 
buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); @@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -235,7 +236,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; pre = 1; post = 1; @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const 
CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; pre = BASIS_NUM_QPTS; post = 1; diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h index 468ec978c0..4c8c2f447c 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h @@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; 
} @@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { CeedInt pre = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h index 318a07b163..9fc02a1c7a 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h @@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h index 83c4086ed0..87aeda2e3b 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h @@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices if (loc_node >= points_per_elem[elem]) continue; for (CeedInt comp = 0; comp < 
RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h index d317f42cc5..86a4b53545 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h @@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h index 0bd3dc0dd8..9492b31984 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h @@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h index d36f27277e..7c667922bf 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h @@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index b10ba108f8..56234c28e4 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -46,6 +46,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -82,6 +95,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -121,3 +147,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } } + 
+template +inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f2fde94139..56989f2b69 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca template inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -63,7 +63,7 @@ template inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -74,7 +74,7 @@ template inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], 
c_G, &r_V[comp]); } } @@ -85,7 +85,7 @@ template inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ -188,8 +188,8 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } @@ -201,8 +201,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); } } @@ -214,10 +214,10 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } @@ -229,10 +229,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; 
comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); } } @@ -423,9 +423,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } @@ -438,9 +438,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -453,15 +453,15 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], 
c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -474,15 +474,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -495,12 +495,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); 
ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -513,12 +513,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index d6039d3a33..c295362978 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -81,6 +81,39 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern 
__shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -154,6 +187,41 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda 
data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, c_B, c_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, c_B, c_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V); + else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h index 00b559ff10..0374d459d5 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h +++ 
b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h @@ -24,7 +24,7 @@ inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U for (CeedInt comp = 0; comp < NUM_COMP; comp++) { // Run with Q threads - U = d_U + elem * strides_elem_U + comp * strides_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U]; for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; for (CeedInt i = 0; i < P; i++) { const CeedScalar val = U[i]; @@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h index fc65a10912..22d81bc30a 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt for (CeedInt d = 
0; d < BASIS_DIM; d++) { // Update buffers used pre /= 1; - const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Update buffers used pre /= Q; const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); @@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -235,14 +236,14 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; pre = 1; post = 1; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { // Update buffers used pre /= 1; - const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; pre = BASIS_NUM_QPTS; post = 1; @@ -330,7 +332,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is // Update buffers used pre /= Q; const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h index 7d732f8e77..db509ac2a0 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h @@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; } @@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { CeedInt pre = is_transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h index 35789443b6..838dcfd4a5 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h @@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h index 614628a81f..f4cb95993b 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h @@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices if (loc_node >= points_per_elem[elem]) continue; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + 
comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h index 4d3e88ce27..76d9758828 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h @@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h index 26cd41ee92..65283b7193 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h @@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h index cb987fa8a7..f983a24fc0 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h @@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h index a6d945ac56..379d52d13b 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h @@ -56,6 +56,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -92,6 +105,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -131,3 +157,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt } } } + +template 
+inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h index 5e52d1c829..8dc50e4ed8 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h @@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal template inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -63,7 +63,7 @@ template inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -74,7 +74,7 @@ template inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ 
-85,7 +85,7 @@ template inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ -187,8 +187,8 @@ template inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } @@ -200,8 +200,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); } } @@ -213,10 +213,10 @@ inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__re CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } @@ -228,10 +228,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; 
comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); } } @@ -421,9 +421,9 @@ inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__ CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } @@ -436,9 +436,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedS CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -451,15 +451,15 @@ inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__re CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, 
&r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -472,15 +472,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedSca CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -493,12 +493,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedSc CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, 
r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -511,12 +511,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index 0a9a1f3cee..d052e53bf1 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -92,6 +92,44 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar 
*__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)]; + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -181,6 +219,49 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void 
GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + loadMatrix(d_grad_1d, s_G); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)]; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * 
BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h index dd21682225..cd6f8548fb 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradta_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, 
sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h index 23559716dc..b4e7e2981a 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -188,3 +188,54 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradta_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + 
read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // sum into V + sum_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h index c030f8e9e5..c8028be756 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -225,3 +225,61 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradta_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global 
memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P))); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // sum into V + sum_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h index ae8d082653..02f894ecce 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, 
MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpta_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h index a2a41a25ae..56c8081c83 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -144,3 +144,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, 
MAGMA_MAXTHREADS_2D)) __global__ + void magma_interpta_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U -- there is a sync at the end of this function + read_U_2d(dU, cstrdU, rU, sTmp, tx); + + // no sync needed here -- read_U_2d already syncs at the end + magma_interp_2d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_2d(dV, cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h index 50c7e4df4a..ac11e3f8df 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h @@ -172,3 +172,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpta_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, 
const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P)); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + read_U_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_3d(dV, cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h index f5e2df1e90..0614732f02 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h @@ -99,6 +99,52 @@ static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, Ce } } +//////////////////////////////////////////////////////////////////////////////// +template +static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * 
blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + CeedScalar rC[NB] = {0.0}; + + // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) + for (int d = 0; d < Q_COMP; d++) { + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // read B + if (id < nblocks) { + read_B_g2s_1D_nosync(tx, myn, dB, sB); + } + __syncthreads(); + + addmul_rAsBrC_1D_nosync(rA, sB, rC); + + dA += P * Q; + dB += Q * n; + + __syncthreads(); + } + + // sum into C + if (id < nblocks) { + sum_C_r2g_1D_nosync(tx, myn, rC, dC); + } +} + //////////////////////////////////////////////////////////////////////////////// template static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, @@ -171,6 +217,42 @@ static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, C write_C_r2g_1D_nosync(tx, myn, rC, dC); } +//////////////////////////////////////////////////////////////////////////////// +template +static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // terminate threads with no work + if (id >= nblocks) return; + + // read B 
+ read_B_g2s_1D_nosync(tx, myn, dB, sB); + __syncthreads(); + + CeedScalar rC[NB]; + mul_rAsBrC_1D_nosync(rA, sB, rC); + + // sum into C + sum_C_r2g_1D_nosync(tx, myn, rC, dC); +} + //////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -195,6 +277,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ #endif } +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_INTERP == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} + //////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -218,3 +312,15 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data); #endif } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ 
dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_DERIV == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h index 730acc6419..945227d145 100644 --- a/include/ceed/jit-source/magma/magma-common-nontensor.h +++ b/include/ceed/jit-source/magma/magma-common-nontensor.h @@ -104,6 +104,25 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int } } +//////////////////////////////////////////////////////////////////////////////// +// sum into C from reg. to global +// C is (P x NB) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) { + if (n != NB) { + for (int i = 0; i < n; i++) { + dC[i * P + tx] += rC[i]; + } + } else { +#pragma unroll + for (int i = 0; i < NB; i++) { + dC[i * P + tx] += rC[i]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // multiply C = A x B using 1D threads in P x 1 config // A (P x Q) in reg., one row per thread diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h index 6c483abd9d..494afacd87 100644 --- a/include/ceed/jit-source/magma/magma-common-tensor.h +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -36,6 +36,18 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 1D element into global memory from sV[][] -- for all components +// the devptr is assumed to point directly to the element +template +static __device__ __inline__ void sum_1d(T 
*sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + devptr[comp * compstride + tx] += sBuffer[comp][tx]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 2D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -107,6 +119,23 @@ static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) +template +static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 3D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -178,6 +207,23 @@ static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to point directly to the element (i.e. 
already offset by elem-stride) +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) +template +static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < (Q * Q)) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // reads T (no-trans) into shared memory // T is B x J diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index bc16186459..6be4e3ddf7 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1486,7 +1486,7 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { } /** - @brief Apply basis evaluation from nodes to quadrature points or vice versa + @brief Check input vector dimensions for CeedBasisApply[Add] @param[in] basis `CeedBasis` to evaluate @param[in] num_elem The number of elements to apply the basis evaluation to; @@ -1504,9 +1504,9 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ -int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; CeedSize u_length = 0, v_length; Ceed ceed; @@ -1520,8 +1520,6 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedCall(CeedVectorGetLength(v, &v_length)); if (u) CeedCall(CeedVectorGetLength(u, &u_length)); - CeedCheck(basis->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); - // 
Check compatibility of topological and geometrical dimensions CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0), @@ -1544,13 +1542,68 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, break; } CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Apply basis evaluation from nodes to quadrature points or vice versa + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` + + @return An error code: 0 - success, otherwise - failure + @ref User +**/ +int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->Apply, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } /** - @brief Apply basis evaluation from nodes to arbitrary points + @brief Apply basis evaluation from quadrature points to nodes and sum into target vector + + 
@param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAdd()` + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` to sum into + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAdd only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->ApplyAdd, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedBasisApplyAdd"); + CeedCall(basis->ApplyAdd(basis, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints @param[in] basis `CeedBasis` to evaluate @param[in] num_elem The number of elements to apply the basis evaluation to; @@ -1567,11 +1620,10 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ -int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { - bool 
is_tensor_basis; +static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { CeedInt dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0; CeedSize x_length = 0, u_length = 0, v_length; Ceed ceed; @@ -1624,16 +1676,50 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num // LCOV_EXCL_STOP } CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + return CEED_ERROR_SUCCESS; +} - // Backend method - if (basis->ApplyAtPoints) { - CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); - return CEED_ERROR_SUCCESS; - } +/** + @brief Default implimentation to apply basis evaluation from nodes to arbitrary points + + @param[in] basis `CeedBasis` to evaluate + @param[in] apply_add Sum result into target vector or overwrite + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ 
+static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0]; + Ceed ceed; + + CeedCall(CeedBasisGetCeed(basis, &ceed)); + CeedCall(CeedBasisGetDimension(basis, &dim)); + // Inserting check because clang-tidy doesn't understand this cannot occur + CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required"); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); // Default implementation - CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); - CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases"); + { + bool is_tensor_basis; + + CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); + CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases"); + } CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for a single element at a time"); if (eval_mode == CEED_EVAL_WEIGHT) { CeedCall(CeedVectorSetValue(v, 1.0)); @@ -1805,13 +1891,77 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num CeedCall(CeedVectorRestoreArrayRead(u, &u_array)); // -- Interpolate transpose from Chebyshev coefficients - CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); break; } } return 
CEED_ERROR_SUCCESS; } +/** + @brief Apply basis evaluation from nodes to arbitrary points + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + if (basis->ApplyAtPoints) { + CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + CeedCall(CeedBasisApplyAtPoints_Core(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Apply basis evaluation from nodes to arbitrary points and sum into target vector + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points 
Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAddAtPoints()` + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + if (basis->ApplyAddAtPoints) { + CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + CeedCall(CeedBasisApplyAtPoints_Core(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `Ceed` associated with a `CeedBasis` diff --git a/interface/ceed.c b/interface/ceed.c index 3c79c59b33..3bdd471454 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -952,7 +952,9 @@ int CeedInit(const char *resource, Ceed *ceed) { CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset), CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy), CEED_FTABLE_ENTRY(CeedBasis, Apply), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAdd), CEED_FTABLE_ENTRY(CeedBasis, 
ApplyAtPoints), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAddAtPoints), CEED_FTABLE_ENTRY(CeedBasis, Destroy), CEED_FTABLE_ENTRY(CeedTensorContract, Apply), CEED_FTABLE_ENTRY(CeedTensorContract, Destroy), diff --git a/tests/README.md b/tests/README.md index fd6e426420..031ff5a030 100644 --- a/tests/README.md +++ b/tests/README.md @@ -15,7 +15,8 @@ The tests are organized by API object, and some tests are further organized, as     2. CeedBasis simplex basis tests\     3. CeedBasis non-tensor H(div) basis tests\     4. CeedBasis non-tensor H(curl) basis tests\ -    5. CeedBasis evaluation at arbitrary points tests +    5. CeedBasis evaluation at arbitrary points tests\ + 6. CeedBasis ApplyAdd tests 4. CeedQFunction Tests\     0. CeedQFunction user code tests\     1. CeedQFunction gallery code tests diff --git a/tests/t360-basis.c b/tests/t360-basis.c new file mode 100644 index 0000000000..f953157e1c --- /dev/null +++ b/tests/t360-basis.c @@ -0,0 +1,56 @@ +/// @file +/// Test interpolation ApplyAdd in multiple dimensions +/// \test Test interpolation ApplyAdd in multiple dimensions +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector u, u_q, v, v_q, w_q; + CeedBasis basis; + CeedInt p = 4, q = 5, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim); + + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q_dim, &u_q); + CeedVectorCreate(ceed, q_dim, &v_q); + CeedVectorCreate(ceed, q_dim, &w_q); + + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); 
+ // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&v_q); + CeedVectorDestroy(&w_q); + CeedBasisDestroy(&basis); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t361-basis.c b/tests/t361-basis.c new file mode 100644 index 0000000000..6671a39ae5 --- /dev/null +++ b/tests/t361-basis.c @@ -0,0 +1,116 @@ +/// @file +/// Test grad ApplyAdd in multiple dimensions +/// \test Test grad ApplyAdd in multiple dimensions +#include +#include +#include + +static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { + CeedScalar result = tanh(x[0] + 0.1); + if (dim > 1) result += atan(x[1] + 0.2); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); + return result; +} + +static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { + CeedScalar tol; + if (scalar_type == CEED_SCALAR_FP32) { + if (dim == 3) tol = 0.05; + else tol = 1.e-3; + } else { + tol = 1.e-10; + } + return 2.0 * tol; +} + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector x, x_q, u, u_q, ones, v; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt p = 8, q = 10, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim), x_dim = CeedIntPow(2, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / 
CeedIntPow(2, d) ? 1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, p_dim * dim, &x_q); + CeedVectorSetValue(x_q, 0); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, q_dim * dim, &u_q); + CeedVectorSetValue(u_q, 0); + CeedVectorCreate(ceed, q_dim * dim, &ones); + CeedVectorSetValue(ones, 1); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(v, 0); + + // Get function values at quadrature points + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x_lobatto); + CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_q); + + { + const CeedScalar *x_q_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_q, CEED_MEM_HOST, &x_q_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_q_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_q, &x_q_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, u_array); + } + + // Calculate G u at quadrature points, G' * 1 at dofs + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u_gauss); + CeedBasisApply(basis_u_gauss, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, u_q); + CeedVectorScale(u_q, 2.0); + CeedBasisApply(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + CeedBasisApplyAdd(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + + // Check if 1' * G * u = u' * (G' * 1) + { + const CeedScalar *v_array, *u_array, *u_q_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(u_q, CEED_MEM_HOST, &u_q_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < dim * q_dim; i++) sum_2 += u_q_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u, &u_array); + 
CeedVectorRestoreArrayRead(u_q, &u_q_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %0.12f != %0.12f\n", dim, sum_1, sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_q); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x_lobatto); + CeedBasisDestroy(&basis_u_gauss); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t362-basis.c b/tests/t362-basis.c new file mode 100644 index 0000000000..bff1937d66 --- /dev/null +++ b/tests/t362-basis.c @@ -0,0 +1,59 @@ +/// @file +/// Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +/// \test Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +#include +#include +#include + +#include "t320-basis.h" + +// main test +int main(int argc, char **argv) { + Ceed ceed; + CeedVector u, v, u_q, v_q, w_q; + const CeedInt p = 6, q = 4, dim = 2; + CeedBasis basis; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, p, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q, &u_q); + CeedVectorCreate(ceed, q, &v_q); + CeedVectorCreate(ceed, q, &w_q); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + 
+    const CeedScalar *v_array;
+    CeedScalar        area = 0.0;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < p; i++) area += v_array[i];
+    if (fabs(area - 1.0) > 1E-6) printf("Incorrect area computed %f != %f\n", area, 1.0);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_q);
+  CeedVectorDestroy(&v_q);
+  CeedVectorDestroy(&w_q);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t363-basis.c b/tests/t363-basis.c
new file mode 100644
index 0000000000..6c19f34027
--- /dev/null
+++ b/tests/t363-basis.c
+/// @file
+/// Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+  CeedScalar    column_sum[p];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, q * dim, &u);
+  CeedVectorSetValue(u, 1);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(v, 0);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+
+  // Check values at quadrature points
+  for (int i = 0; i < p; i++) {
+    column_sum[i] = 0;
+    for (int j = 0; j < q * dim; j++) {
+      column_sum[i] += grad[i + j * p];
+    }
+  }
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (int i = 0; i < p; i++) {
+      if (fabs(column_sum[i] - v_array[i] / 2.0) > 100.
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, v_array[i] / 2.0, column_sum[i]);
+    }
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t364-basis.c b/tests/t364-basis.c
new file mode 100644
index 0000000000..6ab4058d30
--- /dev/null
+++ b/tests/t364-basis.c
+/// @file
+/// Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+/// \test Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#define ALEN(a) (sizeof(a) / sizeof((a)[0]))
+
+static CeedScalar Eval(CeedScalar x, CeedInt n, const CeedScalar *c) {
+  CeedScalar y = c[n - 1];
+  for (CeedInt i = n - 2; i >= 0; i--) y = y * x + c[i];
+  return y;
+}
+
+int main(int argc, char **argv) {
+  Ceed             ceed;
+  CeedVector       x, x_nodes, x_points, x_point, u, v, u_point, v_point;
+  CeedBasis        basis_x, basis_u;
+  const CeedInt    p = 5, q = 5, num_points = 4;
+  const CeedScalar c[4] = {1, 2, 3, 4};  // 1 + 2x + 3x^2 + ...
+ + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, 2, &x); + CeedVectorCreate(ceed, p, &x_nodes); + CeedVectorCreate(ceed, num_points, &x_points); + CeedVectorCreate(ceed, 1, &x_point); + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, num_points, &v); + CeedVectorCreate(ceed, p, &u_point); + CeedVectorCreate(ceed, 1, &v_point); + CeedVectorSetValue(v_point, 1.0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[2]; + + for (CeedInt i = 0; i < 2; i++) x_array[i] = CeedIntPow(-1, i + 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p; i++) u_array[i] = Eval(x_array[i], ALEN(c), c); + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[4] = {-0.33, -0.65, 0.16, 0.99}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + + for (CeedInt i = 0; i < num_points; i++) { + const CeedInt num_point[1] = {1}; + CeedScalar fx = 0.0; + const CeedScalar *x_array, *u_array, *v_array, *u_point_array; + + CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorSetValue(x_point, x_array[i]); + CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + // Double it + 
+    CeedBasisApplyAddAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
+    for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
+    if (fabs(v_array[i] * 2.0 - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i] * 2.0, fx, x_array[i]);
+    CeedVectorRestoreArrayRead(u_point, &u_point_array);
+    CeedVectorRestoreArrayRead(x_points, &x_array);
+    CeedVectorRestoreArrayRead(u, &u_array);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_nodes);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&x_point);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_point);
+  CeedVectorDestroy(&v_point);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t365-basis.c b/tests/t365-basis.c
new file mode 100644
index 0000000000..74f93ce881
--- /dev/null
+++ b/tests/t365-basis.c
+/// @file
+/// Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+/// \test Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.005;
+    else tol = 1.e-4;
+  } else {
+    tol = 1.e-11;
+  }
+  return tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector    x, x_nodes, x_points, u, u_points, v, ones;
+    CeedBasis     basis_x, basis_u;
+    const CeedInt p = 9, q = 9, num_points = 4, x_dim = CeedIntPow(2,
dim), p_dim = CeedIntPow(p, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + CeedVectorCreate(ceed, p_dim * dim, &x_nodes); + CeedVectorCreate(ceed, num_points * dim, &x_points); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, num_points * dim, &u_points); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorCreate(ceed, num_points * dim, &ones); + + CeedVectorSetValue(ones, 1); + CeedVectorSetValue(v, 0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[12] = {-0.33, -0.65, 0.16, 0.99, -0.65, 0.16, 0.99, -0.33, 0.16, 0.99, -0.33, -0.65}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + + // Calculate G u at arbitrary points, G' * 1 at dofs + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + // Double it + 
CeedBasisApplyAddAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + { + const CeedScalar *u_array, *v_array, *u_points_array; + + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u_points, CEED_MEM_HOST, &u_points_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < num_points * dim; i++) sum_2 += u_points_array[i]; + CeedVectorRestoreArrayRead(u, &u_array); + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u_points, &u_points_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - 2.0 * sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, 2.0 * sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_nodes); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_points); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + } + CeedDestroy(&ceed); + return 0; +}