From db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Tue, 13 Aug 2024 23:30:20 +0100 Subject: [PATCH] Add CeedBasisApplyAdd (#1644) * basis - add CeedBasisApplyAdd + CPU impl * basis - add ref GPU ApplyAdd * basis - add shared GPU ApplyAdd * basis - add MAGMA ApplyAdd * basis - add CeedBasisApplyAddAtPoints + default impl * basis - add GPU ApplyAddAtPoints * tidy - add extra assert to fix clang-tidy * Apply suggestions from code review style - consistently use indexing over pointer arithmetic Co-authored-by: Zach Atkins * style - more pointer fixes --------- Co-authored-by: Zach Atkins --- backends/cuda-ref/ceed-cuda-ref-basis.c | 67 +++++-- backends/cuda-shared/ceed-cuda-shared-basis.c | 63 ++++-- backends/cuda-shared/ceed-cuda-shared.h | 2 + backends/hip-ref/ceed-hip-ref-basis.c | 66 +++++-- backends/hip-shared/ceed-hip-shared-basis.c | 64 ++++-- backends/hip-shared/ceed-hip-shared.h | 2 + backends/magma/ceed-magma-basis.c | 59 +++++- backends/magma/ceed-magma.h | 4 + backends/ref/ceed-ref-basis.c | 49 +++-- include/ceed-impl.h | 2 + include/ceed/ceed.h | 3 + .../cuda/cuda-ref-basis-nontensor-templates.h | 4 +- .../cuda/cuda-ref-basis-tensor-at-points.h | 26 +-- .../jit-source/cuda/cuda-ref-basis-tensor.h | 23 +-- .../cuda/cuda-ref-operator-assemble.h | 2 +- .../cuda/cuda-ref-restriction-at-points.h | 2 +- .../cuda/cuda-ref-restriction-curl-oriented.h | 4 +- .../cuda/cuda-ref-restriction-offset.h | 2 +- .../cuda/cuda-ref-restriction-oriented.h | 2 +- .../cuda-shared-basis-read-write-templates.h | 41 ++++ .../cuda/cuda-shared-basis-tensor-templates.h | 80 ++++---- .../cuda/cuda-shared-basis-tensor.h | 68 +++++++ .../hip/hip-ref-basis-nontensor-templates.h | 6 +- .../hip/hip-ref-basis-tensor-at-points.h | 30 +-- .../jit-source/hip/hip-ref-basis-tensor.h | 23 +-- .../hip/hip-ref-operator-assemble.h | 2 +- .../hip/hip-ref-restriction-at-points.h | 2 +- .../hip/hip-ref-restriction-curl-oriented.h | 4 +- 
.../hip/hip-ref-restriction-offset.h | 2 +- .../hip/hip-ref-restriction-oriented.h | 2 +- .../hip-shared-basis-read-write-templates.h | 41 ++++ .../hip/hip-shared-basis-tensor-templates.h | 80 ++++---- .../jit-source/hip/hip-shared-basis-tensor.h | 81 ++++++++ .../jit-source/magma/magma-basis-grad-1d.h | 45 +++++ .../jit-source/magma/magma-basis-grad-2d.h | 51 +++++ .../jit-source/magma/magma-basis-grad-3d.h | 58 ++++++ .../jit-source/magma/magma-basis-interp-1d.h | 45 +++++ .../jit-source/magma/magma-basis-interp-2d.h | 41 ++++ .../jit-source/magma/magma-basis-interp-3d.h | 41 ++++ .../magma-basis-interp-deriv-nontensor.h | 106 ++++++++++ .../jit-source/magma/magma-common-nontensor.h | 19 ++ .../jit-source/magma/magma-common-tensor.h | 46 +++++ interface/ceed-basis.c | 186 ++++++++++++++++-- interface/ceed.c | 2 + tests/README.md | 3 +- tests/t360-basis.c | 56 ++++++ tests/t361-basis.c | 116 +++++++++++ tests/t362-basis.c | 59 ++++++ tests/t363-basis.c | 54 +++++ tests/t364-basis.c | 98 +++++++++ tests/t365-basis.c | 123 ++++++++++++ 51 files changed, 1815 insertions(+), 242 deletions(-) create mode 100644 tests/t360-basis.c create mode 100644 tests/t361-basis.c create mode 100644 tests/t362-basis.c create mode 100644 tests/t363-basis.c create mode 100644 tests/t364-basis.c create mode 100644 tests/t365-basis.c diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index 91958b6b25..8cf285cbc8 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -18,7 +18,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -33,10 +34,11 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -83,11 +85,23 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { 
+static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -158,10 +172,11 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -200,11 +215,23 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + 
//------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -222,10 +249,11 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -291,6 +319,18 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + 
CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -374,7 +414,9 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); return CEED_ERROR_SUCCESS; } @@ -434,6 +476,7 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } @@ -493,6 +536,7 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } @@ -552,6 +596,7 @@ int 
CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 118d156a84..4f0901484d 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -27,8 +27,8 @@ int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Cuda *ceed_Cuda; CeedInt dim, num_comp; @@ -45,7 +45,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch 
(eval_mode) { @@ -66,7 +67,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -78,8 +80,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -89,8 +91,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -116,7 +118,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -128,7 +131,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -138,7 +142,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -186,11 +191,23 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, 
CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -261,10 +278,11 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -303,6 +321,18 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + 
CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ @@ -374,8 +404,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -384,6 +416,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index 40bbaed6a0..96800a0a86 
100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -14,8 +14,10 @@ typedef struct { CUmodule module; CUfunction Interp; CUfunction InterpTranspose; + CUfunction InterpTransposeAdd; CUfunction Grad; CUfunction GradTranspose; + CUfunction GradTransposeAdd; CUfunction Weight; CUmodule moduleAtPoints; CeedInt num_points; diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index fd1a9fabde..78e27a1a5f 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -17,7 +17,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -32,10 +33,11 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -82,11 +84,22 @@ int CeedBasisApply_Hip(CeedBasis basis, const 
CeedInt num_elem, CeedTransposeMod return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -157,10 +170,11 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for 
transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -199,11 +213,23 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { Ceed ceed; CeedInt num_nodes, num_qpts; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -221,10 +247,11 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - 
CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -290,6 +317,18 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -373,7 +412,9 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); return CEED_ERROR_SUCCESS; } @@ -433,6 +474,7 @@ int 
CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } @@ -492,6 +534,7 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } @@ -551,6 +594,7 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index ff41f9efe6..bda080ed2d 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -87,8 +87,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Hip *ceed_Hip; CeedInt dim, num_comp; @@ -105,7 +105,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { @@ -125,7 +126,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -136,8 +138,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -147,8 +149,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -174,7 +176,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -185,7 +188,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? 
data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -195,7 +199,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -245,11 +250,23 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee return CEED_ERROR_SUCCESS; } +int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode 
t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -320,10 +337,11 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -362,6 +380,18 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, 
true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ @@ -438,8 +468,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, has_collocated_grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -448,7 +480,9 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index 8bc9d041a2..fe3384f55d 100644 --- 
a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -14,8 +14,10 @@ typedef struct { hipModule_t module; hipFunction_t Interp; hipFunction_t InterpTranspose; + hipFunction_t InterpTransposeAdd; hipFunction_t Grad; hipFunction_t GradTranspose; + hipFunction_t GradTransposeAdd; hipFunction_t Weight; hipModule_t moduleAtPoints; CeedInt num_points; diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index de18a1a2fc..71a86d5b8d 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -26,7 +26,8 @@ //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q; @@ -52,7 +53,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (e_mode) { @@ -115,7 +117,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, 
&num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, grid, num_threads, num_t_col, + 1, shared_mem, args)); } else { CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args)); } @@ -192,7 +195,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, grid, num_threads, num_t_col, 1, + shared_mem, args)); } else { CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args)); } @@ -248,6 +252,16 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ @@ -259,8 +273,8 @@ int 
CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, + CeedVector u, CeedVector v) { Ceed ceed; Ceed_Magma *data; CeedInt num_comp, num_nodes, num_qpts, P, Q, N; @@ -281,7 +295,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed // Read vectors if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Compile kernels for N as needed CeedInt iN = 0; @@ -344,8 +359,10 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed impl->NB_deriv_t[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); + 
CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN])); if (!impl->Weight) { CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); CeedCallBackend(CeedFree(&weight_kernel_path)); @@ -388,7 +405,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { if (e_mode == CEED_EVAL_INTERP) { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->InterpTranspose[iN]; + Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN]; NB = impl->NB_interp_t[iN]; } else { Kernel = impl->Interp[iN]; @@ -396,7 +413,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } } else { if (t_mode == CEED_TRANSPOSE) { - Kernel = impl->DerivTranspose[iN]; + Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN]; NB = impl->NB_deriv_t[iN]; } else { Kernel = impl->Deriv[iN]; @@ -414,7 +431,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed } else { for (CeedInt d = 0; d < q_comp; d++) { if (t_mode == CEED_TRANSPOSE) { - const CeedScalar beta = (d > 0) ? 1.0 : 0.0; + const CeedScalar beta = (apply_add || (d > 0)) ? 
1.0 : 0.0; magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue); } else { magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue); @@ -443,6 +460,18 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy tensor basis //------------------------------------------------------------------------------ @@ -559,22 +588,28 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const case 1: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", 
&impl->Weight)); break; case 2: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight)); break; case 3: CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd)); CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight)); break; } @@ -588,6 +623,7 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedBasisSetData(basis, impl)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); return CEED_ERROR_SUCCESS; @@ -650,6 +686,7 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } @@ -711,6 +748,7 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } @@ -772,6 +810,7 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index aa60b37b40..22dd4264b6 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -47,8 +47,10 @@ typedef struct { CeedMagmaModule module; CeedMagmaFunction Interp; CeedMagmaFunction InterpTranspose; + CeedMagmaFunction InterpTransposeAdd; CeedMagmaFunction Grad; CeedMagmaFunction GradTranspose; + CeedMagmaFunction GradTransposeAdd; 
CeedMagmaFunction Weight; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; @@ -59,8 +61,10 @@ typedef struct { CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction InterpTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaFunction DerivTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Weight; CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedInt NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c index b82e8bb278..121669012a 100644 --- a/backends/ref/ceed-ref-basis.c +++ b/backends/ref/ceed-ref-basis.c @@ -16,11 +16,11 @@ //------------------------------------------------------------------------------ // Basis Apply //------------------------------------------------------------------------------ -static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { +static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, + CeedVector V) { Ceed ceed; - bool is_tensor_basis; + bool is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE); CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; - const CeedInt add = (t_mode == CEED_TRANSPOSE); const CeedScalar *u; CeedScalar *v; CeedTensorContract contract; @@ -36,13 +36,15 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedBasisGetTensorContract(basis, &contract)); if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, 
CEED_MEM_HOST, &u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); - // Clear v if operating in transpose - if (t_mode == CEED_TRANSPOSE) { - const CeedInt v_size = num_elem * num_comp * num_nodes; + if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v)); + else CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); + + if (t_mode == CEED_TRANSPOSE && !apply_add) { + CeedSize len; - for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0; + CeedCallBackend(CeedVectorGetLength(V, &len)); + for (CeedInt i = 0; i < len; i++) v[i] = 0.0; } CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis)); @@ -101,8 +103,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // or Grad to quadrature points (Transpose) for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode, - add && (d > 0), - (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem), + (t_mode == CEED_TRANSPOSE) && (d > 0), + (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : &u[d * num_qpts * num_comp * num_elem]), (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp))); pre /= P; post *= Q; @@ -117,9 +119,10 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( - contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1), + contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? 
impl->collo_grad_1d : interp_1d), t_mode, + (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)), (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])), - (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); + (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem] : (d == dim - 1 ? v : tmp[(d + 1) % 2])))); pre /= P; post *= Q; } @@ -133,8 +136,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0), - t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem, - t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem)); + t_mode == CEED_NOTRANSPOSE ? u : &u[d * num_comp * num_qpts * num_elem], + t_mode == CEED_TRANSPOSE ? v : &v[d * num_comp * num_qpts * num_elem])); pre /= P; post *= Q; } @@ -156,8 +159,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1), - (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]), - (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2]))); + (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : &u[p * num_comp * num_qpts * num_elem]) : tmp[d % 2]), + (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? 
v : &v[p * num_comp * num_qpts * num_elem]) : tmp[(d + 1) % 2]))); pre /= P; post *= Q; } @@ -249,6 +252,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo return CEED_ERROR_SUCCESS; } +static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, false, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAdd_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { + CeedCallBackend(CeedBasisApplyCore_Ref(basis, true, num_elem, t_mode, eval_mode, U, V)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis Destroy Tensor //------------------------------------------------------------------------------ @@ -297,6 +310,7 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref)); return CEED_ERROR_SUCCESS; } @@ -316,6 +330,7 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } @@ -334,6 +349,7 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } @@ -352,6 +368,7 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref)); return CEED_ERROR_SUCCESS; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index de206e9e59..0b76071a08 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -186,7 +186,9 @@ struct CeedElemRestriction_private { struct CeedBasis_private { Ceed ceed; int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); + int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); + int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector); int (*Destroy)(CeedBasis); int ref_count; bool is_tensor_basis; /* flag for tensor basis */ diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index f55aa9c6f3..d847525da8 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -305,8 +305,11 @@ CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_ CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy); CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream); CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, 
CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v); CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed); CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis); CEED_EXTERN int CeedBasisGetDimension(CeedBasis basis, CeedInt *dim); diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h index 64b57d0d68..6b19ad448d 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h @@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h index fc65a10912..263d29338e 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt if (is_transpose) { for (CeedInt elem 
= blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt for (CeedInt d = 0; d < BASIS_DIM; d++) { // Update buffers used pre /= 1; - const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Update buffers used pre /= Q; const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? 
buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); @@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -235,7 +236,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; pre = 1; post = 1; @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const 
CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; pre = BASIS_NUM_QPTS; post = 1; diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h index 468ec978c0..4c8c2f447c 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h @@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; 
} @@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { CeedInt pre = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h index 318a07b163..9fc02a1c7a 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h @@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h index 83c4086ed0..87aeda2e3b 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h @@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices if (loc_node >= points_per_elem[elem]) continue; for (CeedInt comp = 0; comp < 
RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h index d317f42cc5..86a4b53545 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h @@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h index 0bd3dc0dd8..9492b31984 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h @@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h index d36f27277e..7c667922bf 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h @@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index b10ba108f8..56234c28e4 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -46,6 +46,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -82,6 +95,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -121,3 +147,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } } + 
+template +inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f2fde94139..56989f2b69 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca template inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -63,7 +63,7 @@ template inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -74,7 +74,7 @@ template inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], 
c_G, &r_V[comp]); } } @@ -85,7 +85,7 @@ template inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ -188,8 +188,8 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } @@ -201,8 +201,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); } } @@ -214,10 +214,10 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } @@ -229,10 +229,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; 
comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); } } @@ -423,9 +423,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } @@ -438,9 +438,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -453,15 +453,15 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], 
c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -474,15 +474,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -495,12 +495,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); 
ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -513,12 +513,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index d6039d3a33..c295362978 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -81,6 +81,39 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern 
__shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -154,6 +187,41 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda 
data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, c_B, c_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, c_B, c_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V); + else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h index 00b559ff10..0374d459d5 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h +++ 
b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h @@ -24,7 +24,7 @@ inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U for (CeedInt comp = 0; comp < NUM_COMP; comp++) { // Run with Q threads - U = d_U + elem * strides_elem_U + comp * strides_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U]; for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; for (CeedInt i = 0; i < P; i++) { const CeedScalar val = U[i]; @@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid // Run with P threads r_V = 0.0; for (CeedInt d = 0; d < Q_COMP; d++) { - U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; + U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U]; for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; } - d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; + d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h index fc65a10912..22d81bc30a 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt for (CeedInt d = 
0; d < BASIS_DIM; d++) { // Update buffers used pre /= 1; - const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Update buffers used pre /= Q; const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); @@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is if (is_transpose) { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = 1; CeedInt post = 1; @@ -235,14 +236,14 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; pre = 1; post = 1; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { // Update buffers used pre /= 1; - const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); + const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is } else { for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; pre = BASIS_NUM_QPTS; post = 1; @@ -330,7 +332,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is // Update buffers used pre /= Q; const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? 
buffer_1 : buffer_2); // Build Chebyshev polynomial values if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h index 7d732f8e77..db509ac2a0 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h @@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; CeedInt pre = u_size; CeedInt post = 1; @@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans // Contract along middle index for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar vk = 0; - - for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; } @@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { CeedInt pre = is_transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h index 35789443b6..838dcfd4a5 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h @@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { extern __shared__ CeedScalar s_CT[]; - CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN]; const int l = threadIdx.x; // The output column index of each B^T D B operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h index 614628a81f..f4cb95993b 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h @@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices if (loc_node >= points_per_elem[elem]) continue; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + 
comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h index 4d3e88ce27..76d9758828 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h @@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } @@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h index 26cd41ee92..65283b7193 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h @@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h index cb987fa8a7..f983a24fc0 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h @@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices const CeedInt elem = node / RSTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h index a6d945ac56..379d52d13b 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h @@ -56,6 +56,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -92,6 +105,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -131,3 +157,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt } } } + +template 
+inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h index 5e52d1c829..8dc50e4ed8 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h @@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal template inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_B, r_V + comp); + ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -63,7 +63,7 @@ template inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } } @@ -74,7 +74,7 @@ template inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX1d(data, r_U + comp, c_G, r_V + comp); + ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ 
-85,7 +85,7 @@ template inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); + ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } } @@ -187,8 +187,8 @@ template inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_B, r_V + comp); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp]); } } @@ -200,8 +200,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp, c_B, r_t); - ContractTransposeX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp], c_B, r_t); + ContractTransposeX2d(data, r_t, c_B, &r_V[comp]); } } @@ -213,10 +213,10 @@ inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__re CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); - ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); + ContractX2d(data, &r_U[comp], c_G, r_t); + ContractY2d(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); + ContractX2d(data, &r_U[comp], c_B, r_t); + ContractY2d(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); } } @@ -228,10 +228,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; 
comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); - ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); - ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); + ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); + ContractTransposeX2d(data, r_t, c_G, &r_V[comp]); + ContractTransposeY2d(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); + ContractTransposeAddX2d(data, r_t, c_B, &r_V[comp]); } } @@ -421,9 +421,9 @@ inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__ CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D]); } } @@ -436,9 +436,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedS CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -451,15 +451,15 @@ inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__re CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractX3d(data, 
&r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractX3d(data, &r_U[comp * P_1D], c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -472,15 +472,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedSca CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } @@ -493,12 +493,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedSc CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); + ContractX3d(data, &r_U[comp * P_1D], c_B, 
r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); + ContractX3d(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); + ContractY3d(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); + ContractZ3d(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); } } @@ -511,12 +511,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddY3d(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); + ContractTransposeAddX3d(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeX3d(data, r_t2, c_B, &r_V[comp * P_1D]); } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index 0a9a1f3cee..d052e53bf1 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -92,6 +92,44 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar 
*__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)]; + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -181,6 +219,49 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void 
GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + loadMatrix(d_grad_1d, s_G); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)]; + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * 
BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h index dd21682225..cd6f8548fb 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradta_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, 
sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h index 23559716dc..b4e7e2981a 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -188,3 +188,54 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradta_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + 
read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // sum into V + sum_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h index c030f8e9e5..c8028be756 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -225,3 +225,61 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradta_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global 
memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P))); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + read_U_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // sum into V + sum_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h index ae8d082653..02f894ecce 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, 
MAGMA_MAXTHREADS_ // write V write_1d(sV, dV, cstrdV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpta_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, sU, sV, tx); + __syncthreads(); + + // sum into V + sum_1d(sV, dV, cstrdV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h index a2a41a25ae..56c8081c83 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -144,3 +144,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // write V write_V_2d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, 
MAGMA_MAXTHREADS_2D)) __global__ + void magma_interpta_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U -- there is a sync at the end of this function + read_U_2d(dU, cstrdU, rU, sTmp, tx); + + // no sync needed here -- read_U_2d already syncs at the end + magma_interp_2d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_2d(dV, cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h index 50c7e4df4a..ac11e3f8df 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h @@ -172,3 +172,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // write V write_V_3d(dV, cstrdV, rV, tx); } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpta_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, 
const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P)); + + // read T + if (ty == 0) { + read_T_trans_gm2sm(tx, dT, sT); + } + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + read_U_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // sum into V + sum_V_3d(dV, cstrdV, rV, tx); +} diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h index f5e2df1e90..0614732f02 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h @@ -99,6 +99,52 @@ static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, Ce } } +//////////////////////////////////////////////////////////////////////////////// +template +static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * 
blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + CeedScalar rC[NB] = {0.0}; + + // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll) + for (int d = 0; d < Q_COMP; d++) { + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // read B + if (id < nblocks) { + read_B_g2s_1D_nosync(tx, myn, dB, sB); + } + __syncthreads(); + + addmul_rAsBrC_1D_nosync(rA, sB, rC); + + dA += P * Q; + dB += Q * n; + + __syncthreads(); + } + + // sum into C + if (id < nblocks) { + sum_C_r2g_1D_nosync(tx, myn, rC, dC); + } +} + //////////////////////////////////////////////////////////////////////////////// template static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, @@ -171,6 +217,42 @@ static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, C write_C_r2g_1D_nosync(tx, myn, rC, dC); } +//////////////////////////////////////////////////////////////////////////////// +template +static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC, + CeedScalar *shared_data) { + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = (n + NB - 1) / NB; + const int myn = min(NB, n - id * NB); + + dB += id * Q * NB; + dC += id * P * NB; + + // A is P x Q + CeedScalar *sA = shared_data; + CeedScalar *sB = shared_data + ty * Q * NB; + + // read A using all threads + CeedScalar rA[Q]; + read_A_notrans_g2r_1D_nosync(tx, ty, dA, sA, rA); + __syncthreads(); + + // terminate threads with no work + if (id >= nblocks) return; + + // read B 
+ read_B_g2s_1D_nosync(tx, myn, dB, sB); + __syncthreads(); + + CeedScalar rC[NB]; + mul_rAsBrC_1D_nosync(rA, sB, rC); + + // sum into C + sum_C_r2g_1D_nosync(tx, myn, rC, dC); +} + //////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -195,6 +277,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ #endif } +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_INTERP == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} + //////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) { @@ -218,3 +312,15 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _ magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data); #endif } + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ 
dC) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + +#if BASIS_Q_COMP_DERIV == 1 + magma_basis_nontensor_device_ta1(n, dA, dB, dC, (CeedScalar *)shared_data); +#else + magma_basis_nontensor_device_ta(n, dA, dB, dC, (CeedScalar *)shared_data); +#endif +} diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h index 730acc6419..945227d145 100644 --- a/include/ceed/jit-source/magma/magma-common-nontensor.h +++ b/include/ceed/jit-source/magma/magma-common-nontensor.h @@ -104,6 +104,25 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int } } +//////////////////////////////////////////////////////////////////////////////// +// sum into C from reg. to global +// C is (P x NB) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) { + if (n != NB) { + for (int i = 0; i < n; i++) { + dC[i * P + tx] += rC[i]; + } + } else { +#pragma unroll + for (int i = 0; i < NB; i++) { + dC[i * P + tx] += rC[i]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // multiply C = A x B using 1D threads in P x 1 config // A (P x Q) in reg., one row per thread diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h index 6c483abd9d..494afacd87 100644 --- a/include/ceed/jit-source/magma/magma-common-tensor.h +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -36,6 +36,18 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 1D element into global memory from sV[][] -- for all components +// the devptr is assumed to point directly to the element +template +static __device__ __inline__ void sum_1d(T 
*sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + devptr[comp * compstride + tx] += sBuffer[comp][tx]; + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 2D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -107,6 +119,23 @@ static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) +template +static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // read U of a 3D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride @@ -178,6 +207,23 @@ static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[D } } +//////////////////////////////////////////////////////////////////////////////// +// sum into V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to point directly to the element (i.e. 
already offset by elem-stride) +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) +template +static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < (Q * Q)) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j]; + } + } + } +} + //////////////////////////////////////////////////////////////////////////////// // reads T (no-trans) into shared memory // T is B x J diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index bc16186459..6be4e3ddf7 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1486,7 +1486,7 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { } /** - @brief Apply basis evaluation from nodes to quadrature points or vice versa + @brief Check input vector dimensions for CeedBasisApply[Add] @param[in] basis `CeedBasis` to evaluate @param[in] num_elem The number of elements to apply the basis evaluation to; @@ -1504,9 +1504,9 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ -int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { +static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; CeedSize u_length = 0, v_length; Ceed ceed; @@ -1520,8 +1520,6 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedCall(CeedVectorGetLength(v, &v_length)); if (u) CeedCall(CeedVectorGetLength(u, &u_length)); - CeedCheck(basis->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); - // 
Check compatibility of topological and geometrical dimensions CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0), @@ -1544,13 +1542,68 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, break; } CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Apply basis evaluation from nodes to quadrature points or vice versa + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` + + @return An error code: 0 - success, otherwise - failure + @ref User +**/ +int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->Apply, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply"); CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } /** - @brief Apply basis evaluation from nodes to arbitrary points + @brief Apply basis evaluation from quadrature points to nodes and sum into target vector + + 
@param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] t_mode @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAdd()` + @param[in] eval_mode @ref CEED_EVAL_NONE to use values directly, + @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_DIV to use divergence, + @ref CEED_EVAL_CURL to use curl, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] u Input `CeedVector` + @param[out] v Output `CeedVector` to sum into + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAdd only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v)); + CeedCheck(basis->ApplyAdd, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedBasisApplyAdd"); + CeedCall(basis->ApplyAdd(basis, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints @param[in] basis `CeedBasis` to evaluate @param[in] num_elem The number of elements to apply the basis evaluation to; @@ -1567,11 +1620,10 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, @return An error code: 0 - success, otherwise - failure - @ref User + @ref Developer **/ -int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector x_ref, CeedVector u, CeedVector v) { - bool 
is_tensor_basis; +static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { CeedInt dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0; CeedSize x_length = 0, u_length = 0, v_length; Ceed ceed; @@ -1624,16 +1676,50 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num // LCOV_EXCL_STOP } CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + return CEED_ERROR_SUCCESS; +} - // Backend method - if (basis->ApplyAtPoints) { - CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); - return CEED_ERROR_SUCCESS; - } +/** + @brief Default implimentation to apply basis evaluation from nodes to arbitrary points + + @param[in] basis `CeedBasis` to evaluate + @param[in] apply_add Sum result into target vector or overwrite + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref Developer +**/ 
+static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0]; + Ceed ceed; + + CeedCall(CeedBasisGetCeed(basis, &ceed)); + CeedCall(CeedBasisGetDimension(basis, &dim)); + // Inserting check because clang-tidy doesn't understand this cannot occur + CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required"); + CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); // Default implementation - CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); - CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases"); + { + bool is_tensor_basis; + + CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis)); + CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases"); + } CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for a single element at a time"); if (eval_mode == CEED_EVAL_WEIGHT) { CeedCall(CeedVectorSetValue(v, 1.0)); @@ -1805,13 +1891,77 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num CeedCall(CeedVectorRestoreArrayRead(u, &u_array)); // -- Interpolate transpose from Chebyshev coefficients - CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); + else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v)); break; } } return 
CEED_ERROR_SUCCESS; } +/** + @brief Apply basis evaluation from nodes to arbitrary points + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + if (basis->ApplyAtPoints) { + CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + CeedCall(CeedBasisApplyAtPoints_Core(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } + return CEED_ERROR_SUCCESS; +} + +/** + @brief Apply basis evaluation from nodes to arbitrary points and sum into target vector + + @param[in] basis `CeedBasis` to evaluate + @param[in] num_elem The number of elements to apply the basis evaluation to; + the backend will specify the ordering in @ref CeedElemRestrictionCreate() + @param[in] num_points 
Array of the number of points to apply the basis evaluation to in each element, size `num_elem` + @param[in] t_mode @ref CEED_NOTRANSPOSE to evaluate from nodes to points; + @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAddAtPoints()` + @param[in] eval_mode @ref CEED_EVAL_INTERP to use interpolated values, + @ref CEED_EVAL_GRAD to use gradients, + @ref CEED_EVAL_WEIGHT to use quadrature weights + @param[in] x_ref `CeedVector` holding reference coordinates of each point + @param[in] u Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE + @param[out] v Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE"); + CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + if (basis->ApplyAddAtPoints) { + CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } else { + CeedCall(CeedBasisApplyAtPoints_Core(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + } + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `Ceed` associated with a `CeedBasis` diff --git a/interface/ceed.c b/interface/ceed.c index 3c79c59b33..3bdd471454 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -952,7 +952,9 @@ int CeedInit(const char *resource, Ceed *ceed) { CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset), CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy), CEED_FTABLE_ENTRY(CeedBasis, Apply), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAdd), CEED_FTABLE_ENTRY(CeedBasis, 
ApplyAtPoints), + CEED_FTABLE_ENTRY(CeedBasis, ApplyAddAtPoints), CEED_FTABLE_ENTRY(CeedBasis, Destroy), CEED_FTABLE_ENTRY(CeedTensorContract, Apply), CEED_FTABLE_ENTRY(CeedTensorContract, Destroy), diff --git a/tests/README.md b/tests/README.md index fd6e426420..031ff5a030 100644 --- a/tests/README.md +++ b/tests/README.md @@ -15,7 +15,8 @@ The tests are organized by API object, and some tests are further organized, as     2. CeedBasis simplex basis tests\     3. CeedBasis non-tensor H(div) basis tests\     4. CeedBasis non-tensor H(curl) basis tests\ -    5. CeedBasis evaluation at arbitrary points tests +    5. CeedBasis evaluation at arbitrary points tests\ + 6. CeedBasis ApplyAdd tests 4. CeedQFunction Tests\     0. CeedQFunction user code tests\     1. CeedQFunction gallery code tests diff --git a/tests/t360-basis.c b/tests/t360-basis.c new file mode 100644 index 0000000000..f953157e1c --- /dev/null +++ b/tests/t360-basis.c @@ -0,0 +1,56 @@ +/// @file +/// Test interpolation ApplyAdd in multiple dimensions +/// \test Test interpolation ApplyAdd in multiple dimensions +#include +#include +#include + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector u, u_q, v, v_q, w_q; + CeedBasis basis; + CeedInt p = 4, q = 5, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim); + + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q_dim, &u_q); + CeedVectorCreate(ceed, q_dim, &v_q); + CeedVectorCreate(ceed, q_dim, &w_q); + + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); 
+ // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&v_q); + CeedVectorDestroy(&w_q); + CeedBasisDestroy(&basis); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t361-basis.c b/tests/t361-basis.c new file mode 100644 index 0000000000..6671a39ae5 --- /dev/null +++ b/tests/t361-basis.c @@ -0,0 +1,116 @@ +/// @file +/// Test grad ApplyAdd in multiple dimensions +/// \test Test grad ApplyAdd in multiple dimensions +#include +#include +#include + +static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { + CeedScalar result = tanh(x[0] + 0.1); + if (dim > 1) result += atan(x[1] + 0.2); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); + return result; +} + +static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { + CeedScalar tol; + if (scalar_type == CEED_SCALAR_FP32) { + if (dim == 3) tol = 0.05; + else tol = 1.e-3; + } else { + tol = 1.e-10; + } + return 2.0 * tol; +} + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector x, x_q, u, u_q, ones, v; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt p = 8, q = 10, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim), x_dim = CeedIntPow(2, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / 
CeedIntPow(2, d) ? 1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, p_dim * dim, &x_q); + CeedVectorSetValue(x_q, 0); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, q_dim * dim, &u_q); + CeedVectorSetValue(u_q, 0); + CeedVectorCreate(ceed, q_dim * dim, &ones); + CeedVectorSetValue(ones, 1); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorSetValue(v, 0); + + // Get function values at quadrature points + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x_lobatto); + CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_q); + + { + const CeedScalar *x_q_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_q, CEED_MEM_HOST, &x_q_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_q_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_q, &x_q_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, u_array); + } + + // Calculate G u at quadrature points, G' * 1 at dofs + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u_gauss); + CeedBasisApply(basis_u_gauss, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, u_q); + CeedVectorScale(u_q, 2.0); + CeedBasisApply(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + CeedBasisApplyAdd(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v); + + // Check if 1' * G * u = u' * (G' * 1) + { + const CeedScalar *v_array, *u_array, *u_q_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(u_q, CEED_MEM_HOST, &u_q_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < dim * q_dim; i++) sum_2 += u_q_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u, &u_array); + 
CeedVectorRestoreArrayRead(u_q, &u_q_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %0.12f != %0.12f\n", dim, sum_1, sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_q); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_q); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x_lobatto); + CeedBasisDestroy(&basis_u_gauss); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t362-basis.c b/tests/t362-basis.c new file mode 100644 index 0000000000..bff1937d66 --- /dev/null +++ b/tests/t362-basis.c @@ -0,0 +1,59 @@ +/// @file +/// Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +/// \test Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis +#include +#include +#include + +#include "t320-basis.h" + +// main test +int main(int argc, char **argv) { + Ceed ceed; + CeedVector u, v, u_q, v_q, w_q; + const CeedInt p = 6, q = 4, dim = 2; + CeedBasis basis; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, p, &v); + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedVectorCreate(ceed, q, &u_q); + CeedVectorCreate(ceed, q, &v_q); + CeedVectorCreate(ceed, q, &w_q); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis); + + // Compute area + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q); + CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q); + CeedVectorPointwiseMult(v_q, u_q, w_q); + CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + // Double area computed + CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v); + + // Check area + { + 
+    const CeedScalar *v_array;
+    CeedScalar        area = 0.0;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < p; i++) area += v_array[i];
+    if (fabs(area - 1.0) > 1E-6) printf("Incorrect area computed %f != %f\n", area, 1.0);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_q);
+  CeedVectorDestroy(&v_q);
+  CeedVectorDestroy(&w_q);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t363-basis.c b/tests/t363-basis.c
new file mode 100644
index 0000000000..6c19f34027
--- /dev/null
+++ b/tests/t363-basis.c
+/// @file
+/// Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+  CeedScalar    column_sum[p];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, q * dim, &u);
+  CeedVectorSetValue(u, 1);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(v, 0);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+
+  // Check values at quadrature points
+  for (int i = 0; i < p; i++) {
+    column_sum[i] = 0;
+    for (int j = 0; j < q * dim; j++) {
+      column_sum[i] += grad[i + j * p];
+    }
+  }
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (int i = 0; i < p; i++) {
+      if (fabs(column_sum[i] - v_array[i] / 2.0) > 100.
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, v_array[i] / 2.0, column_sum[i]);
+    }
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t364-basis.c b/tests/t364-basis.c
new file mode 100644
index 0000000000..6ab4058d30
--- /dev/null
+++ b/tests/t364-basis.c
+/// @file
+/// Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+/// \test Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#define ALEN(a) (sizeof(a) / sizeof((a)[0]))
+
+static CeedScalar Eval(CeedScalar x, CeedInt n, const CeedScalar *c) {
+  CeedScalar y = c[n - 1];
+  for (CeedInt i = n - 2; i >= 0; i--) y = y * x + c[i];
+  return y;
+}
+
+int main(int argc, char **argv) {
+  Ceed             ceed;
+  CeedVector       x, x_nodes, x_points, x_point, u, v, u_point, v_point;
+  CeedBasis        basis_x, basis_u;
+  const CeedInt    p = 5, q = 5, num_points = 4;
+  const CeedScalar c[4] = {1, 2, 3, 4};  // 1 + 2x + 3x^2 + ...
+ + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, 2, &x); + CeedVectorCreate(ceed, p, &x_nodes); + CeedVectorCreate(ceed, num_points, &x_points); + CeedVectorCreate(ceed, 1, &x_point); + CeedVectorCreate(ceed, p, &u); + CeedVectorCreate(ceed, num_points, &v); + CeedVectorCreate(ceed, p, &u_point); + CeedVectorCreate(ceed, 1, &v_point); + CeedVectorSetValue(v_point, 1.0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[2]; + + for (CeedInt i = 0; i < 2; i++) x_array[i] = CeedIntPow(-1, i + 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p; i++) u_array[i] = Eval(x_array[i], ALEN(c), c); + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[4] = {-0.33, -0.65, 0.16, 0.99}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v); + + for (CeedInt i = 0; i < num_points; i++) { + const CeedInt num_point[1] = {1}; + CeedScalar fx = 0.0; + const CeedScalar *x_array, *u_array, *v_array, *u_point_array; + + CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array); + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorSetValue(x_point, x_array[i]); + CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point); + // Double it + 
+    CeedBasisApplyAddAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
+    for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
+    if (fabs(v_array[i] * 2.0 - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i] * 2.0, fx, x_array[i]);
+    CeedVectorRestoreArrayRead(u_point, &u_point_array);
+    CeedVectorRestoreArrayRead(x_points, &x_array);
+    CeedVectorRestoreArrayRead(u, &u_array);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_nodes);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&x_point);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_point);
+  CeedVectorDestroy(&v_point);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t365-basis.c b/tests/t365-basis.c
new file mode 100644
index 0000000000..74f93ce881
--- /dev/null
+++ b/tests/t365-basis.c
+/// @file
+/// Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+/// \test Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.005;
+    else tol = 1.e-4;
+  } else {
+    tol = 1.e-11;
+  }
+  return tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector    x, x_nodes, x_points, u, u_points, v, ones;
+    CeedBasis     basis_x, basis_u;
+    const CeedInt p = 9, q = 9, num_points = 4, x_dim = CeedIntPow(2,
dim), p_dim = CeedIntPow(p, dim); + CeedScalar sum_1 = 0, sum_2 = 0; + + CeedVectorCreate(ceed, x_dim * dim, &x); + CeedVectorCreate(ceed, p_dim * dim, &x_nodes); + CeedVectorCreate(ceed, num_points * dim, &x_points); + CeedVectorCreate(ceed, p_dim, &u); + CeedVectorCreate(ceed, num_points * dim, &u_points); + CeedVectorCreate(ceed, p_dim, &v); + CeedVectorCreate(ceed, num_points * dim, &ones); + + CeedVectorSetValue(ones, 1); + CeedVectorSetValue(v, 0); + + // Get nodal coordinates + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x); + { + CeedScalar x_array[x_dim * dim]; + + for (CeedInt d = 0; d < dim; d++) { + for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1; + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes); + + // Set values of u at nodes + { + const CeedScalar *x_array; + CeedScalar u_array[p_dim]; + + CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array); + for (CeedInt i = 0; i < p_dim; i++) { + CeedScalar coord[dim]; + + for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_dim + i]; + u_array[i] = Eval(dim, coord); + } + CeedVectorRestoreArrayRead(x_nodes, &x_array); + CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array); + } + + // Interpolate to arbitrary points + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + { + CeedScalar x_array[12] = {-0.33, -0.65, 0.16, 0.99, -0.65, 0.16, 0.99, -0.33, 0.16, 0.99, -0.33, -0.65}; + + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + + // Calculate G u at arbitrary points, G' * 1 at dofs + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points); + CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + // Double it + 
CeedBasisApplyAddAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v); + { + const CeedScalar *u_array, *v_array, *u_points_array; + + CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array); + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + CeedVectorGetArrayRead(u_points, CEED_MEM_HOST, &u_points_array); + for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i]; + for (CeedInt i = 0; i < num_points * dim; i++) sum_2 += u_points_array[i]; + CeedVectorRestoreArrayRead(u, &u_array); + CeedVectorRestoreArrayRead(v, &v_array); + CeedVectorRestoreArrayRead(u_points, &u_points_array); + } + { + CeedScalarType scalar_type; + + CeedGetScalarType(&scalar_type); + + CeedScalar tol = GetTolerance(scalar_type, dim); + + if (fabs(sum_1 - 2.0 * sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, 2.0 * sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_nodes); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&u); + CeedVectorDestroy(&u_points); + CeedVectorDestroy(&ones); + CeedVectorDestroy(&v); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + } + CeedDestroy(&ceed); + return 0; +}