Efficiently use in-order and out-of-order queues in the SYCL backends (CEED#1456)

* Efficiently use both in-order and out-of-order SYCL queues. (#59)

* Default to in-order queues.

* Only order the queue when necessary.

* Use the same queue recursively through a hierarchy of Ceed objects.

* Add missing changes from the previous git cherry-pick.

* Fix formatting errors.

---------

Co-authored-by: Kris Rowe <[email protected]>

uumesh and kris-rowe authored Apr 4, 2024
1 parent 76f753b commit 1f4b1b4

Showing 12 changed files with 204 additions and 170 deletions.
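
Note on the pattern applied throughout this commit: the queue is queried once, and a barrier is submitted only when the queue is out of order, since an in-order queue already executes submissions sequentially. Below is a minimal, self-contained sketch of that pattern; `ordering_events` is a hypothetical helper name, not part of the libCEED sources, and the `fill` call stands in for the backend's copies and kernel launches.

```cpp
#include <sycl/sycl.hpp>
#include <vector>

// Dependency list for the next submission on q.
// In-order queue: empty vector (submissions already execute in order).
// Out-of-order queue: one barrier event that depends on all prior work.
static std::vector<sycl::event> ordering_events(sycl::queue &q) {
  std::vector<sycl::event> e;
  if (!q.is_in_order()) e = {q.ext_oneapi_submit_barrier()};
  return e;
}

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};  // the new default in this commit
  double *d_v = sycl::malloc_device<double>(64, q);

  std::vector<sycl::event> e = ordering_events(q);  // empty here: q is in-order
  // The dependency-vector overloads accept an empty vector as "no dependencies",
  // so the same call site serves both queue types.
  q.fill<double>(d_v, 0.0, 64, e).wait();

  sycl::free(d_v, q);
  return 0;
}
```

Because every dependency-taking overload treats an empty `std::vector<sycl::event>` as "no dependencies", the call sites stay uniform, and on the default in-order path no barrier is ever paid.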
10 changes: 7 additions & 3 deletions backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -741,9 +741,13 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   code << "// -----------------------------------------------------------------------------\n\n";
 
   // Copy the struct (containing device addresses) from the host to the device
-  sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
-  sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
-  sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
+  std::vector<sycl::event> e;
+
+  if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1, e);
+  sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1, e);
+  sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1, e);
   // These copies can happen while the JIT is being done
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
 
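
The retained comment above ("These copies can happen while the JIT is being done") is why the copies return events instead of blocking: the host launches the transfers, proceeds to the expensive kernel JIT, and waits only afterwards. A generic sketch of that overlap, assuming USM device allocations as in the backend:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};
  std::vector<double> h(1 << 20, 1.0);
  double *d = sycl::malloc_device<double>(h.size(), q);

  // Asynchronous host-to-device copy: returns an event immediately.
  sycl::event copy_done = q.copy<double>(h.data(), d, h.size());

  // ... host-side work runs concurrently with the transfer
  // (in the backend: JIT compilation of the operator kernel) ...

  // Block, and surface asynchronous errors, only when the data is needed.
  sycl::event::wait_and_throw({copy_done});

  sycl::free(d, q);
  return 0;
}
```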
5 changes: 3 additions & 2 deletions backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -136,8 +136,9 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   sycl::nd_range<3> kernel_range(global_range, local_range);
 
   //-----------
-  // Order queue
-  sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
 
   CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
     cgh.depends_on(e);
10 changes: 5 additions & 5 deletions backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -33,15 +33,15 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit_Sycl(ceed, resource));
 
   CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));
-
-  CeedCallBackend(CeedGetData(ceed_shared, &shared_data));
-  // Need to use the same queue everywhere for correct synchronization
-  shared_data->sycl_queue = data->sycl_queue;
-
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
+  CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue)));
 
   CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
 
+  Ceed ceed_fallback = NULL;
+  CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
+  CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, &(data->sycl_queue)));
+
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Sycl));
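
This hunk replaces the direct `shared_data->sycl_queue` assignment with `CeedSetStream_Sycl` calls so the `/gpu/sycl/shared` delegate and the operator-fallback Ceed reuse the parent's queue. The motivation: commands on two distinct queues are unordered with respect to each other even when each queue is individually in-order, so a hierarchy of Ceed objects with private queues would need explicit cross-queue events. A hypothetical two-queue fragment illustrating the hazard a shared queue avoids:

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::device dev{sycl::default_selector_v};
  sycl::context ctx{dev};
  // Two in-order queues on the same device and context.
  sycl::queue q1{ctx, dev, sycl::property::queue::in_order{}};
  sycl::queue q2{ctx, dev, sycl::property::queue::in_order{}};

  int *d = sycl::malloc_device<int>(1, q1);
  q1.fill<int>(d, 42, 1);  // enqueued on q1, not necessarily complete

  // Race: q2 has no implicit dependency on q1's fill. Without an explicit
  // event dependency (or a single shared queue), this copy may read stale data.
  int h = 0;
  q2.copy<int>(d, &h, 1).wait();

  sycl::free(d, ctx);
  return 0;
}
```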
77 changes: 46 additions & 31 deletions backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -49,10 +49,12 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
   sycl::range<1> global_range(num_elem * work_group_size);
   sycl::nd_range<1> kernel_range(global_range, local_range);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
   sycl_queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on({e});
+    cgh.depends_on(e);
     cgh.use_kernel_bundle(sycl_module);
 
     sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
@@ -150,10 +152,12 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
   sycl::range<1> global_range(num_elem * work_group_size);
   sycl::nd_range<1> kernel_range(global_range, local_range);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
   sycl_queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on({e});
+    cgh.depends_on(e);
     cgh.use_kernel_bundle(sycl_module);
 
     sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
@@ -248,9 +252,11 @@ static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem,
   const CeedInt num_quad_z = (dim > 2) ? Q_1d : 1;
   sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) {
     if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
     if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
     if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
@@ -283,9 +289,10 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
   if (is_transpose) {
     CeedSize length;
     CeedCallBackend(CeedVectorGetLength(v, &length));
-    // Order queue
-    sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
-    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
+    std::vector<sycl::event> e;
+
+    if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
+    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e);
   }
 
   // Basis action
@@ -343,9 +350,11 @@ static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt n
 
   sycl::range<2> kernel_range(num_elem, v_size);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) {
     const CeedInt i = indx[1];
     const CeedInt elem = indx[0];
 
@@ -387,9 +396,11 @@ static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num
 
   sycl::range<2> kernel_range(num_elem, v_size);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) {
     const CeedInt i = indx[1];
     const CeedInt elem = indx[0];
 
@@ -425,9 +436,11 @@ static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt n
 
   sycl::range<2> kernel_range(num_elem, num_qpts);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) {
     const CeedInt i = indx[1];
     const CeedInt elem = indx[0];
     d_V[i + elem * num_qpts] = q_weight[i];
@@ -564,18 +577,19 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts);
   impl->op_len = Q_1d * P_1d;
 
-  // Order queue
-  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
   CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
+  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
+  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
+  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
 
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
 
@@ -618,19 +632,20 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   impl->num_nodes = num_nodes;
   impl->num_qpts = num_qpts;
 
-  // Order queue
-  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
   CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
+  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
 
   const CeedInt interp_length = num_qpts * num_nodes;
   CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
+  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);
 
   const CeedInt grad_length = num_qpts * num_nodes * dim;
   CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
+  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);
 
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
 
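
Both basis constructors above now follow the same shape: build the (possibly empty) ordering-event list once, launch all host-to-device copies against it, and block a single time on the resulting events. A condensed sketch with hypothetical host pointers and sizes:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

// Sketch of the constructor pattern (hypothetical data and sizes).
void upload_basis_data(sycl::queue &q, const double *h_weight, const double *h_interp,
                       const double *h_grad, size_t n_weight, size_t n_mat) {
  std::vector<sycl::event> e;
  if (!q.is_in_order()) e = {q.ext_oneapi_submit_barrier()};

  // In the backend these pointers are stashed on the impl struct.
  double *d_weight = sycl::malloc_device<double>(n_weight, q);
  double *d_interp = sycl::malloc_device<double>(n_mat, q);
  double *d_grad = sycl::malloc_device<double>(n_mat, q);

  sycl::event copy_weight = q.copy<double>(h_weight, d_weight, n_weight, e);
  sycl::event copy_interp = q.copy<double>(h_interp, d_interp, n_mat, e);
  sycl::event copy_grad = q.copy<double>(h_grad, d_grad, n_mat, e);

  // One blocking wait for all three copies; throws if any copy failed
  // asynchronously (mirrors the CeedCallSycl error check).
  sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad});
}
```

On an out-of-order queue the three copies may overlap once the barrier's dependencies resolve; on an in-order queue they serialize, a trade the commit accepts in exchange for never paying barrier overhead on the default path.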
63 changes: 36 additions & 27 deletions backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -749,47 +749,49 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || (e_mode_in[i] == CEED_EVAL_NONE);
   for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE);
 
-  // Order queue
-  sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+
   std::vector<sycl::event> copy_events;
 
   if (has_eval_none) {
     CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity));
     for (CeedSize i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0;
     CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context));
-    sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, i_len, {e});
+    sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, i_len, e);
     copy_events.push_back(identity_copy);
   }
 
   // CEED_EVAL_INTERP
   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
   CeedCallSycl(ceed, diag->d_interp_in = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event interp_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_in, diag->d_interp_in, i_len, {e});
+  sycl::event interp_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_in, diag->d_interp_in, i_len, e);
   copy_events.push_back(interp_in_copy);
 
   CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out));
   CeedCallSycl(ceed, diag->d_interp_out = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event interp_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_out, diag->d_interp_out, i_len, {e});
+  sycl::event interp_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_out, diag->d_interp_out, i_len, e);
   copy_events.push_back(interp_out_copy);
 
   // CEED_EVAL_GRAD
   CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in));
   CeedCallSycl(ceed, diag->d_grad_in = sycl::malloc_device<CeedScalar>(g_len, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event grad_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_in, diag->d_grad_in, g_len, {e});
+  sycl::event grad_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_in, diag->d_grad_in, g_len, e);
   copy_events.push_back(grad_in_copy);
 
   CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out));
   CeedCallSycl(ceed, diag->d_grad_out = sycl::malloc_device<CeedScalar>(g_len, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event grad_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_out, diag->d_grad_out, g_len, {e});
+  sycl::event grad_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_out, diag->d_grad_out, g_len, e);
   copy_events.push_back(grad_out_copy);
 
   // Arrays of e_modes
   CeedCallSycl(ceed, diag->d_e_mode_in = sycl::malloc_device<CeedEvalMode>(num_e_mode_in, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_in, diag->d_e_mode_in, num_e_mode_in, {e});
+  sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_in, diag->d_e_mode_in, num_e_mode_in, e);
   copy_events.push_back(e_mode_in_copy);
 
   CeedCallSycl(ceed, diag->d_e_mode_out = sycl::malloc_device<CeedEvalMode>(num_e_mode_out, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_out, diag->d_e_mode_out, num_e_mode_out, {e});
+  sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_out, diag->d_e_mode_out, num_e_mode_out, e);
   copy_events.push_back(e_mode_out_copy);
 
   // Restriction
@@ -820,9 +822,11 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 
   sycl::range<1> kernel_range(num_elem * num_nodes);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, {e}, [=](sycl::id<1> idx) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+
+  sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, e, [=](sycl::id<1> idx) {
     const CeedInt tid = idx % num_nodes;
     const CeedInt e = idx / num_nodes;
 
@@ -1078,14 +1082,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
     CeedEvalMode eval_mode = eval_mode_in[i];
 
     if (eval_mode == CEED_EVAL_INTERP) {
-      // Order queue
-      sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
-      sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], elem_size * num_qpts, {e});
+      std::vector<sycl::event> e;
+
+      if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+      sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], elem_size * num_qpts, e);
      mat_start += elem_size * num_qpts;
    } else if (eval_mode == CEED_EVAL_GRAD) {
-      // Order queue
-      sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
-      sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, {e});
+      std::vector<sycl::event> e;
+
+      if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+      sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, e);
       mat_start += dim * elem_size * num_qpts;
     }
   }
@@ -1108,14 +1114,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
     CeedEvalMode eval_mode = eval_mode_out[i];
 
     if (eval_mode == CEED_EVAL_INTERP) {
-      // Order queue
-      sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
-      sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], elem_size * num_qpts, {e});
+      std::vector<sycl::event> e;
+
+      if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+      sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], elem_size * num_qpts, e);
       mat_start += elem_size * num_qpts;
     } else if (eval_mode == CEED_EVAL_GRAD) {
-      // Order queue
-      sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
-      sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * elem_size * num_qpts, {e});
+      std::vector<sycl::event> e;
+
+      if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
+      sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * elem_size * num_qpts, e);
       mat_start += dim * elem_size * num_qpts;
     }
   }
@@ -1157,9 +1165,10 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
 
   sycl::range<3> kernel_range(num_elem, block_size_y, block_size_x);
 
-  // Order queue
-  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
-  sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, {e}, [=](sycl::id<3> idx) {
+  std::vector<sycl::event> e;
+
+  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
+  sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, e, [=](sycl::id<3> idx) {
     const int e = idx.get(0);  // Element index
     const int l = idx.get(1);  // The output column index of each B^TDB operation
     const int i = idx.get(2);  // The output row index of each B^TDB operation
7 changes: 4 additions & 3 deletions backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -58,12 +58,13 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
   // Get context data
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data));
 
-  // Order queue
-  sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
+  std::vector<sycl::event> e;
+
+  if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
 
   // Launch as a basic parallel_for over Q quadrature points
   ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on({e});
+    cgh.depends_on(e);
 
     int iarg{};
     cgh.set_arg(iarg, context_data);
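
The depends_on change in this hunk relies on handler::depends_on having an overload that takes a std::vector<sycl::event>, where an empty vector adds no dependency edges. A sketch of the submission shape, with a plain parallel_for standing in for the backend's set_arg-based launch of the JIT-compiled kernel:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};
  constexpr size_t Q = 128;  // stand-in for the number of quadrature points
  double *d_u = sycl::malloc_device<double>(Q, q);

  std::vector<sycl::event> e;
  if (!q.is_in_order()) e = {q.ext_oneapi_submit_barrier()};

  q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(e);  // empty vector on the in-order path: no edges added
    cgh.parallel_for(sycl::range<1>{Q}, [=](sycl::id<1> i) {
      const size_t idx = i[0];
      d_u[idx] = 2.0 * static_cast<double>(idx);
    });
  });
  q.wait_and_throw();

  sycl::free(d_u, q);
  return 0;
}
```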