@@ -44,6 +44,7 @@ Julian Oppermann, Codeplay +
4444Ewan Crawford, Codeplay +
4545Ben Tracy, Codeplay +
4646John Pennycook, Intel +
47+ Greg Lueck, Intel +
4748
4849== Dependencies
4950
@@ -192,8 +193,9 @@ By adding the `insert_barriers` property, a _work-group barrier_ will be
192193inserted between the kernels. To achieve a device-wide synchronization, i.e.,
193194a synchronization between different work-groups that is implicit between two
194195kernels when executed separately, users should leverage the subgraph feature of
195- the SYCL graph proposal. By creating two subgraphs, fusing each and adding both
196- to the same graph, a device-wide synchronization between two fused parts can be
196+ the SYCL graph proposal, as device-wide synchronization inside the fused kernel
197+ is not achievable. By creating two subgraphs, fusing each and adding both to
198+ the same graph, a device-wide synchronization between two fused parts can be
197199achieved if necessary.
198200====
199201
@@ -233,9 +235,11 @@ the property is specified on an accessor) or in any kernel in the graph (in case
233235the property is specified on a buffer or an USM pointer).
234236
235237More concretely, the two shortcuts express the following semantics:
238+
236239* `access_scope_work_group`: Applying this specialization asserts that each
237240element in the buffer or allocated device memory is accessed by no more than one
238241work-group.
242+
239243* `access_scope_work_item`: Applying this specialization asserts that each
240244element in the buffer or allocated device memory is accessed by no more than one
241245work-item.
@@ -320,7 +324,7 @@ in the sequence before the command itself.
320324
321325The exact linearization of the dependency DAG (which generally only implies a
322326partial order) is implementation defined. The linearization should be
323- deterministic, i.e. it should yield the same sequence when presented with the
327+ deterministic, i.e., it should yield the same sequence when presented with the
324328same DAG.
325329
326330=== Synchronization in kernels
@@ -336,13 +340,13 @@ barrier can added between each of the kernels being fused by applying the
336340As the fusion compiler can reason about the access behavior of the different
337341kernels only in a very limited fashion, **it's the user's responsibility to
338342make sure no data races occur in the fused kernel**. Data races could in
339- particular be introduced because the implicit inter-work-group synchronization
343+ particular be introduced because the implicit device-wide synchronization
340344between the execution of two separate kernels is eliminated by fusion. The user
341345must ensure that the kernels combined during fusion do not rely on this
342346synchronization or introduce appropriate synchronization.
343347
344348Device-wide synchronization can be achieved by splitting the graph into multiple
345- subgraphs and fusing each separately, as decribed above.
349+ subgraphs and fusing each separately, as described above.
346350
347351=== Limitations
348352
@@ -421,7 +425,7 @@ To achieve this result during fusion, a fusion compiler must establish some
421425additional context and information.
422426
423427First, the compiler must know that two arguments refer to the same underlying
424- memory. This is possible during runtime, so no additional user input is
428+ memory. This can be inferred during runtime, so no additional user input is
425429required.
426430
427431For the remaining information that needs to be established, the necessity of
@@ -456,10 +460,12 @@ must be provided by the user by applying the `fusion_internal_memory` property
456460to the buffer or allocated device memory as described above.
457461
458462The type of memory that can be used for internalization depends on the memory
459- access pattern of the fuses kernel. Depending on the access pattern, the buffer
463+ access pattern of the fused kernel. Depending on the access pattern, the buffer
460464or allocated device memory can be classified as:
465+
461466* _Privately internalizable_: If not a single element of the buffer/memory is to
462467 be accessed by more than one work-item;
468+
463469* _Locally internalizable_: If not a single element of the buffer/memory is to
464470 be accessed by work items of different work groups.
465471
@@ -483,10 +489,10 @@ dataflow internalization. Implementations should document the necessary
483489properties required to enable internalization in implementation documentation.
484490
485491All internalization-related properties are only _descriptive_, so it is not an
486- error if an implementation is unable to perform internalization based on the
487- specified properties. Implementations can provide a diagnostic message in case
488- the set of specified properties are not sufficient to perform internalization,
489- but are not required to do so.
492+ error if an implementation is unable to or for other reasons decides not to
493+ perform internalization based on the specified properties. Implementations can
494+ provide a diagnostic message in case the set of specified properties are not
495+ sufficient to perform internalization, but are not required to do so.
490496
491497[NOTE]
492498====
@@ -575,87 +581,86 @@ internalization is performed.
575581```c++
576582#include <sycl/sycl.hpp>
577583
578- using namespace sycl;
584+ namespace sycl_ext = sycl::ext::oneapi::experimental ;
579585
580586struct AddKernel {
581- accessor<int, 1> accIn1;
582- accessor<int, 1> accIn2;
583- accessor<int, 1> accOut;
587+ sycl:: accessor<int, 1> accIn1;
588+ sycl:: accessor<int, 1> accIn2;
589+ sycl:: accessor<int, 1> accOut;
584590
585- void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
591+ void operator()(sycl:: id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
586592};
587593
588594int main() {
589595 constexpr size_t dataSize = 512;
590596 int in1[dataSize], in2[dataSize], in3[dataSize], out[dataSize];
591597
592- queue q{default_selector_v};
598+ sycl:: queue q{default_selector_v};
593599
594600 {
595- buffer<int> bIn1{in1, range{dataSize}};
601+ sycl:: buffer<int> bIn1{in1, sycl:: range{dataSize}};
596602 bIn1.set_write_back(false);
597- buffer<int> bIn2{in2, range{dataSize}};
603+ sycl:: buffer<int> bIn2{in2, sycl:: range{dataSize}};
598604 bIn2.set_write_back(false);
599- buffer<int> bIn3{in3, range{dataSize}};
605+ sycl:: buffer<int> bIn3{in3, sycl:: range{dataSize}};
600606 bIn3.set_write_back(false);
601607 buffer<int> bTmp1{range{dataSize}};
602608 // Internalization specified on the buffer
603- buffer<int> bTmp2{
604- range{dataSize},
605- {sycl::ext::oneapi::experimental ::property::access_scope_work_item{},
606- sycl::ext::oneapi::experimental ::property::fusion_internal_memory{},
607- no_init}};
609+ sycl:: buffer<int> bTmp2{
610+ sycl:: range{dataSize},
611+ {sycl_ext ::property::access_scope_work_item{},
612+ sycl_ext ::property::fusion_internal_memory{},
613+ sycl:: no_init}};
608614 // Internalization specified on the buffer
609- buffer<int> bTmp3{
610- range{dataSize},
611- {sycl::ext::oneapi::experimental ::property::access_scope_work_item{},
612- sycl::ext::oneapi::experimental ::property::fusion_internal_memory{},
613- no_init}};
614- buffer<int> bOut{out, range{dataSize}};
615+ sycl:: buffer<int> bTmp3{
616+ sycl:: range{dataSize},
617+ {sycl_ext ::property::access_scope_work_item{},
618+ sycl_ext ::property::fusion_internal_memory{},
619+ sycl:: no_init}};
620+ sycl:: buffer<int> bOut{out, sycl:: range{dataSize}};
615621 bOut.set_write_back(false);
616622
617- ext::oneapi::experimental ::command_graph graph{
623+ sycl_ext ::command_graph graph{
618624 q.get_context(), q.get_device(),
619- sycl::ext::oneapi::experimental::property::graph::
620- assume_buffer_outlives_graph{}};
625+ sycl_ext::property::graph::assume_buffer_outlives_graph{}};
621626
622627 graph.begin_recording(q);
623628
624- q.submit([&](handler &cgh) {
629+ q.submit([&](sycl:: handler &cgh) {
625630 auto accIn1 = bIn1.get_access(cgh);
626631 auto accIn2 = bIn2.get_access(cgh);
627632 // Internalization specified on each accessor.
628633 auto accTmp1 = bTmp1.get_access(cgh,
629- sycl::ext::oneapi::experimental ::property::access_scope_work_item{}
630- sycl::ext::oneapi::experimental ::property::fusion_internal_memory{},
631- no_init);
634+ sycl_ext ::property::access_scope_work_item{}
635+ sycl_ext ::property::fusion_internal_memory{},
636+ sycl:: no_init);
632637 cgh.parallel_for<AddKernel>(dataSize, AddKernel{accIn1, accIn2, accTmp1});
633638 });
634639
635- q.submit([&](handler &cgh) {
640+ q.submit([&](sycl:: handler &cgh) {
636641 // Internalization specified on each accessor.
637642 auto accTmp1 = bTmp1.get_access(cgh,
638- sycl::ext::oneapi::experimental ::property::access_scope_work_item{}
639- sycl::ext::oneapi::experimental ::property::fusion_internal_memory{},
640- no_init);
643+ sycl_ext ::property::access_scope_work_item{}
644+ sycl_ext ::property::fusion_internal_memory{},
645+ sycl:: no_init);
641646 auto accIn3 = bIn3.get_access(cgh);
642647 auto accTmp2 = bTmp2.get_access(cgh);
643648 cgh.parallel_for<class KernelOne>(
644- dataSize, [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
649+ dataSize, [=](sycl:: id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
645650 });
646651
647- q.submit([&](handler &cgh) {
652+ q.submit([&](sycl:: handler &cgh) {
648653 // Internalization specified on each accessor.
649654 auto accTmp1 = bTmp1.get_access(cgh,
650- sycl::ext::oneapi::experimental ::property::access_scope_work_item{}
651- sycl::ext::oneapi::experimental ::property::fusion_internal_memory{},
652- no_init);
655+ sycl_ext ::property::access_scope_work_item{}
656+ sycl_ext ::property::fusion_internal_memory{},
657+ sycl:: no_init);
653658 auto accTmp3 = bTmp3.get_access(cgh);
654659 cgh.parallel_for<class KernelTwo>(
655- dataSize, [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
660+ dataSize, [=](sycl:: id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
656661 });
657662
658- q.submit([&](handler &cgh) {
663+ q.submit([&](sycl:: handler &cgh) {
659664 auto accTmp2 = bTmp2.get_access(cgh);
660665 auto accTmp3 = bTmp3.get_access(cgh);
661666 auto accOut = bOut.get_access(cgh);
@@ -667,8 +672,7 @@ int main() {
667672
668673 // Trigger fusion during finalization.
669674 auto exec_graph =
670- graph.finalize({sycl::ext::oneapi::experimental::property::
671- graph::require_fusion{}});
675+ graph.finalize({sycl_ext::property::graph::require_fusion{}});
672676
673677 q.ext_oneapi_graph(exec_graph);
674678
@@ -683,8 +687,6 @@ int main() {
683687```c++
684688#include <sycl/sycl.hpp>
685689
686- using namespace sycl;
687-
688690namespace sycl_ext = sycl::ext::oneapi::experimental;
689691
690692int main() {
@@ -693,56 +695,56 @@ int main() {
693695
694696 int in1[dataSize], in2[dataSize], in3[dataSize], out[dataSize];
695697
696- queue q{default_selector_v};
698+ sycl:: queue q{default_selector_v};
697699
698700 sycl_ext::command_graph graph{q.get_context(), q.get_device()};
699701
700702 int *dIn1, dIn2, dIn3, dTmp, dOut;
701703
702- dIn1 = malloc_device<int>(q, dataSize);
703- dIn2 = malloc_device<int>(q, dataSize);
704- dIn3 = malloc_device<int>(q, dataSize);
705- dOut = malloc_device<int>(q, dataSize);
704+ dIn1 = sycl:: malloc_device<int>(q, dataSize);
705+ dIn2 = sycl:: malloc_device<int>(q, dataSize);
706+ dIn3 = sycl:: malloc_device<int>(q, dataSize);
707+ dOut = sycl:: malloc_device<int>(q, dataSize);
706708
707- // Specify internalization for an USM pointer
708- dTmp = malloc_device<int>(q, dataSize)
709+ // Specify internalization to local memory for an USM pointer
710+ dTmp = sycl:: malloc_device<int>(q, dataSize)
709711 auto annotatedTmp = sycl_ext::annotated_ptr(
710- dTmp, sycl_ext::property::access_scope_work_item {},
712+ dTmp, sycl_ext::property::access_scope_work_group {},
711713 sycl_ext::property::fusion_internal_memory{}, no_init);
712714
713715 // This explicit memory operation is compatible with fusion, as it can be
714716 // linearized before any device kernel in the graph.
715717 auto copy_in1 =
716- graph.add([&](handler &cgh) { cgh.memcpy(dIn1, in1, numBytes); });
718+ graph.add([&](sycl:: handler &cgh) { cgh.memcpy(dIn1, in1, numBytes); });
717719
718720 // This explicit memory operation is compatible with fusion, as it can be
719721 // linearized before any device kernel in the graph.
720722 auto copy_in2 =
721- graph.add([&](handler &cgh) { cgh.memcpy(dIn2, in2, numBytes); });
723+ graph.add([&](sycl:: handler &cgh) { cgh.memcpy(dIn2, in2, numBytes); });
722724
723725 auto kernel1 = graph.add(
724- [&](handler &cgh) {
726+ [&](sycl:: handler &cgh) {
725727 cgh.parallel_for<class KernelOne>(
726- dataSize, [=](id<1> i) { annotatedTmp[i] = in1[i] + in2[i]; });
728+ dataSize, [=](sycl:: id<1> i) { annotatedTmp[i] = in1[i] + in2[i]; });
727729 },
728730 {sycl_ext::property::node::depends_on(copy_in1, copy_in2)});
729731
730732 // This explicit memory operation is compatible with fusion, as it can be
731733 // linearized before any device kernel in the graph.
732734 auto copy_in3 =
733- graph.add([&](handler &cgh) { cgh.memcpy(dIn3, in3, numBytes); });
735+ graph.add([&](sycl:: handler &cgh) { cgh.memcpy(dIn3, in3, numBytes); });
734736
735737 auto kernel2 = graph.add(
736- [&](handler &cgh) {
738+ [&](sycl:: handler &cgh) {
737739 cgh.parallel_for<class KernelTwo>(
738- dataSize, [=](id<1> i) { out[i] = annotatedTmp[i] * in3[i]; });
740+ dataSize, [=](sycl:: id<1> i) { out[i] = annotatedTmp[i] * in3[i]; });
739741 },
740742 {sycl_ext::property::node::depends_on(copy_in3, kernel1)});
741743
742744 // This explicit memory operation is compatible with fusion, as it can be
743745 // linearized after any device kernel in the graph.
744746 auto copy_out =
745- graph.add([&](handler &cgh) { cgh.memcpy(out, dOut, numBytes); },
747+ graph.add([&](sycl:: handler &cgh) { cgh.memcpy(out, dOut, numBytes); },
746748 {sycl_ext::property::node::depends_on(kernel2)});
747749
748750 // Trigger fusion during finalization.
@@ -751,11 +753,11 @@ int main() {
751753 // use queue shortcut for graph submission
752754 q.ext_oneapi_graph(exec).wait();
753755
754- free(dIn1, q);
755- free(dIn2, q);
756- free(dIn3, q);
757- free(dOut, q);
758- free(dTmp, q);
756+ sycl:: free(dIn1, q);
757+ sycl:: free(dIn2, q);
758+ sycl:: free(dIn3, q);
759+ sycl:: free(dOut, q);
760+ sycl:: free(dTmp, q);
759761
760762 return 0;
761763}
0 commit comments