@@ -55,6 +55,13 @@ This extension builds on top of the experimental SYCL graphs
5555https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc[extension
5656proposal]. All references to the "graphs proposal" refer to this proposal.
5757
58+ In addition, this extension also depends on the following other SYCL extensions:
59+
60+ * link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties]
61+ extension.
62+ * link:../experimental/sycl_ext_oneapi_annotated_ptr.asciidoc[sycl_ext_oneapi_annotated_ptr]
63+ extension.
64+
5865== Status
5966
6067This is a proposed extension specification, intended to gather community
@@ -198,9 +205,8 @@ different APIs, namely:
198205* The `accessor` constructor, giving a more granular control.
199206* The `buffer` constructor, in which case all the `accessors` derived from
200207this buffer will inherit this property (unless overridden).
201- * The `property_list` parameter of `sycl::malloc_device()`,
202- `sycl::aligned_alloc_device()`, `sycl::malloc_shared()`, or
203- `sycl::aligned_alloc_shared()` to apply the property to an USM pointer.
208+ * The property list parameter of `annotated_ptr`, to apply the property to a
209+ USM pointer.
204210
205211```c++
206212namespace sycl::ext::oneapi::experimental::property{
@@ -248,16 +254,22 @@ Implementations can provide a diagnostic message in case internalization was
248254not performed through an implementation-specified mechanism, but are not
249255required to do so.
250256
257+ In case the `access_scope` property is attached to `annotated_ptr`, the
258+ properties should be inspected by an implementation when the `annotated_ptr` is
259+ captured by a kernel lambda or otherwise passed as an argument to a kernel
260+ function. Implementations are not required to track internalization-related
261+ information from other USM pointers that may be used by a kernel, such as those
262+ stored inside of structs or other data structures.
263+
251264===== Internal memory property
252265
253266The following property can be passed to three different APIs, namely:
254267
255268* The `accessor` constructor, giving a more granular control.
256269* The `buffer` constructor, in which case all the `accessors` derived from
257270this buffer will inherit this property (unless overridden).
258- * The `property_list` parameter of `sycl::malloc_device()`,
259- `sycl::aligned_alloc_device()`, `sycl::malloc_shared()`, or
260- `sycl::aligned_alloc_shared()` to apply the property to an USM pointer.
271+ * The property list parameter of `annotated_ptr`, to apply the property to a
272+ USM pointer.
261273
262274```c++
263275sycl::ext::oneapi::experimental::property::fusion_internal_memory
@@ -277,6 +289,14 @@ Implementations can provide a diagnostic message in case internalization was
277289not performed through an implementation-specified mechanism, but are not
278290required to do so.
279291
292+ In case the `fusion_internal_memory` property is attached to `annotated_ptr`,
293+ the properties should be inspected by an implementation when the
294+ `annotated_ptr` is captured by a kernel lambda or otherwise passed as an
295+ argument to a kernel function. Implementations are not required to track
296+ internalization-related information from other USM pointers that may be used by
297+ a kernel, such as those stored inside of structs or other data structures.
298+
299+
280300==== Device aspect
281301
282302To support querying whether a SYCL device and the underlying platform support
@@ -418,9 +438,13 @@ https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_propertie
418438
419439To this end, this extension allows the use of the property in more places than
420440defined in Table 52 in the SYCL specification. More concretely, this extension
421- allows to use the property in the buffer constructor or the `property_list`
422- parameter of `sycl::malloc_device()`, `sycl::aligned_alloc_device()`,
423- `sycl::malloc_shared()` and `sycl::aligned_alloc_shared()`.
441+ allows to use the property in the buffer constructor or the property list
442+ parameter of `annotated_ptr<...>`. In case the `no_init` property is attached to
443+ `annotated_ptr`, the properties should be inspected by an implementation when
444+ the `annotated_ptr` is captured by a kernel lambda or otherwise passed as an
445+ argument to a kernel function. Implementations are not required to track
446+ internalization-related information from other USM pointers that may be used by
447+ a kernel, such as those stored inside of structs or other data structures.
424448
425449If the implementation's fusion compiler is not able to guarantee write-back of
426450the final result after internalization, values stored to an internalized
@@ -445,6 +469,14 @@ specializations of the `access_scope` property template defined in this
445469proposal can be used to inform the fusion compiler about the access pattern of
446470the kernels involved in fusion.
447471
472+ If an `annotated_ptr` is created with any of the properties relating to
473+ internalization and captured by a kernel lambda or otherwise passed as an
474+ argument to a kernel function participating in fusion, the underlying memory
475+ must only be accessed via pointers that are also captured or passed as kernel
476+ argument. Access to the underlying memory via a different pointer, such as
477+ pointers stored inside of structs or other data structures results in undefined
478+ behavior.
479+
448480As already stated above, it depends on the implementation's capabilities which
449481properties need to be applied to a buffer or allocated device memory to enable
450482dataflow internalization. Implementations should document the necessary
@@ -506,11 +538,12 @@ properties must be combined as follows:
506538|===
507539
508540In case different internalization targets are used for accessors to the same
509- buffer, the following (commutative and associative) rules are followed:
541+ buffer or for `annotated_ptr` pointing to the same underlying memory, the
542+ following (commutative and associative) rules are followed:
510543
511544[options="header"]
512545|===
513- |Accessor~1~ Access Scope|Accessor~2~ Access Scope|Resulting Access Scope
546+ |Accessor/Ptr ~1~ Access Scope|Accessor/Ptr ~2~ Access Scope|Resulting Access Scope
514547
515548|None
516549|_Any_
@@ -528,7 +561,7 @@ buffer, the following (commutative and associative) rules are followed:
528561|Work Item
529562|===
530563
531- If no work-group size is specified or two accessors specify different
564+ If no work-group size is specified or two kernels specify different
532565work-group sizes when attempting local internalization for any of the
533566kernels involved in the fusion, no internalization will be
534567performed. If there is a mismatch between the two accessors (access
@@ -672,10 +705,10 @@ int main() {
672705 dOut = malloc_device<int>(q, dataSize);
673706
674707 // Specify internalization for an USM pointer
675- dTmp = malloc_device<int>(
676- q, dataSize,
677- { sycl_ext::property::access_scope_work_item{},
678- sycl_ext::property::fusion_internal_memory{}, no_init} );
708+ dTmp = malloc_device<int>(q, dataSize)
709+ auto annotatedTmp = sycl_ext::annotated_ptr(
710+ dTmp, sycl_ext::property::access_scope_work_item{},
711+ sycl_ext::property::fusion_internal_memory{}, no_init);
679712
680713 // This explicit memory operation is compatible with fusion, as it can be
681714 // linearized before any device kernel in the graph.
@@ -690,7 +723,7 @@ int main() {
690723 auto kernel1 = graph.add(
691724 [&](handler &cgh) {
692725 cgh.parallel_for<class KernelOne>(
693- dataSize, [=](id<1> i) { tmp [i] = in1[i] + in2[i]; });
726+ dataSize, [=](id<1> i) { annotatedTmp [i] = in1[i] + in2[i]; });
694727 },
695728 {sycl_ext::property::node::depends_on(copy_in1, copy_in2)});
696729
@@ -702,7 +735,7 @@ int main() {
702735 auto kernel2 = graph.add(
703736 [&](handler &cgh) {
704737 cgh.parallel_for<class KernelTwo>(
705- dataSize, [=](id<1> i) { out[i] = tmp [i] * in3[i]; });
738+ dataSize, [=](id<1> i) { out[i] = annotatedTmp [i] * in3[i]; });
706739 },
707740 {sycl_ext::property::node::depends_on(copy_in3, kernel1)});
708741
@@ -740,4 +773,5 @@ int main() {
740773|3|2023-04-11|Lukas Sommer|*Update usage examples for graph API changes*
741774|4|2023-08-17|Lukas Sommer|*Update after graph extension has been merged*
742775|5|2023-09-01|Lukas Sommer|*Split internalization properties and change barrier*
776+ |6|2023-09-13|Lukas Sommer|*Use annotated_ptr for USM internalization*
743777|========================================
0 commit comments