@@ -85,12 +85,12 @@ namespace sycl::ext::oneapi {
8585int32_t dot_acc(vec<int8_t,4> a, vec<int8_t,4> b, int32_t c);
8686int32_t dot_acc(vec<int8_t,4> a, vec<uint8_t,4> b, int32_t c);
8787int32_t dot_acc(vec<uint8_t,4> a, vec<int8_t,4> b, int32_t c);
88- int32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, int32_t c);
88+ uint32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, uint32_t c);
8989
90- int32_t doc_acc_4x8packed(int32_t a, int32_t b, int32_t c);
91- int32_t doc_acc_4x8packed(int32_t a, uint32_t b, int32_t c);
92- int32_t doc_acc_4x8packed (uint32_t a, int32_t b, int32_t c);
93- int32_t doc_acc_4x8packed (uint32_t a, uint32_t b, int32_t c);
90+ int32_t dot_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c);
91+ int32_t dot_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c);
92+ int32_t dot_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c);
93+ uint32_t dot_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c);
9494
9595} // namespace sycl::ext::oneapi
9696----
@@ -111,51 +111,71 @@ int32_t dot_acc(vec<int8_t,4> a,
111111int32_t dot_acc(vec<uint8_t,4> a,
112112 vec<int8_t,4> b,
113113 int32_t c)
114- int32_t dot_acc(vec<uint8_t,4> a,
115- vec<uint8_t,4> b,
116- int32_t c)
114+ uint32_t dot_acc(vec<uint8_t,4> a,
115+ vec<uint8_t,4> b,
116+ uint32_t c)
117+ ----
118+
119+ |Performs a four-component integer dot product accumulate operation. The value
120+ that is returned is equivalent to `dot(a, b) + c`, where `dot` computes the
121+ dot product of two vectors.
122+
123+ |[source,c]
124+ ----
125+ int32_t dot_acc_4x8packed_ss(uint32_t a,
126+ uint32_t b,
127+ int32_t c)
128+ ----
129+
130+ |Performs a four-component integer dot product accumulate operation, where
131+ `a` and `b` are both interpreted as `vec<int8_t,4>`.
132+
133+ |[source,c]
134+ ----
135+ int32_t dot_acc_4x8packed_su(uint32_t a,
136+ uint32_t b,
137+ int32_t c)
117138----
118139
119- |Performs a four-component integer dot product accumulate operation. +
120- {blank}
121- The value that is returned is equivalent to +
122- {blank}
123- `dot(a, b) + c`
140+ |Performs a four-component integer dot product accumulate operation, where
141+ `a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
142+ `vec<uint8_t,4>`.
124143
125144|[source,c]
126145----
127- int32_t doc_acc_4x8packed(int32_t a,
128- int32_t b,
129- int32_t c)
130- int32_t doc_acc_4x8packed(int32_t a,
131- uint32_t b,
132- int32_t c)
133- int32_t doc_acc_4x8packed(uint32_t a,
134- int32_t b,
135- int32_t c)
136- int32_t doc_acc_4x8packed(uint32_t a,
137- uint32_t b,
138- int32_t c);
146+ int32_t dot_acc_4x8packed_us(uint32_t a,
147+ uint32_t b,
148+ int32_t c)
139149----
140150
141151|Performs a four-component integer dot product accumulate operation, where
142- `a` and `b` are 32-bit integers that represent a vector of 4 8-bit elements.
143- When the type of `a` or `b` is `int32_t`, it is interpreted as `vec<int8_t,4>`.
144- When the type of `a` or `b` is `uint32_t`, it is interpreted as
145- `vec<uint8_t,4>`. In each case, the least significant byte is element 0, and
146- the most significant byte is element 3.
152+ `a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
153+ `vec<int8_t,4>`.
147154
155+ |[source,c]
156+ ----
157+ uint32_t dot_acc_4x8packed_uu(uint32_t a,
158+ uint32_t b,
159+ uint32_t c)
160+ ----
161+
162+ |Performs a four-component integer dot product accumulate operation, where
163+ `a` and `b` are both interpreted as `vec<uint8_t,4>`.
148164|====
149165
166+ For all the "packed" functions, the least significant byte of each packed
167+ integer argument is element 0, and the most significant byte is element 3.
168+
150169=== Deprecated functions
151170
152- The following functions are deprecated. They have the same effect as the
153- `doc_acc_4x8packed` overloads described above.
171+ The following functions are deprecated.
154172
155173[source,c++]
156174----
157175namespace sycl::ext::oneapi {
158176
177+ int32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, int32_t c);
178+
159179int32_t dot_acc(int32_t a, int32_t b, int32_t c);
160180int32_t dot_acc(int32_t a, uint32_t b, int32_t c);
161181int32_t dot_acc(uint32_t a, int32_t b, int32_t c);
@@ -164,15 +184,63 @@ int32_t dot_acc(uint32_t a, uint32_t b, int32_t c);
164184} // namespace sycl::ext::oneapi
165185----
166186
187+ [cols="4a,4",options="header"]
188+ |====
189+ | *Function*
190+ | *Description*
167191
168- == Issues
192+ |[source,c]
193+ ----
194+ int32_t dot_acc(vec<uint8_t,4> a,
195+ vec<uint8_t,4> b,
196+ int32_t c)
197+ ----
198+
199+ |Performs a four-component integer dot product accumulate operation, where the
200+ elements of `a` and `b` are unsigned while `c` is signed. Use the overload
201+ that takes an unsigned `c` and returns `uint32_t` instead.
202+
203+ |[source,c]
204+ ----
205+ int32_t dot_acc(int32_t a,
206+ int32_t b,
207+ int32_t c)
208+ ----
169209
170- * The overloads that take two unsigned vectors have a signed `c` and return a
171- signed result. This is inconsistent with the SPIR-V primitives and the
172- OpenCL C APIs, both of which return an unsigned value in this case and expect
173- an unsigned `c`. I think we could implement the APIs as they are using the
174- SPIR-V primitives, but the extra unsigned-to-signed conversions might
175- generate less efficient code (I haven't checked). Is there a compelling
176- reason to keep these APIs as they are now? If not, we could deprecate them
177- and introduce overloads that take an unsigned `c` and return an unsigned
178- value.
210+ |Performs a four-component integer dot product accumulate operation, where
211+ `a` and `b` are both interpreted as `vec<int8_t,4>`. Use
212+ `dot_acc_4x8packed_ss` instead.
213+
214+ |[source,c]
215+ ----
216+ int32_t dot_acc(int32_t a,
217+ uint32_t b,
218+ int32_t c)
219+ ----
220+
221+ |Performs a four-component integer dot product accumulate operation, where
222+ `a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
223+ `vec<uint8_t,4>`. Use `dot_acc_4x8packed_su` instead.
224+
225+ |[source,c]
226+ ----
227+ int32_t dot_acc(uint32_t a,
228+ int32_t b,
229+ int32_t c)
230+ ----
231+
232+ |Performs a four-component integer dot product accumulate operation, where
233+ `a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
234+ `vec<int8_t,4>`. Use `dot_acc_4x8packed_us` instead.
235+
236+ |[source,c]
237+ ----
238+ int32_t dot_acc(uint32_t a,
239+ uint32_t b,
240+ int32_t c)
241+ ----
242+
243+ |Performs a four-component integer dot product accumulate operation, where
244+ `a` and `b` are both interpreted as `vec<uint8_t,4>`. Use
245+ `dot_acc_4x8packed_uu` instead.
246+ |====
0 commit comments