[Feature] Add memory_order PTX for vectorized atomic add #1112
Changes from all commits
36be855
5ced776
c65c9d3
0f4b8c7
0185eb5
f10dd82
694f201
5097d34
e275347
a6a1a32
@@ -5,6 +5,7 @@ | |
| #endif | ||
|
|
||
| #include <cuda/atomic> | ||
| #include <cuda_fp16.h> | ||
| #include <cutlass/numeric_types.h> | ||
|
|
||
| using cutlass::bfloat16_t; | ||
|
|
@@ -45,8 +46,9 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val, | |
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| using NT1 = typename normalize_atomic_type<T1>::type; | ||
| T1 *address = &ref; | ||
| if constexpr (std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) { | ||
| if constexpr ((std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) && | ||
| memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)); | ||
| } else { | ||
| cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address); | ||
|
|
@@ -59,8 +61,9 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val, | |
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| using NT1 = typename normalize_atomic_type<T1>::type; | ||
| T1 *address = &ref; | ||
| if constexpr (std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) { | ||
| if constexpr ((std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) && | ||
| memory_order == int(cuda::memory_order_relaxed)) { | ||
| return static_cast<T1>( | ||
| atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val))); | ||
| } else { | ||
|
|
@@ -75,8 +78,9 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val, | |
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| using NT1 = typename normalize_atomic_type<T1>::type; | ||
| T1 *address = &ref; | ||
| if constexpr (std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) { | ||
| if constexpr ((std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) && | ||
| memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)); | ||
| } else { | ||
| cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address); | ||
|
|
@@ -89,8 +93,9 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val, | |
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| using NT1 = typename normalize_atomic_type<T1>::type; | ||
| T1 *address = &ref; | ||
| if constexpr (std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) { | ||
| if constexpr ((std::is_same_v<NT1, half> || | ||
| std::is_same_v<NT1, __nv_bfloat16>) && | ||
| memory_order == int(cuda::memory_order_relaxed)) { | ||
| return static_cast<T1>( | ||
| atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val))); | ||
| } else { | ||
|
|
@@ -135,59 +140,321 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val, | |
| // TODO add memory_order for vectorized atomic add | ||
| TL_DEVICE void AtomicAddx2(half_t *ref, half_t *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<half2 *>(ref), | ||
| static_cast<half2>(*reinterpret_cast<half2 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<half2 *>(ref), | ||
| static_cast<half2>(*reinterpret_cast<half2 *>(val))); | ||
| } else { | ||
| // Since atomicAdd does not support memory order, atomic_ref does not | ||
| // support vectorized atomic operation we can only inline ptx code here | ||
| // Note: Vectorized atomic operations only support global space | ||
| // Note: for 16-bit value, we need to reinterpret_cast the value to unsigned | ||
| // short and use "h" register in assembly | ||
| __half2 add_val = *reinterpret_cast<__half2 *>(val); | ||
| unsigned short add_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.x); | ||
| unsigned short add_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.y); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| __half ret_val_x, ret_val_y; | ||
| unsigned short ret_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val_x); | ||
| unsigned short ret_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val_y); | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
|
Comment on lines +158 to +163
Contributor

Uninitialized read into `ret_val_x_cast` / `ret_val_y_cast`: the casts read `ret_val_x` / `ret_val_y` before the asm has written anything. Declare the asm outputs as plain `unsigned short` instead:

- __half ret_val_x, ret_val_y;
- unsigned short ret_val_x_cast =
-     *reinterpret_cast<unsigned short *>(&ret_val_x);
- unsigned short ret_val_y_cast =
-     *reinterpret_cast<unsigned short *>(&ret_val_y);
+ unsigned short ret_val_x_cast; // written by asm
+ unsigned short ret_val_y_cast; // written by asm

And similarly for the bf16 paths:

- __nv_bfloat162 ret_val;
- unsigned short ret_val_x_cast =
-     *reinterpret_cast<unsigned short *>(&ret_val.x);
- unsigned short ret_val_y_cast =
-     *reinterpret_cast<unsigned short *>(&ret_val.y);
+ unsigned short ret_val_x_cast; // written by asm
+ unsigned short ret_val_y_cast; // written by asm

Also applies to: 200-205, 246-250, 286-290
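As a concrete illustration of that pattern, a minimal sketch follows (a hypothetical standalone function, not the PR's code; sm_90+ assumed for the ordered PTX). The asm outputs are plain integer registers and the conversions go through bit-pattern intrinsics:

```cpp
#include <cuda_fp16.h>

// Sketch: the asm outputs are declared uninitialized and written only by the
// instruction; half <-> ushort conversions use intrinsics, not pointer casts.
__device__ __half2 atomic_add_x2_release_sketch(__half *ref, __half2 add) {
  unsigned short add_x = __half_as_ushort(__low2half(add));
  unsigned short add_y = __half_as_ushort(__high2half(add));
  unsigned short ret_x, ret_y; // written by asm, never read beforehand
  unsigned long long addr = reinterpret_cast<unsigned long long>(ref);
  asm volatile(
      "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
      : "=h"(ret_x), "=h"(ret_y)
      : "l"(addr), "h"(add_x), "h"(add_y)
      : "memory");
  return __halves2half2(__ushort_as_half(ret_x), __ushort_as_half(ret_y));
}
```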
||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile( | ||
| "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
|
Comment on lines +163 to +169
Contributor

On RMW atomics, `memory_order_consume` belongs with the acquire branch, not the release branch:

- if (memory_order == int(cuda::memory_order_release) ||
-     memory_order == int(cuda::memory_order_consume)) {
+ if (memory_order == int(cuda::memory_order_release)) {
  ...
- } else if (memory_order == int(cuda::memory_order_acquire)) {
+ } else if (memory_order == int(cuda::memory_order_acquire) ||
+            memory_order == int(cuda::memory_order_consume)) {
  ...

Do this consistently in all x2/x4 paths. Also consider whether …

Also applies to: 205-212, 250-256, 290-296, 324-330, 356-361, 389-397, 429-437
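One way to apply that mapping consistently across the x2/x4 paths is a shared helper. The sketch below is an assumption (the enum and function names are made up); only the consume-to-acquire folding mirrors the suggestion:

```cpp
#include <cuda/atomic>

// Sketch: collapse the C++ memory orders onto the PTX semantics used by the
// vectorized paths. consume is folded into acquire; seq_cst falls back to
// acq_rel, which is the strongest qualifier these atom instructions take.
enum class PtxOrder { Relaxed, Acquire, Release, AcqRel };

__device__ inline PtxOrder to_ptx_order(int memory_order) {
  if (memory_order == int(cuda::memory_order_relaxed))
    return PtxOrder::Relaxed;
  if (memory_order == int(cuda::memory_order_acquire) ||
      memory_order == int(cuda::memory_order_consume)) // consume -> acquire
    return PtxOrder::Acquire;
  if (memory_order == int(cuda::memory_order_release))
    return PtxOrder::Release;
  return PtxOrder::AcqRel; // acq_rel and seq_cst
}
```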
||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile( | ||
| "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile( | ||
| "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } | ||
| } | ||
|
Comment on lines +143 to +184
Contributor

🧩 Analysis chain

Guard non-relaxed PTX paths for half/bf16 with arch checks like float (>= sm_90). Vectorized atomics with memory semantics require recent PTX/SM. Currently only float v2/v4 are guarded. Without guards, compilation may fail for lower arch lists even if the branch is not taken at runtime. Wrap the non-relaxed inline PTX under the same (or correct) guard you used for float:

+#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
    // existing non-relaxed inline PTX...
-  } else {
+  } else {
    ...
  }
+#else
+  } else {
+    // Fallback: no vectorized non-relaxed support on this arch
+    cuda::atomic_ref<half, cuda::thread_scope_device> ax(ref[0]);
+    cuda::atomic_ref<half, cuda::thread_scope_device> ay(ref[1]);
+    ax.fetch_add(val[0], cuda::memory_order(memory_order));
+    ay.fetch_add(val[1], cuda::memory_order(memory_order));
+  }
+#endif

Please verify the minimum SM/PTX that supports these variants. Also applies to: 190-229, 234-268, 274-311

Verification (NVIDIA PTX ISA and Hopper compatibility docs): `atom.release.gpu.global.add.v2.f16` requires SM 90 (compute capability 9.0 / Hopper) or newer.

Add SM 90 arch guards to the half and bf16 non-relaxed PTX paths: the half paths are currently unguarded and the bf16 paths are gated only at `__CUDA_ARCH_LIST__ > 750`, while the float paths (lines 314+) correctly use `__CUDA_ARCH_LIST__ >= 900`.
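A sketch of what such a guard could look like for the half x2 path. The pre-sm_90 fallback shown here (fences around a relaxed vectorized add) is an assumption of this sketch, not the reviewer's atomic_ref suggestion or the PR's code:

```cpp
#include <cuda/atomic>
#include <cuda_fp16.h>

// Sketch: the ordered vectorized PTX exists only on sm_90+, so older
// architectures need some fallback path.
__device__ void atomic_add_x2_ordered(__half2 *ref, __half2 val, int order) {
  if (order == int(cuda::memory_order_relaxed)) {
    atomicAdd(ref, val); // hardware-relaxed half2 add (sm_60+ assumed)
    return;
  }
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
  // Simplification for the sketch: use acq_rel for every non-relaxed order
  // instead of branching per order as the PR does.
  unsigned short ax = __half_as_ushort(__low2half(val));
  unsigned short ay = __half_as_ushort(__high2half(val));
  unsigned short rx, ry; // written by asm
  asm volatile(
      "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
      : "=h"(rx), "=h"(ry)
      : "l"(reinterpret_cast<unsigned long long>(ref)), "h"(ax), "h"(ay)
      : "memory");
  (void)rx; (void)ry; // previous value unused in this void variant
#else
  // Fallback assumption: approximate the requested ordering by fencing
  // around the relaxed vectorized add.
  cuda::atomic_thread_fence(cuda::memory_order_release,
                            cuda::thread_scope_device);
  atomicAdd(ref, val);
  cuda::atomic_thread_fence(cuda::memory_order_acquire,
                            cuda::thread_scope_device);
#endif
}
```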
||
| } | ||
|
|
||
| TL_DEVICE half2 | ||
| AtomicAddx2Ret(half_t *ref, half_t *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<half2 *>(ref), | ||
| static_cast<half2>(*reinterpret_cast<half2 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<half2 *>(ref), | ||
| static_cast<half2>(*reinterpret_cast<half2 *>(val))); | ||
| } else { | ||
| __half2 add_val = *reinterpret_cast<__half2 *>(val); | ||
| unsigned short add_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.x); | ||
| unsigned short add_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.y); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| __half ret_val_x, ret_val_y; | ||
| unsigned short ret_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val_x); | ||
| unsigned short ret_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val_y); | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile( | ||
| "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile( | ||
| "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile( | ||
| "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } | ||
| return half2(*reinterpret_cast<__half *>(&ret_val_x_cast), | ||
| *reinterpret_cast<__half *>(&ret_val_y_cast)); | ||
|
Comment on lines +226 to +227
Contributor

Fix return construction for x2 returns (half2/bf16x2). If available in your CUDA version, prefer:

- return half2(*reinterpret_cast<__half *>(&ret_val_x_cast),
-              *reinterpret_cast<__half *>(&ret_val_y_cast));
+ return __halves2half2(__ushort_as_half(ret_val_x_cast),
+                       __ushort_as_half(ret_val_y_cast));

For bf16, use the bf16 equivalents (please confirm intrinsic names in your toolchain):

- return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast),
-                       *reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast));
+ __nv_bfloat16 hx, hy;
+ memcpy(&hx, &ret_val_x_cast, sizeof(hx)); // avoids aliasing UB
+ memcpy(&hy, &ret_val_y_cast, sizeof(hy));
+ return __nv_bfloat162{hx, hy};

If you prefer no memcpy, introduce a small constexpr bit_cast helper for device code.

Also applies to: 308-310
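The small bit_cast helper mentioned above could look roughly like the following sketch (names are hypothetical; a 2-byte memcpy compiles down to a register move in device code):

```cpp
#include <string.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

// Sketch of a device-side bit_cast that avoids strict-aliasing UB.
template <class To, class From>
__device__ inline To bit_cast_device(const From &src) {
  static_assert(sizeof(To) == sizeof(From), "bit_cast_device: size mismatch");
  To dst;
  memcpy(&dst, &src, sizeof(To)); // elided by the compiler for small types
  return dst;
}

// Rebuilding the x2 return values from the asm output registers:
__device__ __half2 pack_half2(unsigned short lo, unsigned short hi) {
  return __halves2half2(bit_cast_device<__half>(lo),
                        bit_cast_device<__half>(hi));
}

__device__ __nv_bfloat162 pack_bf162(unsigned short lo, unsigned short hi) {
  __nv_bfloat162 out;
  out.x = bit_cast_device<__nv_bfloat16>(lo);
  out.y = bit_cast_device<__nv_bfloat16>(hi);
  return out;
}
```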
||
| } | ||
| } | ||
|
|
||
| #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750)) | ||
| TL_DEVICE void AtomicAddx2(bfloat16_t *ref, bfloat16_t *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| atomicAdd( | ||
| reinterpret_cast<__nv_bfloat162 *>(ref), | ||
| static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicAdd( | ||
| reinterpret_cast<__nv_bfloat162 *>(ref), | ||
| static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val))); | ||
| } else { | ||
| __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val); | ||
| unsigned short add_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.x); | ||
| unsigned short add_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.y); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| __nv_bfloat162 ret_val; | ||
| unsigned short ret_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val.x); | ||
| unsigned short ret_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val.y); | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.release.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.acquire.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.acq_rel.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| TL_DEVICE __nv_bfloat162 | ||
| AtomicAddx2Ret(bfloat16_t *ref, bfloat16_t *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd( | ||
| reinterpret_cast<__nv_bfloat162 *>(ref), | ||
| static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd( | ||
| reinterpret_cast<__nv_bfloat162 *>(ref), | ||
| static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val))); | ||
| } else { | ||
| __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val); | ||
| unsigned short add_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.x); | ||
| unsigned short add_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&add_val.y); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| __nv_bfloat162 ret_val; | ||
| unsigned short ret_val_x_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val.x); | ||
| unsigned short ret_val_y_cast = | ||
| *reinterpret_cast<unsigned short *>(&ret_val.y); | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.release.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.acquire.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.acq_rel.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};" | ||
| : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast) | ||
| : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast) | ||
| : "memory"); | ||
| } | ||
| return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast), | ||
| *reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast)); | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
| #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900)) | ||
| TL_DEVICE void AtomicAddx2(float *ref, float *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<float2 *>(ref), | ||
| static_cast<float2>(*reinterpret_cast<float2 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<float2 *>(ref), | ||
| static_cast<float2>(*reinterpret_cast<float2 *>(val))); | ||
| } else { | ||
| float2 add_val = *reinterpret_cast<float2 *>(val); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| float2 ret_val; | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| TL_DEVICE float2 | ||
| AtomicAddx2Ret(float *ref, float *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<float2 *>(ref), | ||
| static_cast<float2>(*reinterpret_cast<float2 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<float2 *>(ref), | ||
| static_cast<float2>(*reinterpret_cast<float2 *>(val))); | ||
| } else { | ||
| float2 add_val = *reinterpret_cast<float2 *>(val); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| float2 ret_val; | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y) | ||
| : "memory"); | ||
| } | ||
| return ret_val; | ||
| } | ||
| } | ||
|
|
||
| TL_DEVICE void AtomicAddx4(float *ref, float *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<float4 *>(ref), | ||
| static_cast<float4>(*reinterpret_cast<float4 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| atomicAdd(reinterpret_cast<float4 *>(ref), | ||
| static_cast<float4>(*reinterpret_cast<float4 *>(val))); | ||
| } else { | ||
| // Since atomicAdd does not support memory order, atomic_ref does not | ||
| // support vectorized atomic operation we can only inline ptx code here | ||
| // Note: Vectorized atomic operations only support global space | ||
| float4 add_val = *reinterpret_cast<float4 *>(val); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| float4 ret_val; | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
|
Comment on lines +389 to +397
Contributor

PTX mnemonic order bug: `atom.global.gpu.<sem>....` is used where the ISA expects the qualifier order `atom.<sem>.gpu.global....`. Apply:

- asm volatile("atom.global.gpu.release.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+ asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
  ...
- asm volatile("atom.global.gpu.acquire.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+ asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
  ...
- asm volatile("atom.global.gpu.acq_rel.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+ asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "

Also applies to: 431-455
||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| TL_DEVICE float4 | ||
| AtomicAddx4Ret(float *ref, float *val, | ||
| int memory_order = int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<float4 *>(ref), | ||
| static_cast<float4>(*reinterpret_cast<float4 *>(val))); | ||
| if (memory_order == int(cuda::memory_order_relaxed)) { | ||
| return atomicAdd(reinterpret_cast<float4 *>(ref), | ||
| static_cast<float4>(*reinterpret_cast<float4 *>(val))); | ||
| } else { | ||
| float4 add_val = *reinterpret_cast<float4 *>(val); | ||
| unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref); | ||
| float4 ret_val; | ||
| if (memory_order == int(cuda::memory_order_release) || | ||
| memory_order == int(cuda::memory_order_consume)) { | ||
| asm volatile("atom.global.gpu.release.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acquire)) { | ||
| asm volatile("atom.global.gpu.acquire.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
| } else if (memory_order == int(cuda::memory_order_acq_rel) || | ||
| memory_order == int(cuda::memory_order_seq_cst)) { | ||
| asm volatile("atom.global.gpu.acq_rel.add.v4.f32 {%0,%1,%2,%3}, [%4], " | ||
| "{%5,%6,%7,%8};" | ||
| : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z), | ||
| "=f"(ret_val.w) | ||
| : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y), | ||
| "f"(add_val.z), "f"(add_val.w) | ||
| : "memory"); | ||
| } | ||
| return ret_val; | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
|
|
||
🧩 Analysis chain

Alignment preconditions for vectorized operations are not enforced.

Reinterpreting `half_t*` / `float*` as `half2*` / `float2*` / `float4*` requires 4/8/16-byte alignment. Misalignment is UB and can fault on hardware. Consider enforcing the precondition, e.g. `assert(((uintptr_t)ref % align) == 0)` in debug builds.

Also applies to: 191-193, 235-238, 275-278, 318-320, 349-351, 380-382, 423-425

Add runtime alignment assertions to prevent undefined behavior from misaligned vectorized atomic operations.

The concern is valid. CUDA vectorized atomicAdd operations require strict alignment: float2 requires 8-byte alignment, float4 requires 16-byte alignment. The code reinterpret_casts raw pointers to vectorized types without enforcing these preconditions, which causes undefined behavior and potential hardware faults.

All 8 locations (lines 144-146, 191-193, 235-238, 275-278, 318-320, 349-351, 380-382, 423-425) lack guards. Add runtime `assert(((uintptr_t)ref % alignment) == 0)` checks, or implement a fallback scalar path when alignment cannot be guaranteed.
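A sketch of that suggestion, combining a debug-build assert with a scalar fallback (the function name and the fallback policy are illustrative; note the vectorized float4 overload of atomicAdd is itself an sm_90+ feature):

```cpp
#include <cassert>

__device__ void atomic_add_x4_checked(float *ref, const float *val) {
  // float4 atomics require a 16-byte aligned destination address.
  const bool aligned =
      (reinterpret_cast<unsigned long long>(ref) % alignof(float4)) == 0;
  assert(aligned && "AtomicAddx4: ref must be 16-byte aligned");
  if (aligned) {
    float4 v{val[0], val[1], val[2], val[3]};
    atomicAdd(reinterpret_cast<float4 *>(ref), v); // vectorized path (sm_90+)
  } else {
    // Degrade to four scalar adds rather than fault on a misaligned pointer.
    for (int i = 0; i < 4; ++i)
      atomicAdd(ref + i, val[i]);
  }
}
```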