319 changes: 293 additions & 26 deletions src/tl_templates/cuda/atomic.h
@@ -5,6 +5,7 @@
#endif

#include <cuda/atomic>
#include <cuda_fp16.h>
#include <cutlass/numeric_types.h>

using cutlass::bfloat16_t;
@@ -45,8 +46,9 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val,
int memory_order = int(cuda::memory_order_relaxed)) {
using NT1 = typename normalize_atomic_type<T1>::type;
T1 *address = &ref;
if constexpr (std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) {
if constexpr ((std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) &&
memory_order == int(cuda::memory_order_relaxed)) {
atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
} else {
cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -59,8 +61,9 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val,
int memory_order = int(cuda::memory_order_relaxed)) {
using NT1 = typename normalize_atomic_type<T1>::type;
T1 *address = &ref;
if constexpr (std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) {
if constexpr ((std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) &&
memory_order == int(cuda::memory_order_relaxed)) {
return static_cast<T1>(
atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
} else {
@@ -75,8 +78,9 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val,
int memory_order = int(cuda::memory_order_relaxed)) {
using NT1 = typename normalize_atomic_type<T1>::type;
T1 *address = &ref;
if constexpr (std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) {
if constexpr ((std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) &&
memory_order == int(cuda::memory_order_relaxed)) {
atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
} else {
cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -89,8 +93,9 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val,
int memory_order = int(cuda::memory_order_relaxed)) {
using NT1 = typename normalize_atomic_type<T1>::type;
T1 *address = &ref;
if constexpr (std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) {
if constexpr ((std::is_same_v<NT1, half> ||
std::is_same_v<NT1, __nv_bfloat16>) &&
memory_order == int(cuda::memory_order_relaxed)) {
return static_cast<T1>(
atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
} else {
@@ -135,59 +140,321 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val,
// TODO add memory_order for vectorized atomic add
TL_DEVICE void AtomicAddx2(half_t *ref, half_t *val,
int memory_order = int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<half2 *>(ref),
static_cast<half2>(*reinterpret_cast<half2 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<half2 *>(ref),
static_cast<half2>(*reinterpret_cast<half2 *>(val)));
} else {
Comment on lines +144 to +146

⚠️ Potential issue | 🔴 Critical

Alignment preconditions for vectorized operations are not enforced.

Reinterpreting half_t*/float* as half2*/float2*/float4* requires 4/8/16‑byte alignment. Misalignment is UB and can fault on hardware.

  • Document and assert alignment (static_assert on types is insufficient; add runtime assert(((uintptr_t)ref % align) == 0) in debug).
  • Alternatively, load/add/store scalars when non‑relaxed and alignment isn’t guaranteed (see previous guard fallback snippet).

Also applies to: 191-193, 235-238, 275-278, 318-320, 349-351, 380-382, 423-425

Add runtime alignment assertions to prevent undefined behavior from misaligned vectorized atomic operations.

The concern is valid. CUDA vectorized atomicAdd operations require strict alignment: float2 requires 8-byte alignment, float4 requires 16-byte alignment. The code reinterpret_casts raw pointers to vectorized types without enforcing these preconditions, which causes undefined behavior and potential hardware faults.

All 8 locations (lines 144–146, 191–193, 235–238, 275–278, 318–320, 349–351, 380–382, 423–425) lack guards. Add runtime assert(((uintptr_t)ref % alignment) == 0) checks, or implement a fallback scalar path when alignment cannot be guaranteed.
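
A minimal sketch of such a debug-only guard, assuming the existing TL_DEVICE macro; the helper name tl_assert_aligned is illustrative and not part of this header:

```cpp
#include <cassert>
#include <cstdint>

// Debug-only alignment check for the vectorized paths. alignof(half2) == 4,
// alignof(float2) == 8, alignof(float4) == 16, so one template covers all
// eight call sites listed above.
template <typename Vec, typename Scalar>
TL_DEVICE void tl_assert_aligned(const Scalar *ptr) {
  assert((reinterpret_cast<uintptr_t>(ptr) % alignof(Vec)) == 0 &&
         "vectorized atomic requires a Vec-aligned address");
}

// Example use at the top of AtomicAddx2(half_t *, ...):
//   tl_assert_aligned<half2>(ref);
//   tl_assert_aligned<half2>(val);
```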

// Since atomicAdd does not support memory order, atomic_ref does not
// support vectorized atomic operation we can only inline ptx code here
// Note: Vectorized atomic operations only support global space
// Note: for 16-bit value, we need to reinterpret_cast the value to unsigned
// short and use "h" register in assembly
__half2 add_val = *reinterpret_cast<__half2 *>(val);
unsigned short add_val_x_cast =
*reinterpret_cast<unsigned short *>(&add_val.x);
unsigned short add_val_y_cast =
*reinterpret_cast<unsigned short *>(&add_val.y);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
__half ret_val_x, ret_val_y;
unsigned short ret_val_x_cast =
*reinterpret_cast<unsigned short *>(&ret_val_x);
unsigned short ret_val_y_cast =
*reinterpret_cast<unsigned short *>(&ret_val_y);
if (memory_order == int(cuda::memory_order_release) ||
Comment on lines +158 to +163

⚠️ Potential issue | 🔴 Critical

Uninitialized read into ret_val_*_cast before asm.

ret_val_x_cast and ret_val_y_cast are initialized by reading the bytes of uninitialized ret_val_*. That’s UB. Make them plain locals; asm outputs will write them.

-    __half ret_val_x, ret_val_y;
-    unsigned short ret_val_x_cast =
-        *reinterpret_cast<unsigned short *>(&ret_val_x);
-    unsigned short ret_val_y_cast =
-        *reinterpret_cast<unsigned short *>(&ret_val_y);
+    unsigned short ret_val_x_cast;  // written by asm
+    unsigned short ret_val_y_cast;  // written by asm

And similarly for the bf16 paths:

-    __nv_bfloat162 ret_val;
-    unsigned short ret_val_x_cast =
-        *reinterpret_cast<unsigned short *>(&ret_val.x);
-    unsigned short ret_val_y_cast =
-        *reinterpret_cast<unsigned short *>(&ret_val.y);
+    unsigned short ret_val_x_cast;  // written by asm
+    unsigned short ret_val_y_cast;  // written by asm

Also applies to: 200-205, 246-250, 286-290

🤖 Prompt for AI Agents
In src/tl_templates/cuda/atomic.h around lines 158-163 (and similarly at
200-205, 246-250, 286-290), the code creates unsigned short variables by
reinterpret_casting uninitialized __half/__nv_bfloat16 locals which causes an
uninitialized read; instead, remove the reinterpret_cast reads and declare the
casted variables as plain unsigned short locals (uninitialized) so the inline
asm outputs can populate them, and for all bf16/hf16 codepaths ensure the asm()
uses the unsigned short output operands directly rather than reading from
uninitialized floating-point temporaries.

memory_order == int(cuda::memory_order_consume)) {
asm volatile(
"atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
Comment on lines +163 to +169

⚠️ Potential issue | 🟠 Major

memory_order_consume mapped to release — use acquire (or acq_rel for RMW).

On RMW atomics, consume should not be weaker than the load side; common practice maps consume to acquire. Mapping it to release loses required ordering for dependent reads.

-    if (memory_order == int(cuda::memory_order_release) ||
-        memory_order == int(cuda::memory_order_consume)) {
+    if (memory_order == int(cuda::memory_order_release)) {
       ...
-    } else if (memory_order == int(cuda::memory_order_acquire)) {
+    } else if (memory_order == int(cuda::memory_order_acquire) ||
+               memory_order == int(cuda::memory_order_consume)) {
       ...

Do this consistently in all x2/x4 paths. Also consider whether seq_cst should be stronger than acq_rel (e.g., add an extra fence) per your memory model.

Also applies to: 205-212, 250-256, 290-296, 324-330, 356-361, 389-397, 429-437

🤖 Prompt for AI Agents
In src/tl_templates/cuda/atomic.h around lines 163-169 (and similarly at
205-212, 250-256, 290-296, 324-330, 356-361, 389-397, 429-437), the branch
incorrectly treats memory_order_consume as release; change the emitted asm for
consume to use the acquire (or acq_rel for read-modify-write) variant rather
than the release variant, and make the mapping consistent across all x2/x4 code
paths; also review seq_cst cases and, if required by the memory model, add the
stronger fence (or use seq_cst-specific asm) so seq_cst is stronger than
acq_rel.
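
One way to keep that mapping consistent across every x2/x4 path is to fold consume into acquire once, before any branching; a minimal sketch (the helper name normalize_rmw_order is invented here):

```cpp
// Fold memory_order_consume into memory_order_acquire up front so every
// vectorized RMW path branches on the same, already-normalized order.
TL_DEVICE int normalize_rmw_order(int memory_order) {
  return memory_order == int(cuda::memory_order_consume)
             ? int(cuda::memory_order_acquire)
             : memory_order;
}

// Each AtomicAddx2/x4 variant would then begin with
//   memory_order = normalize_rmw_order(memory_order);
// and only branch on relaxed / release / acquire / acq_rel / seq_cst.
```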

} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile(
"atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile(
"atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
}
}
Comment on lines +143 to +184

⚠️ Potential issue | 🔴 Critical

Guard non-relaxed PTX paths for half/bf16 with arch checks like float (>= sm_90).

Vectorized atomics with memory semantics require recent PTX/SM. Currently only float v2/v4 are guarded. Without guards, compilation may fail for lower arch lists even if the branch is not taken at runtime.

Wrap the non‑relaxed inline PTX under the same (or correct) guard you used for float:

+#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
   // existing non-relaxed inline PTX...
-  } else {
+  } else {
     ...
   }
+#else
+  } else {
+    // Fallback: no vectorized non‑relaxed support on this arch
+    cuda::atomic_ref<half, cuda::thread_scope_device> ax(ref[0]);
+    cuda::atomic_ref<half, cuda::thread_scope_device> ay(ref[1]);
+    ax.fetch_add(val[0], cuda::memory_order(memory_order));
+    ay.fetch_add(val[1], cuda::memory_order(memory_order));
+  }
+#endif

Please verify the minimum SM/PTX that supports atom.*.v2.f16/v2.bf16.

Also applies to: 190-229, 234-268, 274-311


Add SM 90 arch guards to half and bf16 non-relaxed PTX paths; currently unguarded (half) or insufficiently guarded (bf16) while float is correctly guarded.

All three sections require fixes:

  1. Lines 143-184 (AtomicAddx2 half): Completely unguarded; contains atom.release/acquire/acq_rel.gpu.global.add.noftz.v2.f16 which requires SM 90.

  2. Lines 190-229 (AtomicAddx2Ret half): Completely unguarded; same PTX instruction dependency.

  3. Lines 231-269 & 270-312 (AtomicAddx2 and AtomicAddx2Ret bf16): Guarded with #if (__CUDA_ARCH_LIST__ > 750) (line 231), but requires >= 900 for atom.*.v2.bf16 support. The #endif at line 312 closes the bf16 block.

Float (lines 314+) correctly uses #if (__CUDA_ARCH_LIST__ >= 900). Half sections must use identical guards; bf16 guard must be tightened to >= 900. Without these, compilation fails for architectures below SM 90 even when the non-relaxed branch is unreachable.

🤖 Prompt for AI Agents
In src/tl_templates/cuda/atomic.h around lines 143-184, 190-229, and 231-312,
the PTX vectorized atomic instructions for half and bf16 (atom.*.v2.f16 and
atom.*.v2.bf16) are currently unguarded or guarded only for >750, which allows
compilation on architectures that lack SM 90 support; wrap the entire
non-relaxed PTX branches that emit these instructions with the same architecture
guard used for float (i.e., #if (__CUDA_ARCH_LIST__ >= 900)) and change the
existing bf16 guard from >750 to >=900, making sure each #if has the matching
#endif around the PTX asm blocks so the code only emits these instructions for
SM 90+.
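
A sketch of the guarded shape for the half path, assuming the element-wise cuda::atomic_ref fallback suggested above is acceptable on older architectures; the PTX block is elided and only the guard placement is shown:

```cpp
TL_DEVICE void AtomicAddx2(half_t *ref, half_t *val,
                           int memory_order = int(cuda::memory_order_relaxed)) {
  if (memory_order == int(cuda::memory_order_relaxed)) {
    atomicAdd(reinterpret_cast<half2 *>(ref),
              static_cast<half2>(*reinterpret_cast<half2 *>(val)));
    return;
  }
#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
  // ... non-relaxed inline PTX (atom.<sem>.gpu.global.add.noftz.v2.f16),
  //     exactly as in the current diff ...
#else
  // No vectorized non-relaxed atomics on this arch: fall back to two scalar
  // fetch_adds that still honor the requested memory order.
  half *r = reinterpret_cast<half *>(ref);
  half *v = reinterpret_cast<half *>(val);
  cuda::atomic_ref<half, cuda::thread_scope_device> ax(r[0]);
  cuda::atomic_ref<half, cuda::thread_scope_device> ay(r[1]);
  ax.fetch_add(v[0], cuda::memory_order(memory_order));
  ay.fetch_add(v[1], cuda::memory_order(memory_order));
#endif
}
```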

}

TL_DEVICE half2
AtomicAddx2Ret(half_t *ref, half_t *val,
int memory_order = int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<half2 *>(ref),
static_cast<half2>(*reinterpret_cast<half2 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<half2 *>(ref),
static_cast<half2>(*reinterpret_cast<half2 *>(val)));
} else {
__half2 add_val = *reinterpret_cast<__half2 *>(val);
unsigned short add_val_x_cast =
*reinterpret_cast<unsigned short *>(&add_val.x);
unsigned short add_val_y_cast =
*reinterpret_cast<unsigned short *>(&add_val.y);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
__half ret_val_x, ret_val_y;
unsigned short ret_val_x_cast =
*reinterpret_cast<unsigned short *>(&ret_val_x);
unsigned short ret_val_y_cast =
*reinterpret_cast<unsigned short *>(&ret_val_y);
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile(
"atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile(
"atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile(
"atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
}
return half2(*reinterpret_cast<__half *>(&ret_val_x_cast),
*reinterpret_cast<__half *>(&ret_val_y_cast));
Comment on lines +226 to +227

⚠️ Potential issue | 🟠 Major

Fix return construction for x2 returns (half2/bf16x2).

half2(__half, __half) and __nv_bfloat162(__nv_bfloat16, __nv_bfloat16) are not valid constructors; also avoid aliasing through reinterpret_cast. Use intrinsics or brace initialization with proper bit-casts.

If available in your CUDA version, prefer:

-    return half2(*reinterpret_cast<__half *>(&ret_val_x_cast),
-                 *reinterpret_cast<__half *>(&ret_val_y_cast));
+    return __halves2half2(__ushort_as_half(ret_val_x_cast),
+                          __ushort_as_half(ret_val_y_cast));

For bf16, use the bf16 equivalents (please confirm intrinsic names in your toolchain):

-    return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast),
-                          *reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast));
+    __nv_bfloat16 hx, hy;
+    memcpy(&hx, &ret_val_x_cast, sizeof(hx));  // avoids aliasing UB
+    memcpy(&hy, &ret_val_y_cast, sizeof(hy));
+    return __nv_bfloat162{hx, hy};

If you prefer no memcpy, introduce a small constexpr bit_cast helper for device code.

Also applies to: 308-310
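
If the memcpy calls are to stay out of the return statements, a small device-side bit_cast helper covers both paths; the name tl_bit_cast is illustrative, and cuda::std::bit_cast can replace it on toolchains that ship it:

```cpp
#include <cstring>

// Aliasing-safe reinterpretation of the 16-bit values written by the asm
// blocks; the fixed-size memcpy compiles down to a plain register move.
template <typename To, typename From>
TL_DEVICE To tl_bit_cast(const From &src) {
  static_assert(sizeof(To) == sizeof(From), "tl_bit_cast: size mismatch");
  To dst;
  memcpy(&dst, &src, sizeof(To));
  return dst;
}

// The x2 returns then become:
//   return __halves2half2(tl_bit_cast<__half>(ret_val_x_cast),
//                         tl_bit_cast<__half>(ret_val_y_cast));
//   return __nv_bfloat162{tl_bit_cast<__nv_bfloat16>(ret_val_x_cast),
//                         tl_bit_cast<__nv_bfloat16>(ret_val_y_cast)};
```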

}
}

#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750))
TL_DEVICE void AtomicAddx2(bfloat16_t *ref, bfloat16_t *val,
int memory_order = int(cuda::memory_order_relaxed)) {
atomicAdd(
reinterpret_cast<__nv_bfloat162 *>(ref),
static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
atomicAdd(
reinterpret_cast<__nv_bfloat162 *>(ref),
static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
} else {
__nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
unsigned short add_val_x_cast =
*reinterpret_cast<unsigned short *>(&add_val.x);
unsigned short add_val_y_cast =
*reinterpret_cast<unsigned short *>(&add_val.y);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
__nv_bfloat162 ret_val;
unsigned short ret_val_x_cast =
*reinterpret_cast<unsigned short *>(&ret_val.x);
unsigned short ret_val_y_cast =
*reinterpret_cast<unsigned short *>(&ret_val.y);
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.release.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.acquire.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.acq_rel.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
}
}
}

TL_DEVICE __nv_bfloat162
AtomicAddx2Ret(bfloat16_t *ref, bfloat16_t *val,
int memory_order = int(cuda::memory_order_relaxed)) {
return atomicAdd(
reinterpret_cast<__nv_bfloat162 *>(ref),
static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
return atomicAdd(
reinterpret_cast<__nv_bfloat162 *>(ref),
static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
} else {
__nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
unsigned short add_val_x_cast =
*reinterpret_cast<unsigned short *>(&add_val.x);
unsigned short add_val_y_cast =
*reinterpret_cast<unsigned short *>(&add_val.y);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
__nv_bfloat162 ret_val;
unsigned short ret_val_x_cast =
*reinterpret_cast<unsigned short *>(&ret_val.x);
unsigned short ret_val_y_cast =
*reinterpret_cast<unsigned short *>(&ret_val.y);
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.release.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.acquire.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.acq_rel.gpu.global.add.v2.bf16 {%0,%1}, [%2], {%3,%4};"
: "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
: "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
: "memory");
}
return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast),
*reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast));
}
}
#endif

#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
TL_DEVICE void AtomicAddx2(float *ref, float *val,
int memory_order = int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<float2 *>(ref),
static_cast<float2>(*reinterpret_cast<float2 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<float2 *>(ref),
static_cast<float2>(*reinterpret_cast<float2 *>(val)));
} else {
float2 add_val = *reinterpret_cast<float2 *>(val);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
float2 ret_val;
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
}
}
}

TL_DEVICE float2
AtomicAddx2Ret(float *ref, float *val,
int memory_order = int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<float2 *>(ref),
static_cast<float2>(*reinterpret_cast<float2 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<float2 *>(ref),
static_cast<float2>(*reinterpret_cast<float2 *>(val)));
} else {
float2 add_val = *reinterpret_cast<float2 *>(val);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
float2 ret_val;
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
: "=f"(ret_val.x), "=f"(ret_val.y)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
: "memory");
}
return ret_val;
}
}

TL_DEVICE void AtomicAddx4(float *ref, float *val,
int memory_order = int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<float4 *>(ref),
static_cast<float4>(*reinterpret_cast<float4 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
atomicAdd(reinterpret_cast<float4 *>(ref),
static_cast<float4>(*reinterpret_cast<float4 *>(val)));
} else {
// Since atomicAdd does not support memory order, atomic_ref does not
// support vectorized atomic operation we can only inline ptx code here
// Note: Vectorized atomic operations only support global space
float4 add_val = *reinterpret_cast<float4 *>(val);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
float4 ret_val;
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
Comment on lines +389 to +397

⚠️ Potential issue | 🔴 Critical

PTX mnemonic order bug: uses atom.global.gpu.release...; should be atom.release.gpu.global....

You use atom.release.gpu.global... elsewhere, but in AtomicAddx4Ret the order is atom.global.gpu.release..., which will fail to assemble. Fix all three mnemonics in AtomicAddx4Ret to keep .semantics .scope .space order.

Apply:

-      asm volatile("atom.global.gpu.release.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+      asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
...
-      asm volatile("atom.global.gpu.acquire.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+      asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
...
-      asm volatile("atom.global.gpu.acq_rel.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+      asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "

Also applies to: 431-455

🤖 Prompt for AI Agents
In src/tl_templates/cuda/atomic.h around lines 389-397 (and also apply the same
fix to 431-455), the PTX mnemonic components are in the wrong order in
AtomicAddx4Ret: currently using the `.space` and `.scope` parts before
`.semantics` (e.g., `atom.global.gpu.release...`) which fails to assemble;
change the three affected mnemonics in AtomicAddx4Ret to use the correct
`.semantics .scope .space` order (e.g., `atom.release.gpu.global...`) so all
three atom instructions match the correct PTX ordering used elsewhere.

} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
}
}
}

TL_DEVICE float4
AtomicAddx4Ret(float *ref, float *val,
int memory_order = int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<float4 *>(ref),
static_cast<float4>(*reinterpret_cast<float4 *>(val)));
if (memory_order == int(cuda::memory_order_relaxed)) {
return atomicAdd(reinterpret_cast<float4 *>(ref),
static_cast<float4>(*reinterpret_cast<float4 *>(val)));
} else {
float4 add_val = *reinterpret_cast<float4 *>(val);
unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
float4 ret_val;
if (memory_order == int(cuda::memory_order_release) ||
memory_order == int(cuda::memory_order_consume)) {
asm volatile("atom.global.gpu.release.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
} else if (memory_order == int(cuda::memory_order_acquire)) {
asm volatile("atom.global.gpu.acquire.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
} else if (memory_order == int(cuda::memory_order_acq_rel) ||
memory_order == int(cuda::memory_order_seq_cst)) {
asm volatile("atom.global.gpu.acq_rel.add.v4.f32 {%0,%1,%2,%3}, [%4], "
"{%5,%6,%7,%8};"
: "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
"=f"(ret_val.w)
: "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
"f"(add_val.z), "f"(add_val.w)
: "memory");
}
return ret_val;
}
}
#endif
