diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp index 7c96b54cc..3b6d2929d 100644 --- a/rocprim/include/rocprim/config.hpp +++ b/rocprim/include/rocprim/config.hpp @@ -77,28 +77,42 @@ #undef ROCPRIM_TARGET_CDNA1 #undef ROCPRIM_TARGET_CDNA2 #undef ROCPRIM_TARGET_CDNA3 +#undef ROCPRIM_TARGET_UNKNOWN // See https://llvm.org/docs/AMDGPUUsage.html#instructions -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx9_4_generic__) #define ROCPRIM_TARGET_CDNA3 1 #elif defined(__gfx90a__) #define ROCPRIM_TARGET_CDNA2 1 #elif defined(__gfx908__) #define ROCPRIM_TARGET_CDNA1 1 #elif defined(__gfx900__) || defined(__gfx902__) || defined(__gfx904__) || defined(__gfx906__) \ - || defined(__gfx90c__) + || defined(__gfx90c__) || defined(__gfx9_generic__) #define ROCPRIM_TARGET_GCN5 1 -#elif defined(__GFX12__) +#elif defined(__GFX12__) || defined(__gfx12_generic__) #define ROCPRIM_TARGET_RDNA4 1 -#elif defined(__GFX11__) +#elif defined(__GFX11__) || defined(__gfx11_generic__) #define ROCPRIM_TARGET_RDNA3 1 #elif defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) \ - || defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) + || defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) \ + || defined(__gfx10_3_generic__) #define ROCPRIM_TARGET_RDNA2 1 -#elif defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || defined(__gfx1013__) +#elif defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || defined(__gfx1013__) \ + || defined(__gfx10_1_generic__) #define ROCPRIM_TARGET_RDNA1 1 #elif defined(__GFX8__) #define ROCPRIM_TARGET_GCN3 1 +#elif defined(__HIP_DEVICE_COMPILE__) + // Double-check the build target for typos; otherwise, please submit an issue or pull request!
+ #warning "unknown build target" + #define ROCPRIM_TARGET_UNKNOWN 1 +#endif + +// Unknown targets do not support 128-bit atomics: limit them to 8 bytes, else allow 16. +#if defined(ROCPRIM_TARGET_UNKNOWN) + #define ROCPRIM_MAX_ATOMIC_SIZE 8 +#else + #define ROCPRIM_MAX_ATOMIC_SIZE 16 #endif // DPP is supported only after Volcanic Islands (GFX8+) diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 0376ec4d1..d49d0015c 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -108,11 +108,13 @@ enum class lookback_scan_determinism default_determinism = nondeterministic, }; +constexpr const int MAX_PAYLOAD_SIZE = ROCPRIM_MAX_ATOMIC_SIZE - 1; + // lookback_scan_state object keeps track of prefixes status for // a look-back prefix scan. Initially every prefix can be either // invalid (padding values) or empty. One thread in a block should // later set it to partial, and later to complete. -template +template struct lookback_scan_state; /// Reduce lanes `0-valid_items` and return the result in lane 0.