Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions rocprim/include/rocprim/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,28 +77,42 @@
#undef ROCPRIM_TARGET_CDNA1
#undef ROCPRIM_TARGET_CDNA2
#undef ROCPRIM_TARGET_CDNA3
#undef ROCPRIM_TARGET_UNKNOWN

// See https://llvm.org/docs/AMDGPUUsage.html#instructions
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx9_4_generic__)
#define ROCPRIM_TARGET_CDNA3 1
#elif defined(__gfx90a__)
#define ROCPRIM_TARGET_CDNA2 1
#elif defined(__gfx908__)
#define ROCPRIM_TARGET_CDNA1 1
#elif defined(__gfx900__) || defined(__gfx902__) || defined(__gfx904__) || defined(__gfx906__) \
|| defined(__gfx90c__)
|| defined(__gfx90c__) || defined(__gfx9_generic__)
#define ROCPRIM_TARGET_GCN5 1
#elif defined(__GFX12__)
#elif defined(__GFX12__) || defined(__gfx12_generic__)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does no harm to do this, but I will note that the __GFX12__ macro is defined whenever __gfx12_generic__ is defined.

#define ROCPRIM_TARGET_RDNA4 1
#elif defined(__GFX11__)
#elif defined(__GFX11__) || defined(__gfx11_generic__)
Comment on lines -93 to +94
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does no harm to do this, but I will note that the __GFX11__ macro is defined whenever __gfx11_generic__ is defined.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#define ROCPRIM_TARGET_RDNA3 1
#elif defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) \
|| defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
|| defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) \
|| defined(__gfx10_3_generic__)
#define ROCPRIM_TARGET_RDNA2 1
#elif defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || defined(__gfx1013__)
#elif defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || defined(__gfx1013__) \
|| defined(__gfx10_1_generic__)
#define ROCPRIM_TARGET_RDNA1 1
#elif defined(__GFX8__)
#define ROCPRIM_TARGET_GCN3 1
#elif defined(__HIP_DEVICE_COMPILE__)
// Double-check the build target for typos; otherwise, please submit an issue or pull request!
#warning "unknown build target"
#define ROCPRIM_TARGET_UNKNOWN 1
#endif

// ROCPRIM_MAX_ATOMIC_SIZE: widest atomic operand size, in bytes.
// Unknown targets do not support 128-bit atomics, so they are limited to
// 8 bytes; every other case gets 16.
// The guard must spell ROCPRIM_TARGET_UNKNOWN exactly: `defined()` on a
// misspelled name is silently false and would always select the 16-byte branch.
#if defined(ROCPRIM_TARGET_UNKNOWN)
#define ROCPRIM_MAX_ATOMIC_SIZE 8
#else
#define ROCPRIM_MAX_ATOMIC_SIZE 16
#endif

// DPP is supported only after Volcanic Islands (GFX8+)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,13 @@ enum class lookback_scan_determinism
default_determinism = nondeterministic,
};

// Upper bound, in bytes, on the size of a type for which lookback_scan_state
// defaults IsSmall to true: one byte less than ROCPRIM_MAX_ATOMIC_SIZE.
// NOTE(review): presumably the remaining byte holds the prefix status flag - confirm.
constexpr int MAX_PAYLOAD_SIZE = ROCPRIM_MAX_ATOMIC_SIZE - 1;

// lookback_scan_state keeps track of the status of each prefix in
// a look-back prefix scan. Initially every prefix is either
// invalid (padding values) or empty. One thread in a block should
// later set it to partial, and after that to complete.
//
// Template parameters:
//   T        - type the state is instantiated for; its size selects IsSmall.
//   UseSleep - defaults to false.
//              NOTE(review): presumably enables sleeping while waiting on a
//              prefix - confirm against the definition.
//   IsSmall  - defaults to (sizeof(T) <= MAX_PAYLOAD_SIZE); this bound was
//              previously written as the literal 15.
template<class T, bool UseSleep = false, bool IsSmall = (sizeof(T) <= 15)>
template<class T, bool UseSleep = false, bool IsSmall = (sizeof(T) <= MAX_PAYLOAD_SIZE)>
struct lookback_scan_state;

/// Reduce lanes `0-valid_items` and return the result in lane 0.
Expand Down