@@ -45,14 +45,14 @@ __device__ __forceinline__ void fence_release_sys()

__device__ __forceinline__ void mbarrier_init(uint64_t* addr, uint32_t const& count)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(addr)), "r"(count) : "memory");
#endif
}

__device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_t txCount)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
        :
        : "r"(__as_ptr_smem(addr)), "r"(txCount)
@@ -62,7 +62,7 @@ __device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_

__device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    uint64_t state;
    asm("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(state) : "r"(__as_ptr_smem(addr)) : "memory");
    return state;
@@ -73,7 +73,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)

__device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, const uint32_t txCount)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    uint64_t state;
    asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
        : "=l"(state)
@@ -87,7 +87,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, co

__device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_t const& phaseParity)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    uint32_t waitComplete;
    asm("{\n\t.reg .pred P_OUT;\n\t"
        "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2;\n\t"
@@ -105,7 +105,7 @@ __device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_
template <int COPY_SIZE = 4>
__device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool predGuard)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile(
        "{\n"
        "   .reg .pred p;\n"
@@ -118,22 +118,22 @@ __device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool pred

__device__ __forceinline__ void cp_async_commit_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile("cp.async.commit_group;");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_wait_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile("cp.async.wait_group %0;" : : "n"(N) : "memory");
#endif
}

__device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcMem, int copySize, uint64_t* smemBar)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
        :
        : "r"(__as_ptr_smem(dstMem)), "l"(__as_ptr_gmem(srcMem)), "r"(copySize), "r"(__as_ptr_smem(smemBar))
@@ -143,7 +143,7 @@ __device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcM

__device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcMem, int copySize)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;"
        :
        : "l"(__as_ptr_gmem(dstMem)), "r"(__as_ptr_smem(srcMem)), "r"(copySize)
@@ -153,23 +153,23 @@ __device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcM

__device__ __forceinline__ void cp_async_bulk_commit_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.commit_group;");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_bulk_wait_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.wait_group %0;" : : "n"(N) : "memory");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_bulk_wait_group_read()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(N) : "memory");
#endif
}
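Usage sketch (illustrative, not part of this commit): the Hopper-class bulk helpers pair with the mbarrier functions. The arriving thread registers the expected transaction byte count and launches a bulk copy that completes against the same barrier; the shared-to-global writeback is then tracked through a bulk group. A minimal sketch under sm_90, with hypothetical names and sizes, assuming src and dst are 16-byte aligned (cp.async.bulk requires 16-byte alignment and a size that is a multiple of 16 bytes):

```cuda
// Illustrative kernel, not from this commit: TMA-style bulk copy in,
// compute on the staged tile, bulk copy out.
__global__ void bulk_copy_demo(int const* src, int* dst)
{
    constexpr int kBytes = 128 * sizeof(int);
    __shared__ alignas(16) int tile[128];
    __shared__ uint64_t bar;

    if (threadIdx.x == 0)
    {
        mbarrier_init(&bar, 1); // only thread 0 arrives; data completes via the tx count
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        // Arrive and register kBytes of expected async traffic in one step,
        // then launch the bulk copy that completes against the same barrier.
        (void) mbarrier_arrive_expect_tx(&bar, kBytes);
        cp_async_bulk_g2s(tile, src, kBytes, &bar);
    }
    while (!mbarrier_try_wait_parity(&bar, 0)) // phase 0 completes once the data lands
    {
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        tile[0] += 1;                         // placeholder compute on the staged tile
        cp_async_bulk_s2g(dst, tile, kBytes); // shared -> global writeback
        cp_async_bulk_commit_group();
        cp_async_bulk_wait_group_read<0>();   // the tile is safe to reuse after this
    }
}
```

Note the division of labor in the sketch: the arrival count covers the single arriving thread, while the copied bytes are accounted for by the expect_tx transaction count, which is what lets a one-thread-issued bulk copy release a whole block of waiters.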