Skip to content

Commit c9360c9

Browse files
committed
fix comiple and skip tests cleanup
Signed-off-by: Dongxu Yang <[email protected]>
1 parent 5ceb287 commit c9360c9

File tree

3 files changed

+20
-16
lines changed

3 files changed

+20
-16
lines changed

cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,14 @@ __device__ __forceinline__ void fence_release_sys()
4545

4646
__device__ __forceinline__ void mbarrier_init(uint64_t* addr, uint32_t const& count)
4747
{
48-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
48+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
4949
asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(addr)), "r"(count) : "memory");
5050
#endif
5151
}
5252

5353
__device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_t txCount)
5454
{
55-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
55+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
5656
asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
5757
:
5858
: "r"(__as_ptr_smem(addr)), "r"(txCount)
@@ -62,7 +62,7 @@ __device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_
6262

6363
__device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)
6464
{
65-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
65+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
6666
uint64_t state;
6767
asm("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(state) : "r"(__as_ptr_smem(addr)) : "memory");
6868
return state;
@@ -73,7 +73,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)
7373

7474
__device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, const uint32_t txCount)
7575
{
76-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
76+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
7777
uint64_t state;
7878
asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
7979
: "=l"(state)
@@ -87,7 +87,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, co
8787

8888
__device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_t const& phaseParity)
8989
{
90-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
90+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
9191
uint32_t waitComplete;
9292
asm("{\n\t .reg .pred P_OUT; \n\t"
9393
"mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2;\n\t"
@@ -105,7 +105,7 @@ __device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_
105105
template <int COPY_SIZE = 4>
106106
__device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool predGuard)
107107
{
108-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
108+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
109109
asm volatile(
110110
"{\n"
111111
" .reg .pred p;\n"
@@ -118,22 +118,22 @@ __device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool pred
118118

119119
__device__ __forceinline__ void cp_async_commit_group()
120120
{
121-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
121+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
122122
asm volatile("cp.async.commit_group;" : : :);
123123
#endif
124124
}
125125

126126
template <int N = 0>
127127
__device__ __forceinline__ void cp_async_wait_group()
128128
{
129-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
129+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
130130
asm volatile("cp.async.wait_group %0;" : : "n"(N) : "memory");
131131
#endif
132132
}
133133

134134
__device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcMem, int copySize, uint64_t* smemBar)
135135
{
136-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
136+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
137137
asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
138138
:
139139
: "r"(__as_ptr_smem(dstMem)), "l"(__as_ptr_gmem(srcMem)), "r"(copySize), "r"(__as_ptr_smem(smemBar))
@@ -143,7 +143,7 @@ __device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcM
143143

144144
__device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcMem, int copySize)
145145
{
146-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
146+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
147147
asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;"
148148
:
149149
: "l"(__as_ptr_gmem(dstMem)), "r"(__as_ptr_smem(srcMem)), "r"(copySize)
@@ -153,23 +153,23 @@ __device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcM
153153

154154
__device__ __forceinline__ void cp_async_bulk_commit_group()
155155
{
156-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
156+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
157157
asm volatile("cp.async.bulk.commit_group;" : : :);
158158
#endif
159159
}
160160

161161
template <int N = 0>
162162
__device__ __forceinline__ void cp_async_bulk_wait_group()
163163
{
164-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
164+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
165165
asm volatile("cp.async.bulk.wait_group %0;" : : "n"(N) : "memory");
166166
#endif
167167
}
168168

169169
template <int N = 0>
170170
__device__ __forceinline__ void cp_async_bulk_wait_group_read()
171171
{
172-
#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
172+
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
173173
asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(N) : "memory");
174174
#endif
175175
}

cpp/tensorrt_llm/kernels/moePrepareKernels.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,6 @@ __global__ void computeCumsumDevice(int* sendCountsCumsum, int* recvCountsCumsum
243243

244244
int tid = threadIdx.x;
245245
int threadData = tid < rankCount ? inputOutputPtr[tid] : 0;
246-
int count = threadData;
247246
__syncthreads();
248247

249248
BlockScan(temp_storage).InclusiveSum(threadData, threadData);

cpp/tests/unit_tests/kernels/fusedMoeCommKernelTest.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class FusedMoeCommTestBase : public ::testing::Test
5050
{
5151
if (shouldSkip())
5252
{
53+
skipped = true;
5354
GTEST_SKIP() << "Skipping due to no/unsupported GPU";
5455
}
5556
TLLM_CUDA_CHECK(cudaStreamCreate(&stream));
@@ -58,10 +59,14 @@ class FusedMoeCommTestBase : public ::testing::Test
5859

5960
void TearDown() override
6061
{
61-
TLLM_CUDA_CHECK(cudaStreamDestroy(stream));
62+
if (!skipped)
63+
{
64+
TLLM_CUDA_CHECK(cudaStreamDestroy(stream));
65+
}
6266
}
6367

64-
cudaStream_t stream;
68+
bool skipped = false;
69+
cudaStream_t stream = nullptr;
6570

6671
// Helper function to allocate and initialize test data
6772
template <typename T>

0 commit comments

Comments
 (0)