@@ -45,14 +45,14 @@ __device__ __forceinline__ void fence_release_sys()

__device__ __forceinline__ void mbarrier_init(uint64_t* addr, uint32_t const& count)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(addr)), "r"(count) : "memory");
#endif
}

__device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_t txCount)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
        :
        : "r"(__as_ptr_smem(addr)), "r"(txCount)
@@ -62,7 +62,7 @@ __device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_

__device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    uint64_t state;
    asm("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(state) : "r"(__as_ptr_smem(addr)) : "memory");
    return state;
@@ -73,7 +73,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr)

__device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, const uint32_t txCount)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    uint64_t state;
    asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
        : "=l"(state)
@@ -87,7 +87,7 @@ __device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, co

__device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_t const& phaseParity)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    uint32_t waitComplete;
    asm("{\n\t.reg .pred P_OUT;\n\t"
        "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2;\n\t"
@@ -105,7 +105,7 @@ __device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_
template <int COPY_SIZE = 4>
__device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool predGuard)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile(
        "{\n"
        "   .reg .pred p;\n"
@@ -118,22 +118,22 @@ __device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool pred

__device__ __forceinline__ void cp_async_commit_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile("cp.async.commit_group;");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_wait_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800
    asm volatile("cp.async.wait_group %0;" : : "n"(N) : "memory");
#endif
}

__device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcMem, int copySize, uint64_t* smemBar)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
        :
        : "r"(__as_ptr_smem(dstMem)), "l"(__as_ptr_gmem(srcMem)), "r"(copySize), "r"(__as_ptr_smem(smemBar))
@@ -143,7 +143,7 @@ __device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcM

__device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcMem, int copySize)
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;"
        :
        : "l"(__as_ptr_gmem(dstMem)), "r"(__as_ptr_smem(srcMem)), "r"(copySize)
@@ -153,23 +153,23 @@ __device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcM

__device__ __forceinline__ void cp_async_bulk_commit_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.commit_group;");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_bulk_wait_group()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.wait_group %0;" : : "n"(N) : "memory");
#endif
}

template <int N = 0>
__device__ __forceinline__ void cp_async_bulk_wait_group_read()
{
-#if defined(__CUDACC__) || __CUDA_ARCH__ >= 900
+#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900
    asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(N) : "memory");
#endif
}
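Usage sketch (illustrative, not part of this commit): the Hopper-class bulk helpers pair with the mbarrier functions. The arriving thread registers the expected transaction byte count and launches a bulk copy that completes against the same barrier; the shared-to-global writeback is then tracked through a bulk group. A minimal sketch under sm_90, with hypothetical names and sizes, assuming src and dst are 16-byte aligned (cp.async.bulk requires 16-byte alignment and a size that is a multiple of 16 bytes):

```cuda
// Illustrative kernel, not from this commit: TMA-style bulk copy in,
// compute on the staged tile, bulk copy out.
__global__ void bulk_copy_demo(int const* src, int* dst)
{
    constexpr int kBytes = 128 * sizeof(int);
    __shared__ alignas(16) int tile[128];
    __shared__ uint64_t bar;

    if (threadIdx.x == 0)
    {
        mbarrier_init(&bar, 1); // only thread 0 arrives; data completes via the tx count
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        // Arrive and register kBytes of expected async traffic in one step,
        // then launch the bulk copy that completes against the same barrier.
        (void) mbarrier_arrive_expect_tx(&bar, kBytes);
        cp_async_bulk_g2s(tile, src, kBytes, &bar);
    }
    while (!mbarrier_try_wait_parity(&bar, 0)) // phase 0 completes once the data lands
    {
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        tile[0] += 1;                         // placeholder compute on the staged tile
        cp_async_bulk_s2g(dst, tile, kBytes); // shared -> global writeback
        cp_async_bulk_commit_group();
        cp_async_bulk_wait_group_read<0>();   // the tile is safe to reuse after this
    }
}
```

Note the division of labor in the sketch: the arrival count covers the single arriving thread, while the copied bytes are accounted for by the expect_tx transaction count, which is what lets a one-thread-issued bulk copy release a whole block of waiters.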