4 changes: 4 additions & 0 deletions tests/model_optimizations/test_tinygemm2.py
@@ -3,6 +3,10 @@
import torch.nn.functional as F
from flashinfer.utils import get_compute_capability

pytestmark = pytest.mark.skip(
    reason="tinygemm2 hangs on CI H100 runners - investigation in progress"
)
Comment on lines +6 to +8

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify whether tinygemm2 is globally skipped vs conditionally skipped.
# Expected: no unconditional module-level "pytest.mark.skip(" in this file.
rg -n --type=py '^\s*pytestmark\s*=\s*pytest\.mark\.skip\(' tests/model_optimizations/test_tinygemm2.py
rg -n --type=py '^\s*pytestmark\s*=\s*pytest\.mark\.skipif\(' tests/model_optimizations/test_tinygemm2.py
rg -n --type=py 'get_compute_capability|is_sm90a_supported|is_sm100a_supported' tests/model_optimizations/test_tinygemm2.py

Repository: flashinfer-ai/flashinfer

Length of output: 211


Replace global module skip with conditional CI/H100 skip.

The module-level pytestmark on line 6 disables every tinygemm2 test on every platform, removing useful regression coverage. Per the coding guidelines, use flashinfer.utils.get_compute_capability() to skip only in the problematic CI H100 environment.

Proposed change
+import os
 import torch
 import pytest
 import torch.nn.functional as F
 from flashinfer.utils import get_compute_capability
 
-pytestmark = pytest.mark.skip(
-    reason="tinygemm2 hangs on CI H100 runners - investigation in progress"
-)
+def _is_ci_h100():
+    cc = get_compute_capability(torch.device("cuda"))
+    return os.getenv("CI") == "true" and cc[0] == 9
+
+pytestmark = pytest.mark.skipif(
+    _is_ci_h100(),
+    reason="tinygemm2 hangs on CI H100 runners - investigation in progress",
+)
πŸ“ Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-pytestmark = pytest.mark.skip(
-    reason="tinygemm2 hangs on CI H100 runners - investigation in progress"
-)
+import os
+import torch
+import pytest
+import torch.nn.functional as F
+from flashinfer.utils import get_compute_capability
+
+def _is_ci_h100():
+    cc = get_compute_capability(torch.device("cuda"))
+    return os.getenv("CI") == "true" and cc[0] == 9
+
+pytestmark = pytest.mark.skipif(
+    _is_ci_h100(),
+    reason="tinygemm2 hangs on CI H100 runners - investigation in progress",
+)
πŸ€– Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `tests/model_optimizations/test_tinygemm2.py` around lines 6-8, replace the
unconditional module-wide skip with a conditional one: remove the current
pytestmark = pytest.mark.skip(...) and instead set pytestmark =
pytest.mark.skipif(<condition>, reason=...), where <condition> uses
flashinfer.utils.get_compute_capability() to detect an SM 9.x (H100) device
(and optionally checks that os.environ.get("CI") is truthy) so the tests still
run locally; import get_compute_capability from flashinfer.utils, and os if you
use the CI env check.
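One caveat worth noting (an editorial sketch, not part of the bot's suggestion): the skipif condition here is computed when the module is imported for collection, so on a runner without a visible CUDA device the call into get_compute_capability would likely fail before anything could be skipped. A defensive variant, assuming the same "CI" environment variable used above, might look like this:

import os

import pytest
import torch

from flashinfer.utils import get_compute_capability


def _is_ci_h100():
    # Collection-time guard: with no CUDA device, defer to the per-test
    # SM90 checks (e.g. _skip_if_not_sm90 below) rather than failing here.
    if not torch.cuda.is_available():
        return False
    cc = get_compute_capability(torch.device("cuda"))
    # Major version 9 covers SM 9.x (Hopper / H100); "CI" is an assumed
    # environment variable, mirroring the suggestion above.
    return os.getenv("CI") == "true" and cc[0] == 9


pytestmark = pytest.mark.skipif(
    _is_ci_h100(),
    reason="tinygemm2 hangs on CI H100 runners - investigation in progress",
)

Running CI=true pytest tests/model_optimizations/test_tinygemm2.py on an H100 box should then report the module as skipped, while the same command without CI set keeps the coverage.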



def _skip_if_not_sm90():
    cc = get_compute_capability(torch.device("cuda"))
4 changes: 4 additions & 0 deletions tests/moe/test_trtllm_gen_fused_moe.py
@@ -3475,6 +3475,7 @@ def test_mxfp8_block_scale_moe_relu2_non_gated(
        weight_processing=weight_processing,
        activation_type=ActivationType.Relu2,
        cache_permute_indices=cache_permute_indices,
        logits_dtype=torch.bfloat16,
        zero_hidden_states=zero_hidden_states,
    )

@@ -3510,6 +3511,7 @@ def test_mxfp8_block_scale_moe_relu2_deepseekv3_topk22(cache_permute_indices):
        },
        activation_type=ActivationType.Relu2,
        cache_permute_indices=cache_permute_indices,
        logits_dtype=torch.float32,
    )


@@ -3598,6 +3600,7 @@ def test_fp8_block_scale_autotune_valid_configs(autotune_case, cache_permute_ind
        },
        activation_type=autotune_case["activation_type"],
        cache_permute_indices=cache_permute_indices,
        logits_dtype=torch.float32,
        zero_hidden_states=False,
    )

@@ -3659,6 +3662,7 @@ def test_fp8_per_tensor_autotune_valid_configs_nonefp8(
        },
        activation_type=autotune_case["activation_type"],
        cache_permute_indices=cache_permute_indices,
        logits_dtype=torch.bfloat16,
        zero_hidden_states=False,
    )
