torch-spyre · prashantgupta24 · Jul 11, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 11, 2025
@@ -125,7 +125,7 @@ def remote_openai_server(request):
 
         if 'tp_size' in params:
             tp_size = params['tp_size']
-            skip_unsupported_tp_size(int(tp_size))
+            skip_unsupported_tp_size(int(tp_size), backend)
         server_args.extend(["--tensor-parallel-size", str(tp_size)])
 
     try:

@@ -23,7 +23,8 @@
     pytest.param(2, marks=pytest.mark.multi),
     pytest.param(4, marks=pytest.mark.multi),
     pytest.param(8, marks=pytest.mark.multi),
-])
+],
+                         ids=lambda val: f"TP({val})")
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
@@ -45,7 +46,7 @@ def test_output(
     After debugging, DISABLE_ASSERTS should be reset to 'False'.
     '''
 
-    skip_unsupported_tp_size(tp_size)
+    skip_unsupported_tp_size(tp_size, backend)
 
     prompts = get_chicken_soup_prompts(4)
 

@@ -9,7 +9,8 @@
     pytest.param(2, marks=pytest.mark.multi),
     pytest.param(4, marks=pytest.mark.multi),
     pytest.param(8, marks=pytest.mark.multi),
-])
+],
+                         ids=lambda val: f"TP({val})")
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("warmup_shape", [[
     (64, 20, 1),

@@ -19,10 +19,11 @@
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("tp_size", [
-    pytest.param(1, id="tp_size"),
-    pytest.param(2, marks=pytest.mark.multi, id="tp_size"),
-    pytest.param(4, marks=pytest.mark.multi, id="tp_size")
-])
+    pytest.param(1),
+    pytest.param(2, marks=pytest.mark.multi),
+    pytest.param(4, marks=pytest.mark.multi)
+],
+                         ids=lambda val: f"TP({val})")
 def test_prompt_logprobs(
     backend: str,
     model: str,
@@ -33,7 +34,7 @@ def test_prompt_logprobs(
     This test checks the prompt_logprobs output from vllm against a reference
     implementation using huggingface.
     '''
-    skip_unsupported_tp_size(tp_size)
+    skip_unsupported_tp_size(tp_size, backend)
     num_prompt_logprobs = 5
 
     prompts = get_chicken_soup_prompts(4)

@@ -548,7 +548,13 @@ def create_random_request(
                              **extra_kwargs)
 
 
-def skip_unsupported_tp_size(size: int):
+def skip_unsupported_tp_size(size: int, backend: str):
+    if backend in ["eager", "inductor"]:
+        # Spyre cards aren't required to run TP on the CPU backends,
+        # but TP sizes > 2 are very slow there, so those are skipped.
+        if size > 2:
+            pytest.skip("Skipping TP test on CPU with TP size > 2")
+        return
     cards = int(os.getenv("AIU_WORLD_SIZE", "0"))
     if cards < size:
         pytest.skip(f"Cannot run TP size {size}: "

@@ -397,10 +397,6 @@ def execute_model(
                                        masks=model_input.input_masks,
                                        is_prompt=model_input.is_prompt)
 
-        # Only perform sampling in the driver worker.
-        if not self.is_driver_worker:
-            return EMPTY_MODEL_RUNNER_OUTPUT
-
         # Compute the logits.
         logits = self.model.compute_logits(hidden_states, None)
 
@@ -434,6 +430,10 @@ def execute_model(
         prompt_logprobs_dicts = self._get_prompt_logprobs_dict(
             logits=logits, model_inputs=model_input)
 
+        # Only return outputs from the driver worker
+        if not self.is_driver_worker:
+            return EMPTY_MODEL_RUNNER_OUTPUT
+
         model_output = ModelRunnerOutput(
             req_ids=list(req_id_to_index.keys()),
             req_id_to_index=req_id_to_index,