
[core][torch.compile] not compile for profiling #7796

Merged 17 commits on Aug 27, 2024
3 changes: 1 addition & 2 deletions .buildkite/run-tpu-test.sh
@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 /workspace/vllm/tests/tpu/test_compilation.py"
23 changes: 23 additions & 0 deletions tests/tpu/test_compilation.py
@@ -0,0 +1,23 @@
import glob
import os
import runpy
import tempfile

import depyf

temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
cur_dir = os.path.dirname(__file__)
parent_dir = os.path.dirname(cur_dir)
root_dir = os.path.dirname(parent_dir)
example_file = os.path.join(root_dir, "examples",
"offline_inference_tpu.py")
runpy.run_path(example_file)

compiled_code = glob.glob(os.path.join(temp_dir, "__transformed_code*.py"))
# we should only trigger Dynamo compilation twice,
# one for the prefill phase, and one for the decode phase.
# the graphs will have symbolic shapes, and later calls should
# not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation.
assert len(compiled_code) == 2
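Aside: a minimal standalone sketch of the behavior the assertion relies on (the toy function and shapes below are illustrative, not from this PR): once Dynamo traces a graph with symbolic shapes, later calls with different sizes reuse it instead of recompiling.

import torch

def fn(x):
    return x * 2 + 1

# dynamic=True asks Dynamo for a single symbolic-shape graph up front
compiled_fn = torch.compile(fn, backend="eager", dynamic=True)
compiled_fn(torch.randn(4, 8))   # first call triggers Dynamo compilation
compiled_fn(torch.randn(16, 8))  # new batch size reuses the graph, no recompilation

The test applies the same idea end to end: one compilation for prefill and one for decode, no matter how many requests the example script issues (XLA may still compile for new shapes, as the comment above notes).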
12 changes: 10 additions & 2 deletions vllm/worker/model_runner.py
@@ -946,6 +946,7 @@ def load_model(self) -> None:
"This may lead to less accurate results!")

if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE:
self._not_compiled_model = self.model
self.model = torch.compile(self.model,
fullgraph=True,
backend="eager")
@@ -1094,7 +1095,11 @@ def profile_run(self) -> None:
batch_size=batch_size,
dtype=self.model_config.dtype,
device=self.device)
self.execute_model(model_input, kv_caches, intermediate_tensors)
# no compilation is needed for profiling memory
self.execute_model(model_input,
kv_caches,
intermediate_tensors,
compiled=False)
torch.cuda.synchronize()
return

@@ -1367,6 +1372,7 @@ def execute_model(
kv_caches: List[torch.Tensor],
intermediate_tensors: Optional[IntermediateTensors] = None,
num_steps: int = 1,
compiled: bool = True,
) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]:
if num_steps > 1:
raise ValueError("num_steps > 1 is not supported in ModelRunner")
@@ -1398,8 +1404,10 @@ def execute_model(
graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[virtual_engine][
graph_batch_size]
else:
elif compiled:
model_executable = self.model
else:
model_executable = self.not_compiled_model

multi_modal_kwargs = model_input.multi_modal_kwargs or {}
seqlen_agnostic_kwargs = {
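Aside on the compiled=False path above: calling a torch.compile'd module traces and compiles a graph for the incoming shapes, which is wasted work for a one-off memory-profiling pass, while the eager module runs with no capture at all. A tiny illustration (the module and sizes are hypothetical, not vLLM code):

import torch

model = torch.nn.Linear(8, 8)
compiled = torch.compile(model, backend="eager")

with torch.no_grad():
    model(torch.randn(256, 8))     # eager module: no Dynamo tracing, fine for profiling
    compiled(torch.randn(256, 8))  # compiled module: traces a graph just for this shape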
11 changes: 11 additions & 0 deletions vllm/worker/model_runner_base.py
@@ -202,6 +202,17 @@ def execute_model(
"""
raise NotImplementedError

@property
def not_compiled_model(self):
"""
Return the model that is not compiled.
"""
if hasattr(self, "_not_compiled_model"):
return self._not_compiled_model
if hasattr(self, "model"):
return self.model
raise RuntimeError("No model found")

def get_generators(self, finished_request_ids: Optional[List[str]] = None):
"""
Return dict of per-request generators used for random sampling.
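Aside: a minimal usage sketch of the not_compiled_model fallback (the runner class and layer sizes are hypothetical). Runners that wrap their model in torch.compile stash the eager module first; runners that never compile simply fall back to self.model.

import torch

class TinyRunner:
    def load_model(self):
        self.model = torch.nn.Linear(8, 8)
        self._not_compiled_model = self.model  # keep a handle to the eager module
        self.model = torch.compile(self.model, backend="eager")

    @property
    def not_compiled_model(self):
        # mirrors the fallback logic added in model_runner_base.py
        return getattr(self, "_not_compiled_model", self.model)

runner = TinyRunner()
runner.load_model()
assert runner.not_compiled_model is not runner.model  # eager module vs. compiled wrapper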
9 changes: 7 additions & 2 deletions vllm/worker/tpu_model_runner.py
@@ -145,6 +145,7 @@ def load_model(self) -> None:
model = model.eval()
xm.wait_device_ops()
model = ModelWrapper(model)
self._not_compiled_model = model
self.model = torch.compile(model,
backend="openxla",
fullgraph=True,
@@ -156,6 +157,7 @@ def _dummy_run(
seq_len: int,
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
is_prompt: bool,
compiled: bool = True,
) -> None:
if is_prompt:
seq_len = (seq_len + 15) // 16 * 16
@@ -235,7 +237,8 @@ def _dummy_run(
torch._dynamo.mark_dynamic(t, 0)
torch._dynamo.mark_dynamic(p, 0)
# Dummy run.
self.model(token_ids, position_ids, attn_metadata, input_lens, t, p,
executable = self.model if compiled else self.not_compiled_model
executable(token_ids, position_ids, attn_metadata, input_lens, t, p,
num_samples, kv_caches)

def warmup_model(
@@ -510,6 +513,7 @@ def execute_model(
kv_caches: Optional[List[Any]],
intermediate_tensors: Optional[IntermediateTensors] = None,
num_steps: int = 1,
compiled: bool = True,
) -> List[SamplerOutput]:
assert intermediate_tensors is None
if num_steps > 1:
@@ -530,7 +534,8 @@ def _execute_model(*args):
if getattr(arg, "context_lens", None) is not None:
arg.context_lens = arg.context_lens.to(self.device)
new_args.append(arg)
return self.model(*new_args)
executable = self.model if compiled else self.not_compiled_model
return executable(*new_args)

num_prefills = model_input.attn_metadata.num_prefills
is_prompt = num_prefills > 0
1 change: 1 addition & 0 deletions vllm/worker/tpu_worker.py
@@ -120,6 +120,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
seq_len=self.scheduler_config.max_num_batched_tokens,
kv_caches=kv_caches,
is_prompt=True,
compiled=False,
)
# Synchronize before measuring the memory usage.
xm.wait_device_ops()