vllm-project · geeker-smallwhite · Mar 13, 2024
diff --git a/tests/engine/test_stop.py b/tests/engine/test_stop.py
@@ -0,0 +1,33 @@
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.sampling_params import SamplingParams
+
+PROMPT = '''def print_prime(n):
+   """
+   Print all primes between 1 and n
+   """'''
+
+
+# parametrize iterates over its argvalues, so a single value must be wrapped
+# in a list; a bare string would produce one test case per character.
+@pytest.mark.parametrize("model", ["meta-llama/Llama-2-7b-hf"])
+@pytest.mark.parametrize("prompt", [PROMPT])
+@pytest.mark.parametrize("stop", [' ', 'for'])
+def test_generate_stop(model, prompt, stop):
+    """Check that the stop string never appears in the final output text."""
+    engine_args = EngineArgs(model=model, enable_prefix_caching=True)
+    engine = LLMEngine.from_engine_args(engine_args)
+    sampling_params = SamplingParams(stop=stop)
+    engine.add_request("0", prompt, sampling_params)
+
+    # add_request only queues the prompt; step the engine until it finishes.
+    final_output = None
+    while engine.has_unfinished_requests():
+        for request_output in engine.step():
+            final_output = request_output
+
+    assert final_output is not None
+    for completion in final_output.outputs:
+        assert stop not in completion.text
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
@@ -776,11 +776,10 @@ def _finalize_sequence(self, seq: Sequence,
                            stop_string: str) -> None:
         if sampling_params.include_stop_str_in_output:
             return
-
-        if stop_string and seq.output_text.endswith(stop_string):
-            # Truncate the output text so that the stop string is
-            # not included in the output.
-            seq.output_text = seq.output_text[:-len(stop_string)]
+        if stop_string:
+            index = seq.output_text.find(stop_string)
+            if index >= 0:
+                seq.output_text = seq.output_text[:index]
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)