"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OpenAI-compatible API with N concurrent connections.
This simulates real-world usage of the API and makes sure
that the zmq frontend multiprocessing RPC message passing
and the AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# Despite the name, RTOL is applied as an absolute tolerance below.
RTOL = 0.03
EXPECTED_VALUE = 0.58
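
# For reference, a roughly equivalent standalone run via the lm_eval CLI
# (a sketch only, not used by this test; it assumes a vLLM server is
# already listening on localhost:8000):
#
#   lm_eval --model local-completions --tasks gsm8k \
#     --model_args model=Qwen/Qwen2-1.5B-Instruct,base_url=http://localhost:8000/v1/completions,num_concurrent=500,tokenized_requests=False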


@pytest.fixture(scope="module")
def server():
    # Launch a module-scoped vLLM server speaking the OpenAI API;
    # --enforce-eager skips CUDA graph capture for a faster startup.
    args = [
        "--max-model-len", "4096", "--enable-chunked-prefill",
        "--disable-log-requests", "--enforce-eager"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def server_data(server):
    # Expose the completions endpoint URL that lm_eval will target.
    return {
        "url": f"{server.url_for('v1')}/completions",
    }


def test_lm_eval_accuracy(server_data):
    model_args = (f"model={MODEL_NAME},"
                  f"base_url={server_data['url']},"
                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

    results = lm_eval.simple_evaluate(
        model="local-completions",
        model_args=model_args,
        tasks=TASK,
    )

    measured_value = results["results"][TASK][FILTER]
    assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}")
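

# The docstring above notes that the concurrent connections are what
# exercise the zmq RPC path. Below is a minimal sketch of the fan-out
# that lm_eval's local-completions backend effectively performs. It is
# illustrative only and never invoked by the test; the `openai` package
# and the "EMPTY" api_key are assumptions (vLLM does not check the key
# unless one is configured).
async def _concurrent_smoke(base_url: str, n: int = 8):
    import asyncio

    import openai

    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")

    async def one_request(i: int):
        return await client.completions.create(
            model=MODEL_NAME, prompt=f"1 + {i} =", max_tokens=4)

    # Fire n completions simultaneously, mirroring NUM_CONCURRENT.
    return await asyncio.gather(*(one_request(i) for i in range(n)))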