-
Notifications
You must be signed in to change notification settings - Fork 3.3k
[PD] Add simple unit test for disaggregation feature #5654
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
40 commits
Select commit
Hold shift + click to select a range
ff45cbe
Add simple unit test for disaggregation feature
ShangmingCai 3cc7ed1
Merge branch 'main' into add_pd_test
ShangmingCai f8afe9c
Add disaggregation test into run_suite
ShangmingCai b27efb4
Fix ci dependency for mooncake
ShangmingCai 6b08ed8
fix dependency
ShangmingCai 98a0329
minor
ShangmingCai 1581015
fix rdma dependency
ShangmingCai 4afe3fe
fix lint
ShangmingCai 84a6251
fix tp size
ShangmingCai dc79d56
fix lint
ShangmingCai 777bfca
fix model tp
ShangmingCai 69c66c7
tmp check ci env
ShangmingCai 7f016b5
Merge branch 'main' into add_pd_test
ShangmingCai 76407ef
fix dependency
ShangmingCai a9c38ab
Add a new job in pr-test
ShangmingCai cb79c0b
Merge branch 'main' into add_pd_test
ShangmingCai d01fe2c
check driver
ShangmingCai 5f7da22
Add rdma dependencies
ShangmingCai 236c557
Fix tzdata install
ShangmingCai 18ab9a4
Fix tzdata again
ShangmingCai 8883bf3
fix
ShangmingCai 3098146
more
ShangmingCai c788ac5
more
ShangmingCai 82bcade
more
ShangmingCai 0698a9a
more
ShangmingCai 4c3572c
more
ShangmingCai c8fe5b8
more
ShangmingCai 3a051f7
more
ShangmingCai a4b4d09
more
ShangmingCai a51a0fe
Merge branch 'main' into add_pd_test
ShangmingCai 4d814ba
clean script
ShangmingCai 8250951
fix merge
ShangmingCai ca47625
Merge branch 'main' into add_pd_test
ShangmingCai d38f860
fix pr-test.yaml
ShangmingCai 0a6dcd3
more
ShangmingCai 37d4e83
use 8 gpu runner
ShangmingCai c6c2f69
Merge branch 'main' into add_pd_test
ShangmingCai db7c365
tmp enlarge timeout to verify correctness
ShangmingCai b034a91
separate pd test
ShangmingCai dc85c95
Done
ShangmingCai File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import subprocess | ||
| import threading | ||
| import time | ||
| import unittest | ||
| from types import SimpleNamespace | ||
|
|
||
| import requests | ||
| import torch | ||
|
|
||
| from sglang.srt.utils import kill_process_tree | ||
| from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k | ||
| from sglang.test.test_utils import ( | ||
| DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, | ||
| DEFAULT_URL_FOR_TEST, | ||
| CustomTestCase, | ||
| popen_launch_pd_server, | ||
| run_with_timeout, | ||
| ) | ||
|
|
||
|
|
||
class TestDisaggregationMooncake(CustomTestCase):
    """End-to-end accuracy check for prefill/decode disaggregation.

    ``setUpClass`` launches one prefill server, one decode server and the
    ``sglang.srt.disaggregation.mini_lb`` load balancer in front of them.
    ``test_gsm8k`` then runs the few-shot GSM8K evaluation through the load
    balancer and asserts a minimum accuracy.
    """

    # Subprocess handles. Declared up front so that cleanup is safe even when
    # setUpClass fails before every process has been started.
    process_lb = None
    process_prefill = None
    process_decode = None

    # Upper bound (seconds) for a single health probe inside wait_server_ready.
    _HEALTH_PROBE_TIMEOUT = 5

    @classmethod
    def setUpClass(cls):
        """Compute the endpoints and bring up prefill, decode and the LB.

        Raises:
            Whatever the launch helpers raise. Any process that was already
            started is killed before the exception propagates.
        """
        cls.model = "lmsys/sglang-ci-dsv3-test"
        cls.base_host = "127.0.0.1"
        cls.base_port = int(DEFAULT_URL_FOR_TEST.split(":")[-1])
        cls.lb_url = DEFAULT_URL_FOR_TEST
        # Prefill and decode listen on fixed offsets from the base port so
        # they do not collide with the load balancer, which uses the base port.
        cls.prefill_url = f"http://{cls.base_host}:{cls.base_port + 100}"
        cls.decode_url = f"http://{cls.base_host}:{cls.base_port + 200}"

        try:
            cls._launch_all()
        except BaseException:
            # unittest does not invoke tearDownClass when setUpClass raises,
            # so clean up here to avoid leaking server processes.
            cls.tearDownClass()
            raise

    @classmethod
    def _launch_all(cls):
        """Start both servers, wait for them, then start the load balancer."""
        run_with_timeout(cls.start_prefill, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH)
        run_with_timeout(cls.start_decode, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH)

        cls.wait_server_ready(cls.prefill_url + "/health")
        cls.wait_server_ready(cls.decode_url + "/health")

        lb_command = [
            "python3",
            "-m",
            "sglang.srt.disaggregation.mini_lb",
            "--prefill",
            cls.prefill_url,
            "--decode",
            cls.decode_url,
            "--host",
            cls.base_host,
            "--port",
            str(cls.base_port),
        ]

        print("Starting load balancer:", " ".join(lb_command))
        # Inherit the parent's stdout/stderr. Piping them without ever reading
        # would let the child block once the OS pipe buffer fills up.
        cls.process_lb = subprocess.Popen(lb_command, stdout=None, stderr=None)
        cls.wait_server_ready(cls.lb_url + "/health")

    @classmethod
    def start_prefill(cls):
        """Launch the prefill server and store its handle in ``process_prefill``."""
        prefill_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "prefill",
            "--host",
            cls.base_host,
            "--port",
            str(cls.base_port + 100),
        ]
        cls.process_prefill = popen_launch_pd_server(
            cls.model,
            cls.prefill_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=prefill_args,
        )

    @classmethod
    def start_decode(cls):
        """Launch the decode server and store its handle in ``process_decode``."""
        decode_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "decode",
            "--host",
            cls.base_host,
            "--port",
            str(cls.base_port + 200),
            # NOTE(review): presumably keeps decode off the GPU used by
            # prefill -- confirm against the server's --base-gpu-id semantics.
            "--base-gpu-id",
            "1",
        ]
        cls.process_decode = popen_launch_pd_server(
            cls.model,
            cls.decode_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=decode_args,
        )

    @classmethod
    def wait_server_ready(cls, url, timeout=60):
        """Poll ``url`` once per second until it answers with HTTP 200.

        Args:
            url: Health endpoint to probe.
            timeout: Overall budget in seconds before giving up.

        Raises:
            RuntimeError: If no 200 response is seen within ``timeout``.
        """
        deadline = time.monotonic() + timeout
        while True:
            try:
                # Bound each probe so a server that accepts the connection but
                # never replies cannot stall past the overall deadline.
                response = requests.get(url, timeout=cls._HEALTH_PROBE_TIMEOUT)
                if response.status_code == 200:
                    print(f"Server {url} is ready")
                    return
            except requests.RequestException:
                # Not reachable yet; fall through and retry.
                pass

            if time.monotonic() > deadline:
                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
            time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Kill every launched process tree; safe to call more than once."""
        for attr in ("process_lb", "process_decode", "process_prefill"):
            process = getattr(cls, attr, None)
            if process is None:
                continue
            try:
                kill_process_tree(process.pid)
            except Exception as e:
                # Best effort: keep going so the remaining processes are
                # still cleaned up.
                print(f"Error killing process {process.pid}: {e}")
            # Forget the handle so a repeated call does not kill it again.
            setattr(cls, attr, None)

    def test_gsm8k(self):
        """Run few-shot GSM8K through the load balancer and check accuracy."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host=f"http://{self.base_host}",
            port=self.base_port,
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"Evaluation metrics: {metrics}")

        self.assertGreater(metrics["accuracy"], 0.62)


if __name__ == "__main__":
    unittest.main()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For faster CI, can we use a small llama model within a node? (maybe tp4+tp4)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test model is already a lightweight one.