-
Notifications
You must be signed in to change notification settings - Fork 949
[BugFix] Fix Whitelist optimization CI failure #3290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6478b2c
cf65678
da464af
a9a09c1
3675d56
072b5b7
5604953
63a5558
8d39bd7
33736c6
3702299
fe78352
2f3f3fb
52215e8
cb9907e
f846d1e
9d9b720
ad83e5f
0b65ea5
f065b2e
6ebf801
7894e05
beaf1c9
065034f
b169da6
88daf61
3a41163
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -399,19 +399,41 @@ class StageDeployConfig: | |
| the top level of ``DeployConfig`` and propagated to every stage. | ||
| """ | ||
|
|
||
| # === Omni fields === | ||
| # Stage identity and Omni runtime placement. | ||
| stage_id: int | ||
|
xiaohajiayou marked this conversation as resolved.
|
||
| max_num_seqs: int = 64 | ||
| gpu_memory_utilization: float = 0.9 | ||
| tensor_parallel_size: int = 1 | ||
| enforce_eager: bool = False | ||
| max_num_batched_tokens: int = 32768 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related to earlier - if this is where the values for things like Are the changes in the yaml needed to make the tests pass since the values aren't handled here? i.e., some things in the schema are required that weren't before
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Previously, some of these fields were not explicitly set in the model YAMLs and instead relied implicitly on default values.
However, now that we’ve aligned on treating vLLM defaults as the single source of truth, relying on implicit defaults is no longer a stable approach
Ideally, user-configurable fields should be defined in YAML, and others handled in pipeline.py. To keep this PR focused, I’ve explicitly materialized those defaults in the YAMLs to avoid unintended behavior changes. Proper value selection can be addressed in a follow-up (#3313).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
No, these changes are not introducing any newly required fields that weren’t needed before. The fields themselves already exist in vLLM; the difference is that their default values have changed compared to what was previously assumed. |
||
| max_model_len: int | None = None | ||
| async_scheduling: bool | None = None | ||
| devices: str = "0" | ||
| devices: str | None = None | ||
|
|
||
| # Inter-stage connector wiring and request defaults. | ||
| output_connectors: dict[str, str] | None = None | ||
| input_connectors: dict[str, str] | None = None | ||
| default_sampling_params: dict[str, Any] | None = None | ||
| subtalker_sampling_params: dict[str, Any] | None = None | ||
|
|
||
| # === vLLM EngineArgs fields === | ||
| # Parallelism and scheduler/memory capacity. | ||
| tensor_parallel_size: int | None = None | ||
| gpu_memory_utilization: float | None = None | ||
| max_num_seqs: int | None = None | ||
| max_num_batched_tokens: int | None = None | ||
| max_model_len: int | None = None | ||
|
|
||
| # Execution, scheduling, and KV/cache behavior. | ||
| enforce_eager: bool | None = None | ||
| async_scheduling: bool | None = None | ||
| disable_hybrid_kv_cache_manager: bool | None = None | ||
| mm_processor_cache_gb: float | None = None | ||
|
|
||
| # Compilation, profiling, tokenizer/config parsing, and model loading. | ||
| compilation_config: dict[str, Any] | None = None | ||
| profiler_config: dict[str, Any] | None = None | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bjf-frz @david6666666 PTAL |
||
| skip_mm_profiling: bool | None = None | ||
| enable_flashinfer_autotune: bool | None = None | ||
| config_format: str | None = None | ||
| load_format: str | None = None | ||
| tokenizer_mode: str | None = None | ||
|
|
||
| # Pass-through vLLM EngineArgs fields that are not represented above. | ||
| engine_extras: dict[str, Any] = field(default_factory=dict) | ||
|
|
||
|
|
||
|
xiaohajiayou marked this conversation as resolved.
|
||
|
|
@@ -436,14 +458,14 @@ class DeployConfig: | |
| pipeline: str | None = None | ||
|
|
||
| # === Pipeline-wide engine settings (applied uniformly to every stage) === | ||
| trust_remote_code: bool = True | ||
| trust_remote_code: bool | None = None | ||
| distributed_executor_backend: str | None = None | ||
| dtype: str | None = None | ||
| quantization: str | None = None | ||
| enable_prefix_caching: bool = False | ||
| enable_prefix_caching: bool | None = None | ||
| enable_chunked_prefill: bool | None = None | ||
| data_parallel_size: int = 1 | ||
| pipeline_parallel_size: int = 1 | ||
| data_parallel_size: int | None = None | ||
| pipeline_parallel_size: int | None = None | ||
|
|
||
|
|
||
| _STAGE_NON_ENGINE_KEYS = frozenset( | ||
|
|
@@ -465,10 +487,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig: | |
| """Parse a single stage entry from deploy YAML into StageDeployConfig.""" | ||
| if "engine_args" in stage_data: | ||
| engine_args = dict(stage_data["engine_args"]) | ||
| devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0")) | ||
| devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices")) | ||
| else: | ||
| engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"} | ||
| devices = stage_data.get("devices", "0") | ||
| devices = stage_data.get("devices") | ||
|
|
||
| kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices} | ||
| for name, f in _STAGE_DEPLOY_FIELDS.items(): | ||
|
|
@@ -687,6 +709,15 @@ def _select_processor_funcs( | |
| ) | ||
|
|
||
|
|
||
|
xiaohajiayou marked this conversation as resolved.
|
||
| def deploy_override_field_names() -> frozenset[str]: | ||
| """Return deploy-schema fields whose CLI defaults must not override YAML.""" | ||
| return ( | ||
| frozenset(_STAGE_DEPLOY_FIELDS) | ||
| | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS) | ||
| | frozenset({"async_chunk", "devices"}) | ||
| ) | ||
|
|
||
|
|
||
| def _build_engine_args( | ||
| ps: StagePipelineConfig, | ||
| ds: StageDeployConfig | None, | ||
|
|
@@ -802,7 +833,7 @@ def merge_pipeline_deploy( | |
| engine_args["async_scheduling"] = sched_cls is OmniARAsyncScheduler | ||
| extras = _build_extras(ps, ds) | ||
| runtime: dict[str, Any] = {"process": True} | ||
| if ds is not None: | ||
| if ds is not None and ds.devices is not None: | ||
| runtime["devices"] = ds.devices | ||
|
|
||
| result.append( | ||
|
|
@@ -865,13 +896,13 @@ def to_omegaconf(self) -> Any: | |
|
|
||
| # CLI overrides take precedence over YAML defaults | ||
| for key, value in self.runtime_overrides.items(): | ||
| if key not in ("devices", "max_batch_size"): | ||
| if value is not None and key not in ("devices", "max_batch_size"): | ||
| engine_args[key] = value | ||
|
|
||
| # Build runtime config from YAML defaults + CLI overrides | ||
| runtime: dict[str, Any] = dict(self.yaml_runtime) | ||
| runtime.setdefault("process", True) | ||
| if "devices" in self.runtime_overrides: | ||
| if self.runtime_overrides.get("devices") is not None: | ||
| runtime["devices"] = self.runtime_overrides["devices"] | ||
|
|
||
| # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs | ||
|
|
@@ -887,8 +918,6 @@ def to_omegaconf(self) -> Any: | |
| effective_mbs = int(cli_mbs or legacy_mbs or 1) | ||
| engine_args.setdefault("max_num_seqs", effective_mbs) | ||
|
|
||
| engine_args.setdefault("max_num_seqs", 1) | ||
|
|
||
| # Build full config dict | ||
| config_dict: dict[str, Any] = { | ||
| "stage_id": self.stage_id, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,8 +10,11 @@ async_chunk: false | |
|
|
||
| stages: | ||
| - stage_id: 0 | ||
| max_num_batched_tokens: 32768 | ||
| max_num_seqs: 3 | ||
| gpu_memory_utilization: 0.45 | ||
| trust_remote_code: true | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are so
|
||
| enable_prefix_caching: false | ||
| devices: "0" | ||
| default_sampling_params: | ||
| temperature: 0.4 | ||
|
|
@@ -23,8 +26,11 @@ stages: | |
| repetition_penalty: 1.05 | ||
|
|
||
| - stage_id: 1 | ||
| max_num_batched_tokens: 32768 | ||
| max_num_seqs: 1 | ||
| enforce_eager: true | ||
|
xiaohajiayou marked this conversation as resolved.
|
||
| trust_remote_code: true | ||
| enable_prefix_caching: false | ||
| devices: "0" | ||
| input_connectors: | ||
| from_stage_0: shared_memory_connector | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.