-
Notifications
You must be signed in to change notification settings - Fork 68
[ROCm][CI] Fix AMD pipeline generator command wrapping, gating, and git diff behavior #347
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5897bb5
5f1ed2c
764d573
8ce3b98
9701b54
f210c89
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| from pydantic import BaseModel | ||
| from typing import Dict, List, Optional, Any, Union | ||
| from typing import Dict, List, Optional, Any, Union, Literal | ||
| import os | ||
|
|
||
| from step import Step | ||
|
|
@@ -10,6 +10,9 @@ | |
| from constants import DeviceType, AgentQueue | ||
|
|
||
|
|
||
| SetupProfile = Literal["nvidia", "amd", "none"] | ||
|
|
||
|
|
||
| class BuildkiteCommandStep(BaseModel): | ||
| label: str | ||
| group: Optional[str] = None | ||
|
|
@@ -152,15 +155,34 @@ def _get_variables_to_inject() -> Dict[str, str]: | |
| } | ||
|
|
||
|
|
||
| def _prepare_commands(step: Step, variables_to_inject: Dict[str, str]) -> List[str]: | ||
| def _get_setup_commands(step: Step, setup_profile: SetupProfile) -> List[str]: | ||
| if step.label.startswith(":docker:") or step.no_plugin or setup_profile == "none": | ||
| return [] | ||
|
|
||
| if setup_profile == "nvidia": | ||
| return [ | ||
| "echo '--- :nvidia: GPU Info'", | ||
| "(command nvidia-smi || true)", | ||
| "echo '--- :gear: CUDA Coredump Setup'", | ||
| "export CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 && export CUDA_COREDUMP_SHOW_PROGRESS=1 && export CUDA_COREDUMP_GENERATION_FLAGS='skip_nonrelocated_elf_images,skip_global_memory,skip_shared_memory,skip_local_memory,skip_constbank_memory'", | ||
| ] | ||
|
|
||
| if setup_profile == "amd": | ||
| return [ | ||
| "echo '--- :amd: GPU Info'", | ||
| "(command amd-smi || true)", | ||
| ] | ||
|
|
||
| raise ValueError(f"Unsupported setup profile: {setup_profile}") | ||
|
|
||
|
|
||
| def _prepare_commands( | ||
| step: Step, | ||
| variables_to_inject: Dict[str, str], | ||
| setup_profile: SetupProfile = "nvidia", | ||
| ) -> List[str]: | ||
| """Prepare step commands with variables injected and default setup commands.""" | ||
| commands = [] | ||
| # Default setup commands | ||
| if not step.label.startswith(":docker:") and not step.no_plugin: | ||
| commands.append("echo '--- :nvidia: GPU Info'") | ||
| commands.append("(command nvidia-smi || true)") | ||
| commands.append("echo '--- :gear: CUDA Coredump Setup'") | ||
| commands.append("export CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 && export CUDA_COREDUMP_SHOW_PROGRESS=1 && export CUDA_COREDUMP_GENERATION_FLAGS='skip_nonrelocated_elf_images,skip_global_memory,skip_shared_memory,skip_local_memory,skip_constbank_memory'") | ||
| commands = _get_setup_commands(step, setup_profile) | ||
|
|
||
| continue_on_failure = os.getenv("CONTINUE_ON_FAILURE") == "1" | ||
|
|
||
|
|
@@ -213,6 +235,20 @@ def _create_block_step(step: Step, list_file_diff: List[str]) -> BuildkiteBlockS | |
| return block_step | ||
|
|
||
|
|
||
| def _matches_source_dependency(source_file: str, diff_file: str) -> bool: | ||
| normalized = source_file.rstrip("/") | ||
| if not normalized: | ||
| return False | ||
| return diff_file == normalized or diff_file.startswith(f"{normalized}/") | ||
|
|
||
|
|
||
| def _step_is_blocked(step: Step, list_file_diff: List[str]) -> bool: | ||
| global_config = get_global_config() | ||
| return (not _step_should_run(step, list_file_diff) or ( | ||
| step.optional and global_config["nightly"] != "1" | ||
| )) | ||
|
|
||
|
|
||
|
Comment on lines
+238
to
+251
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The generator used raw substring matching:
exact file match:
I added it because the “should this step be behind a manual block?” logic was being repeated. |
||
| def convert_group_step_to_buildkite_step( | ||
| group_steps: Dict[str, List[Step]], | ||
| ) -> List[BuildkiteGroupStep]: | ||
|
|
@@ -230,7 +266,7 @@ def convert_group_step_to_buildkite_step( | |
| for step in steps: | ||
| # block step | ||
| block_step = None | ||
| if not _step_should_run(step, list_file_diff): | ||
| if _step_is_blocked(step, list_file_diff): | ||
| block_step = _create_block_step(step, list_file_diff) | ||
| if block_step: | ||
| group_steps_list.append(block_step) | ||
|
|
@@ -244,7 +280,7 @@ def convert_group_step_to_buildkite_step( | |
| depends_on=step.depends_on, | ||
| soft_fail=step.soft_fail, | ||
| agents={"queue": get_agent_queue(step)}, | ||
| priority=1000 if os.getenv("PRIORITY", "") == "HIGH" else 0 | ||
| priority=1000 if os.getenv("PRIORITY", "") == "HIGH" else 0, | ||
| ) | ||
|
|
||
| if block_step: | ||
|
|
@@ -276,16 +312,20 @@ def convert_group_step_to_buildkite_step( | |
| # Create AMD mirror step and its block step if specified/applicable | ||
| if step.mirror and step.mirror.get("amd"): | ||
| amd_block_step = None | ||
| if not _step_should_run(step, list_file_diff): | ||
| if not _amd_mirror_should_run( | ||
| step, step.mirror["amd"], list_file_diff | ||
| ): | ||
| amd_block_step = BuildkiteBlockStep( | ||
| block=f"Run AMD: {step.label}", | ||
| depends_on=["image-build-amd"], | ||
| key=f"block-amd-{_generate_step_key(step.label)}", | ||
| ) | ||
| amd_mirror_steps.append(amd_block_step) | ||
| amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"]) | ||
| amd_step = _create_amd_mirror_step( | ||
| step, variables_to_inject, step.mirror["amd"] | ||
| ) | ||
| if amd_block_step: | ||
| amd_step.depends_on.extend([amd_block_step.key]) | ||
| amd_step.depends_on.append(amd_block_step.key) | ||
| amd_mirror_steps.append(amd_step) | ||
|
|
||
| buildkite_group_steps.append( | ||
|
|
@@ -323,7 +363,39 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool: | |
| if step.source_file_dependencies: | ||
| for source_file in step.source_file_dependencies: | ||
| for diff_file in list_file_diff: | ||
| if source_file in diff_file: | ||
| if _matches_source_dependency(source_file, diff_file): | ||
| return True | ||
| return False | ||
|
|
||
|
|
||
| def _amd_mirror_should_run( | ||
| step: Step, amd: Dict[str, Any], list_file_diff: List[str] | ||
| ) -> bool: | ||
| if os.getenv("NOAUTO") == "1": | ||
| return False | ||
| global_config = get_global_config() | ||
| if global_config["nightly"] == "1": | ||
| return True | ||
| if step.optional: | ||
| return False | ||
| if _source_file_dependencies_match( | ||
| amd.get("source_file_dependencies"), list_file_diff | ||
| ): | ||
| return True | ||
| if global_config["run_all"]: | ||
| return True | ||
| return _source_file_dependencies_match( | ||
| step.source_file_dependencies, list_file_diff | ||
| ) | ||
|
|
||
|
|
||
| def _source_file_dependencies_match( | ||
| source_file_dependencies: Optional[List[str]], list_file_diff: List[str] | ||
| ) -> bool: | ||
| if source_file_dependencies: | ||
| for source_file in source_file_dependencies: | ||
| for diff_file in list_file_diff: | ||
| if _matches_source_dependency(source_file, diff_file): | ||
| return True | ||
| return False | ||
|
|
||
|
|
@@ -343,19 +415,26 @@ def _generate_step_key(step_label: str) -> str: | |
| ) | ||
|
|
||
|
|
||
| def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[str, Any]) -> BuildkiteCommandStep: | ||
| def _create_amd_mirror_step( | ||
| step: Step, | ||
| variables_to_inject: Dict[str, str], | ||
| amd: Dict[str, Any], | ||
| ) -> BuildkiteCommandStep: | ||
| """Create an AMD mirrored step from the original step.""" | ||
| amd_device = amd["device"] | ||
| custom_commands = amd.get("commands") | ||
| if custom_commands: | ||
| # Custom AMD commands didn't go through _prepare_commands(), need cd | ||
| amd_commands_str = " && ".join(custom_commands) | ||
| working_dir = amd.get("working_dir", step.working_dir) | ||
| if working_dir: | ||
| amd_commands_str = f"cd {working_dir} && {amd_commands_str}" | ||
| amd_step = step.model_copy(update={ | ||
| "commands": custom_commands, | ||
| "working_dir": amd.get("working_dir", step.working_dir), | ||
| }) | ||
| amd_commands_str = " && ".join( | ||
| _prepare_commands(amd_step, variables_to_inject, setup_profile="amd") | ||
| ) | ||
| else: | ||
| # original_commands already include cd from _prepare_commands() | ||
| amd_commands_str = " && ".join(original_commands) | ||
| amd_commands_str = " && ".join( | ||
| _prepare_commands(step, variables_to_inject, setup_profile="amd") | ||
| ) | ||
|
|
||
| # Pass commands via VLLM_TEST_COMMANDS env var instead of positional | ||
| # argument. Buildkite sets env vars directly in the process environment | ||
|
|
@@ -466,7 +545,7 @@ def _create_torch_nightly_group( | |
| if not step_auto_run and step.source_file_dependencies: | ||
| for source_file in step.source_file_dependencies: | ||
| for diff_file in list_file_diff: | ||
| if source_file in diff_file: | ||
| if _matches_source_dependency(source_file, diff_file): | ||
| step_auto_run = True | ||
| break | ||
| if step_auto_run: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,32 +25,29 @@ def get_merge_base_commit() -> Optional[str]: | |
| def get_list_file_diff(branch: str, merge_base_commit: Optional[str]) -> List[str]: | ||
| """Get list of file paths that get changed between current branch and origin/main.""" | ||
| try: | ||
| subprocess.run(["git", "add", "."], check=True) | ||
| if branch == "main": | ||
| output = subprocess.check_output( | ||
| ["git", "diff", "--name-only", "--diff-filter=ACMDR", "HEAD~1"], | ||
| ["git", "diff", "--name-only", "--diff-filter=ACMDR", "HEAD~1", "HEAD"], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we adding |
||
| universal_newlines=True, | ||
| ) | ||
| else: | ||
| merge_base = merge_base_commit | ||
| if not merge_base: | ||
| pass | ||
| raise RuntimeError("Failed to determine merge base commit for git diff.") | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @khluu Lmk if I should revert this part. I added it cause I thought that we should not silently pass right? |
||
| output = subprocess.check_output( | ||
| [ | ||
| "git", | ||
| "diff", | ||
| "--name-only", | ||
| "--diff-filter=ACMDR", | ||
| merge_base.strip(), | ||
| "HEAD", | ||
| ], | ||
| universal_newlines=True, | ||
| ) | ||
| return [line for line in output.split("\n") if line.strip()] | ||
| except subprocess.CalledProcessError as e: | ||
| raise RuntimeError(f"Failed to get git diff: {e}") | ||
| except AttributeError: | ||
| # Case where merge_base_commit is None | ||
| raise RuntimeError("Failed to determine merge base commit for git diff.") | ||
|
|
||
|
|
||
| def get_pr_labels(pull_request: str, repo_name: str) -> List[str]: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added different profile for AMD and that also gives flexibility for anyone else.