Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions buildkite/bootstrap-amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ if [[ -z "${COV_ENABLED:-}" ]]; then
COV_ENABLED=0
fi

if [[ -z "${AMD_USE_PIPELINE_GENERATOR:-}" ]]; then
AMD_USE_PIPELINE_GENERATOR=0
fi

# ---------------------------------------------------------------------------
# Git setup: ensure origin/main is available and compute merge base once.
# On K8s (blobless clones with --filter=blob:none), origin/main may not be
Expand Down Expand Up @@ -119,12 +123,15 @@ upload_pipeline() {
fi


# (WIP) Use pipeline generator instead of jinja template
if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
python -m pip install click pydantic
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
buildkite-agent pipeline upload .buildkite/pipeline.yaml
exit 0
# AMD still uses its dedicated test-amd.yaml + Jinja pipeline model.
# The generic Python generator is centered around ci_config.yaml/test_areas
# and is not wired up for bootstrap-amd yet.
if [[ "$AMD_USE_PIPELINE_GENERATOR" == "1" ]]; then
echo "AMD_USE_PIPELINE_GENERATOR=1 requested, but bootstrap-amd still uses the dedicated Jinja pipeline."
else
echo "Skipping Python pipeline generator for AMD; using the dedicated Jinja pipeline."
fi
fi
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
Expand Down
127 changes: 103 additions & 24 deletions buildkite/pipeline_generator/buildkite_step.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pydantic import BaseModel
from typing import Dict, List, Optional, Any, Union
from typing import Dict, List, Optional, Any, Union, Literal
import os

from step import Step
Expand All @@ -10,6 +10,9 @@
from constants import DeviceType, AgentQueue


SetupProfile = Literal["nvidia", "amd", "none"]


class BuildkiteCommandStep(BaseModel):
label: str
group: Optional[str] = None
Expand Down Expand Up @@ -152,15 +155,34 @@ def _get_variables_to_inject() -> Dict[str, str]:
}


def _prepare_commands(step: Step, variables_to_inject: Dict[str, str]) -> List[str]:
def _get_setup_commands(step: Step, setup_profile: SetupProfile) -> List[str]:
if step.label.startswith(":docker:") or step.no_plugin or setup_profile == "none":
return []

if setup_profile == "nvidia":
return [
"echo '--- :nvidia: GPU Info'",
"(command nvidia-smi || true)",
"echo '--- :gear: CUDA Coredump Setup'",
"export CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 && export CUDA_COREDUMP_SHOW_PROGRESS=1 && export CUDA_COREDUMP_GENERATION_FLAGS='skip_nonrelocated_elf_images,skip_global_memory,skip_shared_memory,skip_local_memory,skip_constbank_memory'",
]

if setup_profile == "amd":
return [
"echo '--- :amd: GPU Info'",
"(command amd-smi || true)",
]

raise ValueError(f"Unsupported setup profile: {setup_profile}")
Comment on lines +158 to +176
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added different profile for AMD and that also gives flexibility for anyone else.



def _prepare_commands(
step: Step,
variables_to_inject: Dict[str, str],
setup_profile: SetupProfile = "nvidia",
) -> List[str]:
"""Prepare step commands with variables injected and default setup commands."""
commands = []
# Default setup commands
if not step.label.startswith(":docker:") and not step.no_plugin:
commands.append("echo '--- :nvidia: GPU Info'")
commands.append("(command nvidia-smi || true)")
commands.append("echo '--- :gear: CUDA Coredump Setup'")
commands.append("export CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 && export CUDA_COREDUMP_SHOW_PROGRESS=1 && export CUDA_COREDUMP_GENERATION_FLAGS='skip_nonrelocated_elf_images,skip_global_memory,skip_shared_memory,skip_local_memory,skip_constbank_memory'")
commands = _get_setup_commands(step, setup_profile)

continue_on_failure = os.getenv("CONTINUE_ON_FAILURE") == "1"

Expand Down Expand Up @@ -213,6 +235,20 @@ def _create_block_step(step: Step, list_file_diff: List[str]) -> BuildkiteBlockS
return block_step


def _matches_source_dependency(source_file: str, diff_file: str) -> bool:
normalized = source_file.rstrip("/")
if not normalized:
return False
return diff_file == normalized or diff_file.startswith(f"{normalized}/")


def _step_is_blocked(step: Step, list_file_diff: List[str]) -> bool:
global_config = get_global_config()
return (not _step_should_run(step, list_file_diff) or (
step.optional and global_config["nightly"] != "1"
))


Comment on lines +238 to +251
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The generator used raw substring matching:

if source_file in diff_file
That is too loose. Examples:

vllm/ would match notvllm/foo.py
src/file would match src/file_extra.py
The helper changes that to proper exact-or-prefix path matching:

exact file match: diff_file == normalized
directory prefix match: diff_file.startswith(f"{normalized}/")
So this one exists to make gating correct.

_step_is_blocked() is mostly a cleanup/helper.

I added it because the “should this step be behind a manual block?” logic was being repeated.

def convert_group_step_to_buildkite_step(
group_steps: Dict[str, List[Step]],
) -> List[BuildkiteGroupStep]:
Expand All @@ -230,7 +266,7 @@ def convert_group_step_to_buildkite_step(
for step in steps:
# block step
block_step = None
if not _step_should_run(step, list_file_diff):
if _step_is_blocked(step, list_file_diff):
block_step = _create_block_step(step, list_file_diff)
if block_step:
group_steps_list.append(block_step)
Expand All @@ -244,7 +280,7 @@ def convert_group_step_to_buildkite_step(
depends_on=step.depends_on,
soft_fail=step.soft_fail,
agents={"queue": get_agent_queue(step)},
priority=1000 if os.getenv("PRIORITY", "") == "HIGH" else 0
priority=1000 if os.getenv("PRIORITY", "") == "HIGH" else 0,
)

if block_step:
Expand Down Expand Up @@ -276,16 +312,20 @@ def convert_group_step_to_buildkite_step(
# Create AMD mirror step and its block step if specified/applicable
if step.mirror and step.mirror.get("amd"):
amd_block_step = None
if not _step_should_run(step, list_file_diff):
if not _amd_mirror_should_run(
step, step.mirror["amd"], list_file_diff
):
amd_block_step = BuildkiteBlockStep(
block=f"Run AMD: {step.label}",
depends_on=["image-build-amd"],
key=f"block-amd-{_generate_step_key(step.label)}",
)
amd_mirror_steps.append(amd_block_step)
amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"])
amd_step = _create_amd_mirror_step(
step, variables_to_inject, step.mirror["amd"]
)
if amd_block_step:
amd_step.depends_on.extend([amd_block_step.key])
amd_step.depends_on.append(amd_block_step.key)
amd_mirror_steps.append(amd_step)

buildkite_group_steps.append(
Expand Down Expand Up @@ -323,7 +363,39 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool:
if step.source_file_dependencies:
for source_file in step.source_file_dependencies:
for diff_file in list_file_diff:
if source_file in diff_file:
if _matches_source_dependency(source_file, diff_file):
return True
return False


def _amd_mirror_should_run(
step: Step, amd: Dict[str, Any], list_file_diff: List[str]
) -> bool:
if os.getenv("NOAUTO") == "1":
return False
global_config = get_global_config()
if global_config["nightly"] == "1":
return True
if step.optional:
return False
if _source_file_dependencies_match(
amd.get("source_file_dependencies"), list_file_diff
):
return True
if global_config["run_all"]:
return True
return _source_file_dependencies_match(
step.source_file_dependencies, list_file_diff
)


def _source_file_dependencies_match(
source_file_dependencies: Optional[List[str]], list_file_diff: List[str]
) -> bool:
if source_file_dependencies:
for source_file in source_file_dependencies:
for diff_file in list_file_diff:
if _matches_source_dependency(source_file, diff_file):
return True
return False

Expand All @@ -343,19 +415,26 @@ def _generate_step_key(step_label: str) -> str:
)


def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[str, Any]) -> BuildkiteCommandStep:
def _create_amd_mirror_step(
step: Step,
variables_to_inject: Dict[str, str],
amd: Dict[str, Any],
) -> BuildkiteCommandStep:
"""Create an AMD mirrored step from the original step."""
amd_device = amd["device"]
custom_commands = amd.get("commands")
if custom_commands:
# Custom AMD commands didn't go through _prepare_commands(), need cd
amd_commands_str = " && ".join(custom_commands)
working_dir = amd.get("working_dir", step.working_dir)
if working_dir:
amd_commands_str = f"cd {working_dir} && {amd_commands_str}"
amd_step = step.model_copy(update={
"commands": custom_commands,
"working_dir": amd.get("working_dir", step.working_dir),
})
amd_commands_str = " && ".join(
_prepare_commands(amd_step, variables_to_inject, setup_profile="amd")
)
else:
# original_commands already include cd from _prepare_commands()
amd_commands_str = " && ".join(original_commands)
amd_commands_str = " && ".join(
_prepare_commands(step, variables_to_inject, setup_profile="amd")
)

# Pass commands via VLLM_TEST_COMMANDS env var instead of positional
# argument. Buildkite sets env vars directly in the process environment
Expand Down Expand Up @@ -466,7 +545,7 @@ def _create_torch_nightly_group(
if not step_auto_run and step.source_file_dependencies:
for source_file in step.source_file_dependencies:
for diff_file in list_file_diff:
if source_file in diff_file:
if _matches_source_dependency(source_file, diff_file):
step_auto_run = True
break
if step_auto_run:
Expand Down
1 change: 0 additions & 1 deletion buildkite/pipeline_generator/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ py-modules = [
"pipeline_generator",
"buildkite_step",
"step",
"utils",
"global_config",
"constants",
]
Expand Down
9 changes: 3 additions & 6 deletions buildkite/pipeline_generator/utils_lib/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,32 +25,29 @@ def get_merge_base_commit() -> Optional[str]:
def get_list_file_diff(branch: str, merge_base_commit: Optional[str]) -> List[str]:
"""Get list of file paths that get changed between current branch and origin/main."""
try:
subprocess.run(["git", "add", "."], check=True)
if branch == "main":
output = subprocess.check_output(
["git", "diff", "--name-only", "--diff-filter=ACMDR", "HEAD~1"],
["git", "diff", "--name-only", "--diff-filter=ACMDR", "HEAD~1", "HEAD"],
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we adding HEAD at the end?

universal_newlines=True,
)
else:
merge_base = merge_base_commit
if not merge_base:
pass
raise RuntimeError("Failed to determine merge base commit for git diff.")
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@khluu Lmk if I should revert this part. I added it cause I thought that we should not silently pass right?

output = subprocess.check_output(
[
"git",
"diff",
"--name-only",
"--diff-filter=ACMDR",
merge_base.strip(),
"HEAD",
],
universal_newlines=True,
)
return [line for line in output.split("\n") if line.strip()]
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to get git diff: {e}")
except AttributeError:
# Case where merge_base_commit is None
raise RuntimeError("Failed to determine merge base commit for git diff.")


def get_pr_labels(pull_request: str, repo_name: str) -> List[str]:
Expand Down
Loading