Skip to content
Merged
106 changes: 56 additions & 50 deletions .github/workflows/build-test-publish-wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build, test, and publish a PyPi wheel (to testpypi).

on:
Expand All @@ -35,55 +34,62 @@ concurrency:

jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
with:
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
secrets:
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

# build-test-publish-wheel:
# needs: [pre-flight]
# if: |
# !(needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true')
# uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.65.1
# with:
# dry-run: true
# python-package: megatron.bridge
# python-version: "3.10"
# packaging: uv
# no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
# has-src-dir: true
# skip-test-wheel: true
# custom-container: nvcr.io/nvidia/pytorch:25.05-py3
# runner: self-hosted-nemo
# no-build-isolation: true
# submodules: recursive
# container-options: "--gpus all --runtime=nvidia"
# secrets:
# TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
# TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
# SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
# GH_TOKEN: ${{ secrets.PAT }}
build-test-publish-wheel:
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.70.1
with:
dry-run: true
python-package: megatron.bridge
python-version: "3.10"
packaging: uv
no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
has-src-dir: true
skip-test-wheel: true
custom-container: nvcr.io/nvidia/pytorch:25.11-py3
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
no-build-isolation: true
submodules: recursive
container-options: "--gpus all --runtime=nvidia"
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
GH_TOKEN: ${{ secrets.PAT }}

# build-test-publish-wheel-summary:
# needs: [pre-flight, build-test-publish-wheel]
# if: |
# (
# needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
# || always()
# )
# && !cancelled()
# runs-on: ubuntu-latest
# steps:
# - name: Result
# run: |
# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
build-test-publish-wheel-summary:
needs: [pre-flight, build-test-publish-wheel]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
runs-on: ubuntu-latest
steps:
- name: Result
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0

# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
# echo "✅ All previous jobs completed successfully"
# exit 0
# else
# echo "❌ Found $FAILED_JOBS failed job(s)"
# # Show which jobs failed
# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
# exit 1
# fi
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
Comment on lines +72 to +95
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -name "build-test-publish-wheel.yml" -type f

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 119


🏁 Script executed:

cat -n .github/workflows/build-test-publish-wheel.yml

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 4293


🌐 Web query:

GitHub Actions gh cli run view required permissions actions:read

💡 Result:

To run gh run view inside a GitHub Actions workflow, authenticate gh with GH_TOKEN and ensure the workflow token has Actions read permission.

jobs:
  view:
    runs-on: ubuntu-latest
    permissions:
      actions: read          # required for viewing workflow runs
      contents: read         # commonly needed if you also use actions/checkout
      # checks: read         # only needed if you use features that fetch annotations (e.g., gh run watch)
    steps:
      - uses: actions/checkout@v4
      - run: gh run view ${{ github.run_id }} --log
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

Notes:

  • In GitHub Actions, each step that uses gh must set GH_TOKEN to a token with the required permissions/scopes. [1]
  • Grant/limit GITHUB_TOKEN permissions via the permissions: block. [2]
  • If you also need annotations (commonly via gh run watch), you may need checks: read in addition to actions: read. [3]
  • actions/checkout recommends contents: read. [4]

Sources: [1] [2] [3] [4]


🏁 Script executed:

# Verify if there's any job-level permission override in the summary job
sed -n '73,96p' .github/workflows/build-test-publish-wheel.yml | grep -i "permissions"

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 53


gh run view fails silently due to missing actions:read permission.

The workflow's permission block (lines 28–30) grants only id-token: write and contents: read. The build-test-publish-wheel-summary job has no job-level permission override and calls gh run view on line 86 with error masking (|| echo 0), which hides the authorization failure: when gh fails, FAILED_JOBS is left empty (the echo only prints 0 to the log), and the later ${FAILED_JOBS:-0} expansion treats the empty value as 0. The job then exits with status 0, incorrectly reporting success even though the check itself failed. The second gh run view call on line 94 is likewise unguarded and will fail without the actions: read permission.

Add permissions: {actions: read} to the job and set GH_TOKEN: ${{ github.token }} in the step to grant the necessary access. Replace the error-masking || echo 0 with strict failure handling to catch and report permission errors.

Suggested fix
  build-test-publish-wheel-summary:
    needs: [pre-flight, build-test-publish-wheel]
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
+   permissions:
+     actions: read
+     contents: read
    runs-on: ubuntu-latest
    steps:
      - name: Result
+       env:
+         GH_TOKEN: ${{ github.token }}
        run: |
-         FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
+         FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || { echo "gh run view failed"; exit 1; }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
build-test-publish-wheel-summary:
needs: [pre-flight, build-test-publish-wheel]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
runs-on: ubuntu-latest
steps:
- name: Result
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
# echo "✅ All previous jobs completed successfully"
# exit 0
# else
# echo "❌ Found $FAILED_JOBS failed job(s)"
# # Show which jobs failed
# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
# exit 1
# fi
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
build-test-publish-wheel-summary:
needs: [pre-flight, build-test-publish-wheel]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
permissions:
actions: read
contents: read
runs-on: ubuntu-latest
steps:
- name: Result
env:
GH_TOKEN: ${{ github.token }}
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || { echo "gh run view failed"; exit 1; }
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
🤖 Prompt for AI Agents
In @.github/workflows/build-test-publish-wheel.yml around lines 72 - 95, The
summary job build-test-publish-wheel-summary is calling gh run view but lacks
actions:read permission and masks errors with "|| echo 0", causing false
success; update the job to include permissions: { actions: read } and set
GH_TOKEN: ${{ github.token }} in the Result step's environment, remove the "||
echo 0" error-masking so gh run view errors surface, and ensure both gh run view
invocations use the GH_TOKEN env so authorization failures are reported instead
of being treated as zero failed jobs.

14 changes: 5 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ dependencies = [
"hydra-core>1.3,<=1.3.2",
"megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
"qwen-vl-utils",
"transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
"transformer-engine[pytorch,core_cu13]>=2.10.0a0,<2.13.0",
"mamba-ssm",
"nvidia-resiliency-ext",
"nvidia-resiliency-ext~=0.4.1",
"causal-conv1d",
"flash-linear-attention",
"timm",
Expand All @@ -108,21 +108,17 @@ no-build-isolation-package = [
]
prerelease = "allow"
override-dependencies = [
"nvidia-modelopt[torch]>=0.37.0",
"torch; sys_platform == 'never'",
"torchvision; sys_platform == 'never'",
"triton; sys_platform == 'never'",
"transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@6a34b6574fa6c29d9d07fdcddf9812cbb1488878",

]

# uv.sources allows us to override dependencies with VCS commits.
# Let's use this only for debugging purposes, not for production (main).
[tool.uv.sources]
transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "6a34b6574fa6c29d9d07fdcddf9812cbb1488878" }
megatron-core = { path = "3rdparty/Megatron-LM/" }
nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "54f85fe422d296cf04ea524130014bd3a2c3add1" }
nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
# mamba-ssm = { git = "https://github.com/yfw/mamba", branch = "general_stride_fix" }
nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "v0.4.1" } # Requires a source install to compile cupti for cuda13

[project.optional-dependencies]
recipes = [
Expand Down
2 changes: 1 addition & 1 deletion src/megatron/bridge/training/mlm_compat/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def _get_transformer_layer_spec(args: argparse.Namespace, use_te: bool, use_kitc
use_kitchen: Whether to use kitchen extension

Returns:
transformer_layer_spec: The transformer layer specification
ModuleSpec: The transformer layer specification
"""
if use_te:
return get_gpt_layer_with_transformer_engine_spec(
Expand Down
Loading
Loading