# Fix invalid argument failing tests on main #3211
#
# Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow verifies that the basic install works across all supported platforms.
# For basic install, all imports need to either be successful or appropriately guarded.
# Display name of the workflow.
name: Installation Test

# Triggers:
#   - push to `dev`, `main`, branches named `pull-request/<digits>`, and `deploy-release/*`
#   - merge queue `checks_requested` events
on:
  push:
    branches:
      - dev
      - main
      - 'pull-request/[0-9]+'
      - 'deploy-release/*'
  merge_group:
    types: [checks_requested]
# Job graph:
#   pre-flight -> pip-test-pytorch, uv-test-pytorch -> install-test-summary
# Every job is additionally gated on the repository being NVIDIA/Megatron-LM.
jobs:
  # Reusable workflow from NVIDIA-NeMo/FW-CI-templates. The jobs below read its outputs
  # `docs_only`, `is_merge_group` and `is_deployment_workflow`.
  # NOTE(review): the text after `workflows/` reads "[email protected]", which looks like
  # e-mail-obfuscation residue of a `<workflow-file>@<ref>` reference. Restore the real
  # file name and tag before using this file; it is left untouched here.
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    if: github.repository == 'NVIDIA/Megatron-LM'

  # Installs megatron-core through docker/common/install.sh (without --use-uv) inside the
  # NGC PyTorch container, then runs the check-imports action against `megatron.core`.
  pip-test-pytorch:
    needs: [pre-flight]
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    environment: nemo-ci
    strategy:
      fail-fast: false
      matrix:
        # Quoted so the version stays a string instead of being read as a float.
        python-version: ['3.12']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set PATH
        run: |
          # Values appended to $GITHUB_ENV only become visible in later steps, so the
          # CUDA root is kept in a step-local variable for the lines that build on it.
          CUDA_HOME=/usr/local/cuda
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=$CUDA_HOME" | tee -a "$GITHUB_ENV"
          # Append the previous value only when it is non-empty, to avoid a trailing ':'.
          echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install megatron-core
        shell: bash -x -e -u -o pipefail {0}
        run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}

      # The check-imports action lives in a separate repository; check it out at a pinned tag.
      - name: Checkout check-imports
        uses: actions/checkout@v4
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.63.2
          path: FW-CI-templates

      - name: Check imports for megatron-core
        uses: ./FW-CI-templates/.github/actions/check-imports
        with:
          package-name: megatron.core
          python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Same install through docker/common/install.sh with --use-uv. The import check is
  # currently disabled for this job (see the commented-out steps below).
  uv-test-pytorch:
    needs: [pre-flight]
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    environment: nemo-ci
    strategy:
      fail-fast: false
      matrix:
        # Only used in the job name here; the install step does not pass it on.
        python-version: ['3.12']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set PATH
        run: |
          # Values appended to $GITHUB_ENV only become visible in later steps, so the
          # CUDA root is kept in a step-local variable for the lines that build on it.
          CUDA_HOME=/usr/local/cuda
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=$CUDA_HOME" | tee -a "$GITHUB_ENV"
          # Append the previous value only when it is non-empty, to avoid a trailing ':'.
          echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
          echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install project
        shell: bash
        run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv

      # NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
      # - name: Checkout check-imports
      #   uses: actions/checkout@v4
      #   with:
      #     repository: NVIDIA-NeMo/FW-CI-templates
      #     ref: v0.63.2
      #     path: FW-CI-templates
      # - name: Check imports for megatron-core
      #   uses: ./FW-CI-templates/.github/actions/check-imports
      #   with:
      #     package-name: megatron.core
      #     python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Aggregates the results of the whole run into a single pass/fail job.
  install-test-summary:
    needs: [pre-flight, pip-test-pytorch, uv-test-pytorch]
    runs-on: ubuntu-latest
    name: Install test summary
    # The parenthesised group always evaluates to true because of `always()`, so this job
    # runs whenever the run was not cancelled, even if the needed jobs failed or were skipped.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          # "true" when the test jobs are expected to be skipped (docs-only change,
          # deployment workflow, or merge-group run).
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        run: |
          # Count completed jobs of this run whose conclusion is anything but "success"
          # (this includes skipped jobs, hence SKIPPING_IS_ALLOWED below).
          # Best effort: if the query itself fails, fall back to 0 and emit a warning.
          if ! FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length'); then
            echo "::warning::Could not query job results for run $RUN_ID; assuming 0 failed jobs"
            FAILED_JOBS=0
          fi

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi