Skip to content

fix(ci): improve CI infrastructure reliability #5

fix(ci): improve CI infrastructure reliability

fix(ci): improve CI infrastructure reliability #5

# Copyright NVIDIA CORPORATION

Check failure on line 1 in .github/workflows/forward-compatibility.yaml

View workflow run for this annotation

GitHub Actions / .github/workflows/forward-compatibility.yaml

Invalid workflow file

(Line: 84, Col: 9): Unrecognized named-value: 'secrets'. Located at position 107 within expression: always() && (needs.fetch-latest-images.result == 'failure' || needs.run-e2e-tests.result == 'failure') && secrets.SLACK_BOT_TOKEN != ''
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Forward-compatibility workflow: runs on a weekly schedule and on manual
# dispatch.
name: Forward Compatibility

on:
  schedule:
    - cron: '0 2 * * 1'  # Weekly on Monday at 2 AM UTC
  workflow_dispatch:  # Manual trigger

# Runs are grouped per workflow and ref; starting a new run cancels one that
# is still in progress in the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Grant no token permissions at workflow level; individual jobs opt in.
permissions: {}
jobs:
  # Looks up the latest toolkit, device-plugin and mig-manager images via the
  # repository scripts, writes values-overrides.yaml, and uploads it as the
  # `values-overrides` artifact.
  fetch-latest-images:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v6
      - name: Install regctl
        uses: regclient/actions/regctl-installer@148669fe4b19151fcab6e00c6df2db43b9e2b097
        with:
          release: v0.11.1
      - name: Get latest component images and generate values override file
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Fetch latest images from component repositories
          echo "::notice::Fetching latest container-toolkit image..."
          TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit)
          echo "::notice::Fetching latest device-plugin image..."
          DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin)
          echo "::notice::Fetching latest mig-manager image..."
          MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager)
          # Generate values override file
          .github/scripts/generate-values-overrides.sh \
            values-overrides.yaml \
            "${TOOLKIT}" \
            "${DEVICE_PLUGIN}" \
            "${MIG_MANAGER}"
      - name: Upload values override file
        uses: actions/upload-artifact@v6
        with:
          name: values-overrides
          path: values-overrides.yaml
          retention-days: 30

  # Calls the reusable e2e workflow against the main-latest operator image
  # with the values override enabled.
  # NOTE(review): the workflow-level `permissions: {}` also applies to this
  # caller job; confirm e2e-tests.yaml does not request more than that.
  run-e2e-tests:
    needs: [fetch-latest-images]
    uses: ./.github/workflows/e2e-tests.yaml
    with:
      operator_image: ghcr.io/nvidia/gpu-operator
      operator_version: main-latest
      use_values_override: true
    secrets: inherit

  # Posts a Slack alert when either upstream job failed.
  notify-failure:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions: {}
    needs: [fetch-latest-images, run-e2e-tests]
    # always() keeps this job eligible to run after a failed dependency.
    # The `secrets` context is not available in `if:` conditionals, so the
    # token check is done on the step through the env flag below.
    if: ${{ always() && (needs.fetch-latest-images.result == 'failure' || needs.run-e2e-tests.result == 'failure') }}
    env:
      # 'true' when a Slack bot token is configured; env values are strings.
      HAS_SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN != '' }}
    steps:
      - name: Send Slack alert notification
        # Skip quietly when no Slack token is configured (e.g. on forks).
        if: ${{ env.HAS_SLACK_BOT_TOKEN == 'true' }}
        # NOTE(review): the version tag was garbled in the source; confirm
        # v2.1.1 is the intended pin (the inputs below are the v2 interface).
        uses: slackapi/slack-github-action@v2.1.1
        with:
          method: chat.postMessage
          token: ${{ secrets.SLACK_BOT_TOKEN }}
          payload: |
            {
            "channel": "${{ secrets.SLACK_CHANNEL_ID }}",
            "text": ":x: *Forward Compatibility Test Failed for GPU Operator*\n\n*Workflow:* ${{ github.workflow }}\n*Repository:* ${{ github.repository }}\n*Trigger:* ${{ github.event_name }}\n\n*Tested Components:*\nDownload `values-overrides` artifact to see tested component versions\n\n*Details:* <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Failed Run>\n\n${{ secrets.SLACK_MENTION_LIST }}"
            }