basetenlabs · William-Gao1 · Mar 12, 2026 · Mar 10, 2026
diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
@@ -25,13 +25,14 @@ jobs:
   smoke-test:
     runs-on: ubuntu-latest
     timeout-minutes: 45
-    outputs:
-      project_name: ${{ steps.summary.outputs.project_name }}
-      project_id: ${{ steps.summary.outputs.project_id }}
-      job_id: ${{ steps.summary.outputs.job_id }}
-      status: ${{ steps.summary.outputs.status }}
-      error: ${{ steps.summary.outputs.error }}
-      passed: ${{ steps.summary.outputs.passed }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - accelerator: H100
+            deploy-and-infer: true
+          - accelerator: H200
+            deploy-and-infer: false
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v4
@@ -46,49 +47,43 @@ jobs:
           api_key = ${{ secrets.SMOKE_TESTS_API_KEY }}
           EOF
 
-      - name: Run smoke test
+      - name: Run smoke test (${{ matrix.accelerator }})
         run: |
           uv run bin/test_example.py \
             examples/qwen3-0.6b-axolotl/training/config.py \
             --timeout 1200 \
             --assert-status TRAINING_JOB_COMPLETED \
             --check-checkpoints \
             --check-cache \
-            --deploy-and-infer \
-            --project smoke-test-${{ github.run_id }} \
+            --accelerator ${{ matrix.accelerator }} \
+            ${{ matrix.deploy-and-infer && '--deploy-and-infer' || '' }} \
+            --project smoke-test-${{ matrix.accelerator }}-${{ github.run_id }} \
             --remote smoke-tests \
             --summary-file /tmp/smoke-test-summary.json
 
       - name: Read summary
         id: summary
         if: always()
         run: |
-          if [ -f /tmp/smoke-test-summary.json ]; then
-            echo "project_name=$(jq -r .project_name /tmp/smoke-test-summary.json)" >> "$GITHUB_OUTPUT"
-            echo "project_id=$(jq -r .project_id /tmp/smoke-test-summary.json)" >> "$GITHUB_OUTPUT"
-            echo "job_id=$(jq -r .job_id /tmp/smoke-test-summary.json)" >> "$GITHUB_OUTPUT"
-            echo "status=$(jq -r .status /tmp/smoke-test-summary.json)" >> "$GITHUB_OUTPUT"
-            {
-              echo "error<<GHEOF"
-              jq -r .error /tmp/smoke-test-summary.json
-              echo "GHEOF"
-            } >> "$GITHUB_OUTPUT"
-            echo "passed=$(jq -r .passed /tmp/smoke-test-summary.json)" >> "$GITHUB_OUTPUT"
-          else
-            echo "project_name=smoke-test-${{ github.run_id }}" >> "$GITHUB_OUTPUT"
-            echo "project_id=" >> "$GITHUB_OUTPUT"
-            echo "job_id=" >> "$GITHUB_OUTPUT"
-            echo "status=UNKNOWN" >> "$GITHUB_OUTPUT"
-            echo "error=Summary file not written (crash before job submission?)" >> "$GITHUB_OUTPUT"
-            echo "passed=false" >> "$GITHUB_OUTPUT"
+          if [ ! -f /tmp/smoke-test-summary.json ]; then
+            echo '{"project_name":"smoke-test-${{ matrix.accelerator }}-${{ github.run_id }}","project_id":"","job_id":"","status":"UNKNOWN","error":"Summary file not written (crash before job submission?)","passed":false}' > /tmp/smoke-test-summary.json
           fi
+          # Tag the summary with which accelerator it came from
+          jq --arg acc "${{ matrix.accelerator }}" '. + {accelerator: $acc}' /tmp/smoke-test-summary.json > /tmp/smoke-test-summary-tagged.json
+
+      - name: Upload summary
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: smoke-test-summary-${{ matrix.accelerator }}
+          path: /tmp/smoke-test-summary-tagged.json
 
       - name: Cleanup
         if: always()
         run: |
           # On success the tracker already deleted the project; on failure just stop jobs
           uv run bin/test_example.py --teardown \
-            --project smoke-test-${{ github.run_id }} \
+            --project smoke-test-${{ matrix.accelerator }}-${{ github.run_id }} \
             --remote smoke-tests
 
   # --------------------------------------------------------------------------
@@ -127,80 +122,69 @@ jobs:
     if: always()
     runs-on: ubuntu-latest
     steps:
-      - name: Notify success
-        if: needs.smoke-test.result == 'success'
-        uses: slackapi/slack-github-action@v2
+      - name: Download all summaries
+        uses: actions/download-artifact@v4
         with:
-          method: chat.postMessage
-          token: ${{ secrets.SLACK_BOT_TOKEN }}
-          payload: |
-            {
-              "channel": "${{ secrets.SLACK_CHANNEL_ID }}",
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "Training Smoke Test Passed"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "*Status:* `${{ needs.smoke-test.outputs.status }}`\n*Project:* `${{ needs.smoke-test.outputs.project_name }}`\n*Job ID:* `${{ needs.smoke-test.outputs.job_id }}`"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View workflow run>"
-                  }
-                }
-              ]
-            }
-
-      - name: Notify failure
-        if: needs.smoke-test.result == 'failure'
+          pattern: smoke-test-summary-*
+          path: /tmp/summaries
+
+      - name: Build Slack message
+        id: slack
+        run: |
+          all_passed=true
+          blocks='[{"type":"header","text":{"type":"plain_text","text":"Training Smoke Test Results"}}]'
+
+          for dir in /tmp/summaries/smoke-test-summary-*/; do
+            summary=$(cat "$dir"/*.json)
+            acc=$(echo "$summary" | jq -r .accelerator)
+            passed=$(echo "$summary" | jq -r .passed)
+            status=$(echo "$summary" | jq -r .status)
+            job_id=$(echo "$summary" | jq -r .job_id)
+            project=$(echo "$summary" | jq -r .project_name)
+            error=$(echo "$summary" | jq -r .error)
+
+            if [ "$passed" = "true" ]; then
+              icon="white_check_mark"
+            else
+              icon="x"
+              all_passed=false
+            fi
+            # Build the per-accelerator detail text; $'\n' is a real newline (a "\n" in double quotes stays literal in bash).
+            detail="*Status:* \`$status\`  *Job:* \`$job_id\`  *Project:* \`$project\`"
+            if [ "$passed" != "true" ] && [ -n "$error" ] && [ "$error" != "null" ]; then
+              detail="$detail"$'\n'"*Error:* $error"
+            fi
+
+            blocks=$(echo "$blocks" | jq \
+              --arg acc "$acc" \
+              --arg icon "$icon" \
+              --arg detail "$detail" \
+              '. + [
+                {"type":"section","text":{"type":"mrkdwn","text":(":" + $icon + ": *" + $acc + "*\n" + $detail)}}
+              ]')
+          done
+
+          # Add workflow link
+          blocks=$(echo "$blocks" | jq \
+            --arg url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
+            '. + [{"type":"section","text":{"type":"mrkdwn","text":("<" + $url + "|View workflow run>")}}]')
+
+          if [ "$all_passed" != "true" ]; then
+            blocks=$(echo "$blocks" | jq '. + [{"type":"context","elements":[{"type":"mrkdwn","text":"Failed projects are preserved for investigation."}]}]')
+          fi
+
+          echo "all_passed=$all_passed" >> "$GITHUB_OUTPUT"
+
+          # Write payload to file to avoid shell escaping issues
+          jq -n --argjson blocks "$blocks" \
+            --arg channel "${{ secrets.SLACK_CHANNEL_ID }}" \
+            --arg all_passed "$all_passed" \
+            '{channel: $channel, blocks: $blocks} + (if $all_passed == "false" then {text: "<!here> Training Smoke Test Failed"} else {} end)' \
+            > /tmp/slack-payload.json
+
+      - name: Notify Slack
         uses: slackapi/slack-github-action@v2
         with:
           method: chat.postMessage
           token: ${{ secrets.SLACK_BOT_TOKEN }}
-          payload: |
-            {
-              "channel": "${{ secrets.SLACK_CHANNEL_ID }}",
-              "text": "<!here> Training Smoke Test Failed",
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "Training Smoke Test Failed"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "<!here> *Status:* `${{ needs.smoke-test.outputs.status || 'N/A' }}`\n*Project:* `${{ needs.smoke-test.outputs.project_name || 'N/A' }}`\n*Project ID:* `${{ needs.smoke-test.outputs.project_id || 'N/A' }}`\n*Job ID:* `${{ needs.smoke-test.outputs.job_id || 'N/A' }}`\n*Error:* ${{ needs.smoke-test.outputs.error || 'N/A' }}"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View workflow run>"
-                  }
-                },
-                {
-                  "type": "context",
-                  "elements": [
-                    {
-                      "type": "mrkdwn",
-                      "text": "Failed projects are preserved for investigation."
-                    }
-                  ]
-                }
-              ]
-            }
+          payload-file-path: /tmp/slack-payload.json
diff --git a/bin/test_example.py b/bin/test_example.py
@@ -233,13 +233,24 @@ def load_training_project(config_path: Path):
 # ---------------------------------------------------------------------------
 
 
-def submit_job(config_path: Path, project_name: str, remote: str = "baseten") -> dict:
+def submit_job(
+    config_path: Path,
+    project_name: str,
+    remote: str = "baseten",
+    accelerator: str | None = None,
+) -> dict:
     """Submit a training job with a custom project name."""
+    from truss.base.truss_config import AcceleratorSpec
     from truss_train import definitions
     from truss_train.public_api import push
 
     project = load_training_project(config_path)
 
+    # Override accelerator if requested
+    if accelerator:
+        project.job.compute.accelerator = AcceleratorSpec.model_validate(accelerator)
+        print(f"  Accelerator overridden to: {project.job.compute.accelerator}")
+
     # Override the project name with our smoke-test-prefixed name
     custom_project = definitions.TrainingProject(
         name=project_name, job=project.job
@@ -771,6 +782,11 @@ def main():
         default="2h",
         help="Age threshold for sweep (default: 2h)",
     )
+    parser.add_argument(
+        "--accelerator",
+        type=str,
+        help="Override accelerator type and count (e.g., H200, H200:8)",
+    )
     parser.add_argument(
         "--remote",
         default="smoke-tests",
@@ -823,7 +839,7 @@ def main():
 
     try:
         # 1. Submit job (SDK — need structured IDs)
-        result = submit_job(config_path, project_name, remote=args.remote)
+        result = submit_job(config_path, project_name, remote=args.remote, accelerator=args.accelerator)
         project_id = result["training_project"]["id"]
         job_id = result["training_job"]["id"]
         summary["project_id"] = project_id