Skip to content

Commit edb4198

Browse files
fix: ensure conversion webhook CA bundles are injected during upgrades (#7772)
* fix: ensure conversion webhook CA bundles are injected during upgrades During upgrades with pre-existing LQ/CQ, the API server fails conversion with 'x509: certificate signed by unknown authority' because cert-controller doesn't inject CA bundles into CRD conversion webhook configurations. This fix manually injects CA bundles into kueue.x-k8s.io CRDs, registers webhooks before manager start so conversion endpoints are ready when caches sync. Fixes #7344 Signed-off-by: Sohan Kunkerkar <[email protected]> * test: add e2e coverage for upgrade with webhook conversion Signed-off-by: Sohan Kunkerkar <[email protected]> --------- Signed-off-by: Sohan Kunkerkar <[email protected]>
1 parent c4de9e5 commit edb4198

File tree

7 files changed

+485
-51
lines changed

7 files changed

+485
-51
lines changed

Makefile-test.mk

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ E2E_K8S_FULL_VERSION := $(or $(E2E_K8S_FULL_VERSION),$(E2E_K8S_VERSION).0)
4646
E2E_KIND_VERSION ?= kindest/node:v$(E2E_K8S_FULL_VERSION)
4747
E2E_RUN_ONLY_ENV ?= false
4848
E2E_USE_HELM ?= false
49+
KUEUE_UPGRADE_FROM_VERSION ?= v0.14.4
4950

5051
# For local testing, we should allow user to use different kind cluster name
5152
# Default will delete default kind cluster
@@ -129,6 +130,12 @@ test-e2e-customconfigs-helm: test-e2e-customconfigs
129130
.PHONY: test-e2e-certmanager
130131
test-e2e-certmanager: setup-e2e-env run-test-e2e-certmanager-$(E2E_KIND_VERSION:kindest/node:v%=%)
131132

133+
# Upgrade e2e entry points. Each one prepares the e2e environment and then
# delegates to the matching run-test-e2e-*upgrade-<k8s version> pattern rule;
# the version is E2E_KIND_VERSION with its "kindest/node:v" prefix stripped.
.PHONY: test-e2e-upgrade test-e2e-certmanager-upgrade
test-e2e-upgrade: setup-e2e-env run-test-e2e-upgrade-$(patsubst kindest/node:v%,%,$(E2E_KIND_VERSION))

test-e2e-certmanager-upgrade: setup-e2e-env run-test-e2e-certmanager-upgrade-$(patsubst kindest/node:v%,%,$(E2E_KIND_VERSION))
138+
132139
run-test-e2e-singlecluster-%: K8S_VERSION = $(@:run-test-e2e-singlecluster-%=%)
133140
run-test-e2e-singlecluster-%:
134141
@echo Running e2e for k8s ${K8S_VERSION}
@@ -202,6 +209,29 @@ run-test-e2e-certmanager-%:
202209
E2E_USE_HELM=$(E2E_USE_HELM) \
203210
./hack/e2e-test.sh
204211

212+
# Runs the "upgrade" e2e folder against a kind cluster for the Kubernetes
# version encoded in the target name (the pattern stem).
# KUEUE_UPGRADE_FROM_VERSION is forwarded to hack/e2e-test.sh so the scripts
# know which released version to upgrade from.
run-test-e2e-upgrade-%: K8S_VERSION = $(@:run-test-e2e-upgrade-%=%)
run-test-e2e-upgrade-%:
	@echo Running upgrade e2e for k8s $(K8S_VERSION)
	E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" \
		KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) \
		CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) \
		ARTIFACTS="$(ARTIFACTS)/$@" \
		IMAGE_TAG=$(IMAGE_TAG) \
		GINKGO_ARGS="$(GINKGO_ARGS)" \
		KIND_CLUSTER_FILE="kind-cluster.yaml" \
		E2E_TARGET_FOLDER="upgrade" \
		KUEUE_UPGRADE_FROM_VERSION=$(KUEUE_UPGRADE_FROM_VERSION) \
		TEST_LOG_LEVEL=$(TEST_LOG_LEVEL) \
		E2E_RUN_ONLY_ENV=$(E2E_RUN_ONLY_ENV) \
		./hack/e2e-test.sh
222+
223+
# Same as the plain upgrade run, but additionally forwards CERTMANAGER_VERSION
# to hack/e2e-test.sh.
# NOTE(review): this target name also matches the broader
# run-test-e2e-certmanager-% pattern. GNU make >= 3.82 prefers the rule with
# the shortest stem, so this rule wins there; older make versions resolve by
# definition order instead -- confirm the minimum supported make version.
run-test-e2e-certmanager-upgrade-%: K8S_VERSION = $(@:run-test-e2e-certmanager-upgrade-%=%)
run-test-e2e-certmanager-upgrade-%:
	@echo Running upgrade e2e for k8s $(K8S_VERSION)
	E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" \
		KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) \
		CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) \
		ARTIFACTS="$(ARTIFACTS)/$@" \
		IMAGE_TAG=$(IMAGE_TAG) \
		GINKGO_ARGS="$(GINKGO_ARGS)" \
		KIND_CLUSTER_FILE="kind-cluster.yaml" \
		E2E_TARGET_FOLDER="upgrade" \
		KUEUE_UPGRADE_FROM_VERSION=$(KUEUE_UPGRADE_FROM_VERSION) \
		CERTMANAGER_VERSION=$(CERTMANAGER_VERSION) \
		TEST_LOG_LEVEL=$(TEST_LOG_LEVEL) \
		E2E_RUN_ONLY_ENV=$(E2E_RUN_ONLY_ENV) \
		./hack/e2e-test.sh
234+
205235
SCALABILITY_RUNNER := $(BIN_DIR)/performance-scheduler-runner
206236
.PHONY: performance-scheduler-runner
207237
performance-scheduler-runner:

cmd/kueue/main.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,16 @@ func main() {
205205
kubeConfig.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(*cfg.ClientConnection.QPS, int(*cfg.ClientConnection.Burst))
206206
}
207207
setupLog.V(2).Info("K8S Client", "qps", *cfg.ClientConnection.QPS, "burst", *cfg.ClientConnection.Burst)
208+
209+
// Bootstrap certificates before creating the main manager
210+
// This ensures certs are ready and CA bundles are injected into conversion CRDs
211+
if cfg.InternalCertManagement != nil && *cfg.InternalCertManagement.Enable {
212+
if err := cert.BootstrapCerts(kubeConfig, cfg); err != nil {
213+
setupLog.Error(err, "Unable to bootstrap certificates")
214+
os.Exit(1)
215+
}
216+
}
217+
208218
mgr, err := ctrl.NewManager(kubeConfig, options)
209219
if err != nil {
210220
setupLog.Error(err, "Unable to start manager")
@@ -266,15 +276,15 @@ func main() {
266276
os.Exit(1)
267277
}
268278

269-
// Cert won't be ready until manager starts, so start a goroutine here which
270-
// will block until the cert is ready before setting up the controllers.
271-
// Controllers who register after manager starts will start directly.
272-
go func() {
273-
if err := setupControllers(ctx, mgr, cCache, queues, certsReady, &cfg, serverVersionFetcher); err != nil {
274-
setupLog.Error(err, "Unable to setup controllers")
275-
os.Exit(1)
276-
}
277-
}()
279+
if err := setupControllers(ctx, mgr, cCache, queues, &cfg, serverVersionFetcher); err != nil {
280+
setupLog.Error(err, "Unable to setup controllers")
281+
os.Exit(1)
282+
}
283+
284+
if failedWebhook, err := webhooks.Setup(mgr); err != nil {
285+
setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook)
286+
os.Exit(1)
287+
}
278288

279289
go queues.CleanUpOnContext(ctx)
280290
go cCache.CleanUpOnContext(ctx)
@@ -331,11 +341,7 @@ func setupIndexes(ctx context.Context, mgr ctrl.Manager, cfg *configapi.Configur
331341
return jobframework.SetupIndexes(ctx, mgr.GetFieldIndexer(), opts...)
332342
}
333343

334-
func setupControllers(ctx context.Context, mgr ctrl.Manager, cCache *schdcache.Cache, queues *qcache.Manager, certsReady chan struct{}, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher) error {
335-
// The controllers won't work until the webhooks are operating, and the webhook won't work until the
336-
// certs are all in place.
337-
cert.WaitForCertsReady(setupLog, certsReady)
338-
344+
func setupControllers(ctx context.Context, mgr ctrl.Manager, cCache *schdcache.Cache, queues *qcache.Manager, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher) error {
339345
if failedCtrl, err := core.SetupControllers(mgr, queues, cCache, cfg); err != nil {
340346
return fmt.Errorf("unable to create controller %s: %w", failedCtrl, err)
341347
}
@@ -401,10 +407,6 @@ func setupControllers(ctx context.Context, mgr ctrl.Manager, cCache *schdcache.C
401407
}
402408
}
403409

404-
if failedWebhook, err := webhooks.Setup(mgr); err != nil {
405-
return fmt.Errorf("unable to create webhook %s: %w", failedWebhook, err)
406-
}
407-
408410
opts := []jobframework.Option{
409411
jobframework.WithManageJobsWithoutQueueName(cfg.ManageJobsWithoutQueueName),
410412
jobframework.WithWaitForPodsReady(cfg.WaitForPodsReady),
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# The following patch adds an annotation directing cert-manager to inject the CA into the localqueues.kueue.x-k8s.io CRD
2+
apiVersion: apiextensions.k8s.io/v1
3+
kind: CustomResourceDefinition
4+
metadata:
5+
annotations:
6+
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME)
7+
name: localqueues.kueue.x-k8s.io

hack/e2e-common.sh

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ if [[ -n "${CERTMANAGER_VERSION:-}" ]]; then
7272
export CERTMANAGER_MANIFEST="https://github.com/cert-manager/cert-manager/releases/download/${CERTMANAGER_VERSION}/cert-manager.yaml"
7373
fi
7474

75+
if [[ -n "${KUEUE_UPGRADE_FROM_VERSION:-}" ]]; then
76+
export KUEUE_OLD_VERSION_MANIFEST="https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_UPGRADE_FROM_VERSION}/manifests.yaml"
77+
fi
78+
7579
# agnhost image to use for testing.
7680
E2E_TEST_AGNHOST_IMAGE_OLD_WITH_SHA=registry.k8s.io/e2e-test-images/agnhost:2.52@sha256:b173c7d0ffe3d805d49f4dfe48375169b7b8d2e1feb81783efd61eb9d08042e6
7781
export E2E_TEST_AGNHOST_IMAGE_OLD=${E2E_TEST_AGNHOST_IMAGE_OLD_WITH_SHA%%@*}
@@ -141,13 +145,21 @@ function prepare_docker_images {
141145
if [[ -n ${LEADERWORKERSET_VERSION:-} ]]; then
142146
docker pull "${LEADERWORKERSET_IMAGE}"
143147
fi
148+
if [[ -n ${KUEUE_UPGRADE_FROM_VERSION:-} ]]; then
149+
local current_image="${IMAGE_TAG%:*}:${KUEUE_UPGRADE_FROM_VERSION}"
150+
docker pull "${current_image}"
151+
fi
144152
}
145153

146154
# Load every image the e2e run needs into the given kind cluster: both agnhost
# images and the Kueue image under test, plus the old Kueue image when an
# upgrade test is requested via KUEUE_UPGRADE_FROM_VERSION.
# $1 cluster
function cluster_kind_load {
    local cluster="$1"
    local image

    for image in "${E2E_TEST_AGNHOST_IMAGE_OLD}" "${E2E_TEST_AGNHOST_IMAGE}" "$IMAGE_TAG"; do
        cluster_kind_load_image "$cluster" "$image"
    done

    # The old image shares the repository of IMAGE_TAG; only the tag differs.
    if [[ -n ${KUEUE_UPGRADE_FROM_VERSION:-} ]]; then
        cluster_kind_load_image "$cluster" "${IMAGE_TAG%:*}:${KUEUE_UPGRADE_FROM_VERSION}"
    fi
}
152164

153165
# $1 cluster
@@ -218,6 +230,7 @@ function deploy_with_certmanager() {
218230
cd "${ROOT_DIR}/config/components/crd" || exit
219231
$KUSTOMIZE edit add patch --path "patches/cainjection_in_clusterqueues.yaml"
220232
$KUSTOMIZE edit add patch --path "patches/cainjection_in_cohorts.yaml"
233+
$KUSTOMIZE edit add patch --path "patches/cainjection_in_localqueues.yaml"
221234
$KUSTOMIZE edit add patch --path "patches/cainjection_in_resourceflavors.yaml"
222235
$KUSTOMIZE edit add patch --path "patches/cainjection_in_workloads.yaml"
223236
)
@@ -240,6 +253,12 @@ function deploy_with_certmanager() {
240253

241254
# $1 kubeconfig
242255
function cluster_kueue_deploy {
256+
# Handle upgrade test mode
257+
if [[ -n ${KUEUE_UPGRADE_FROM_VERSION:-} ]]; then
258+
upgrade_test_flow "$1"
259+
return
260+
fi
261+
# Normal deployment flows
243262
if [[ -n ${CERTMANAGER_VERSION:-} ]]; then
244263
kubectl -n cert-manager wait --for condition=ready pod \
245264
-l app.kubernetes.io/instance=cert-manager \
@@ -274,8 +293,7 @@ function helm_install {
274293
function build_and_apply_kueue_manifests {
275294
local build_output
276295
build_output=$($KUSTOMIZE build "$2")
277-
# shellcheck disable=SC2001 # bash parameter substitution does not work on macOS
278-
build_output=$(echo "$build_output" | sed "s/kueue-system/$KUEUE_NAMESPACE/g")
296+
build_output=${build_output//kueue-system/$KUEUE_NAMESPACE}
279297
echo "$build_output" | kubectl apply --kubeconfig="$1" --server-side -f -
280298
}
281299

@@ -405,3 +423,79 @@ EOF
405423
--user="$kind_name"
406424
fi
407425
}
426+
427+
# Upgrade test flow: install the released version named by
# KUEUE_UPGRADE_FROM_VERSION, create v1beta1 resources against it, then apply
# the current manifests on top so the existing deployment is upgraded in place.
#
# $1 kubeconfig
#
# Reads: KUEUE_UPGRADE_FROM_VERSION, KUEUE_OLD_VERSION_MANIFEST, IMAGE_TAG,
#        KUSTOMIZE, ROOT_DIR, KUEUE_NAMESPACE.
# NOTE(review): CERTMANAGER_VERSION is not consulted here; the upgrade step
# always builds test/e2e/config/default -- confirm this is intended for the
# certmanager upgrade target.
function upgrade_test_flow {
    local old_version="${KUEUE_UPGRADE_FROM_VERSION}"
    # The old image lives in the same repository as IMAGE_TAG; only the tag differs.
    local old_image="${IMAGE_TAG%:*}:${old_version}"

    echo "Upgrade Test: $old_version -> current"
    echo "Old image: $old_image"
    echo "New image: $IMAGE_TAG"

    # Step 1: Install old version
    echo "Installing $old_version..."
    echo "  Manifest URL: ${KUEUE_OLD_VERSION_MANIFEST}"
    echo "  Downloading and modifying manifests..."

    # Download and modify manifests inline:
    #  - point the controller at the image pulled for the old version
    #  - switch the pull policy from Always to IfNotPresent
    # curl -f fails on HTTP errors (e.g. an unknown release tag) instead of
    # piping the error page into kubectl; -S keeps the error message visible.
    curl -fsSL "${KUEUE_OLD_VERSION_MANIFEST}" | \
        sed -e "s|registry.k8s.io/kueue/kueue:${old_version}|${old_image}|g" \
            -e 's|imagePullPolicy: Always|imagePullPolicy: IfNotPresent|g' | \
        kubectl apply --kubeconfig="$1" --server-side -f -

    # Use the same kubeconfig as every other step so the wait targets the
    # cluster that was just installed into.
    kubectl wait --kubeconfig="$1" --for=condition=available --timeout=180s \
        deployment/kueue-controller-manager -n kueue-system
    echo "$old_version ready"

    # Step 2: Create test resources
    echo "Creating test resources..."
    kubectl apply --kubeconfig="$1" -f - <<EOF
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: upgrade-test-flavor
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: upgrade-test-cq
spec:
  namespaceSelector: {}
  resourceGroups:
  - coveredResources: ["cpu", "memory"]
    flavors:
    - name: upgrade-test-flavor
      resources:
      - name: "cpu"
        nominalQuota: 10
      - name: "memory"
        nominalQuota: 10Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: upgrade-test-lq
  namespace: default
spec:
  clusterQueue: upgrade-test-cq
EOF
    echo "✓ Resources created"

    # Step 3: Upgrade to current (rolling update)
    echo "Upgrading to current..."

    # Apply upgrade - rolling update will replace pods.
    # Runs in a subshell so the EXIT trap that restores the manager image
    # fires when the subshell ends, not when the whole script exits.
    (
        set_managers_image
        trap restore_managers_image EXIT

        local build_output
        build_output=$($KUSTOMIZE build "${ROOT_DIR}/test/e2e/config/default")
        build_output=${build_output//kueue-system/$KUEUE_NAMESPACE}
        # --force-conflicts takes over fields owned by the old install's apply.
        echo "$build_output" | kubectl apply --kubeconfig="$1" --server-side --force-conflicts -f -
    )

    echo "✓ Upgrade complete (rolling update in progress)"
    echo "========================================="
}

0 commit comments

Comments
 (0)