From faa2cd446cf4d3c3b296e44b0344952df30367e0 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Mon, 27 Apr 2026 17:41:23 -0400 Subject: [PATCH 1/3] Add transient error handling across all three controllers All three reconcilers (CloudConfigReconciler, TrustedCABundleReconciler, CloudOperatorReconciler) now distinguish between transient errors (API blips) and permanent errors (corrupt config, unsupported platform): - Transient errors are silently requeued and only set Degraded=True after a threshold has elapsed (2m for sub-controllers, 2m30s for the main controller). lastTransientFailureAt is tracked so a stale failure window is restarted rather than causing immediate degradation when errors resume after a long gap. - Permanent errors set Degraded=True immediately and return nil so controller-runtime does not requeue; existing watches re-trigger reconciliation when the underlying data changes. - Infrastructure NotFound returns Available (nil) rather than Degraded, matching the main controller's existing behaviour. Reconcile methods are refactored to use a deferred dispatcher pattern with permanent()/isPermanent() helpers. Each error site returns ctrl.Result{}, err (transient) or ctrl.Result{}, permanent(err) (permanent); a single deferred function routes to the correct handler and ensures clearFailureWindow() is called on every nil-error return path. Additional fixes and improvements: - Use controller-runtime's TerminalError instead of a local type and errors.Is instead of type casting; consolidate failure-tracking fields into their own struct. - Fix race in cloud config sync test by using a separate reconciler instance for direct Reconcile() calls rather than sharing the manager's instance across goroutines. - Fix aggregatedTransientDegradedThreshold mismatch in the main controller test that prevented the degraded condition from ever being set. 
- Document known limitations identified in QC review (status-write failures bypassing the transient threshold, redundant setStatusDegraded call in provisioningAllowed path). - Use specific Gomega matchers or provide failure descriptions per project conventions. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 174 ++++++++++++---- .../cloud_config_sync_controller_test.go | 82 ++++++-- pkg/controllers/clusteroperator_controller.go | 115 ++++++++--- .../clusteroperator_controller_test.go | 152 +++++++++++--- .../trusted_ca_bundle_controller.go | 104 +++++++--- .../trusted_ca_bundle_controller_test.go | 187 ++++++++++++++++-- 6 files changed, 656 insertions(+), 158 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index a3c694c3a..0d193ad51 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -2,11 +2,14 @@ package controllers import ( "context" + "errors" "fmt" "reflect" + "sync" + "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -14,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/api/features" @@ -30,6 +34,12 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. 
This prevents brief + // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. + transientDegradedThreshold = 2 * time.Minute ) // shouldManageManagedConfigMap returns true if CCCMO should manage the @@ -78,34 +88,81 @@ type CloudConfigReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme FeatureGateAccess featuregates.FeatureGateAccess + failures failureWindow +} + +// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use. +type failureWindow struct { + mu sync.Mutex + consecutiveFailureSince *time.Time + lastTransientFailureAt *time.Time +} + +// clear resets the failure window. Call this on every successful reconcile. +func (fw *failureWindow) clear() { + fw.mu.Lock() + defer fw.mu.Unlock() + fw.consecutiveFailureSince = nil + fw.lastTransientFailureAt = nil +} + +// observe records a transient failure at now and returns the elapsed time since +// the window started plus a boolean indicating whether the window was just opened +// or restarted. staleAfter controls stale-window detection: if the gap since the +// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable. 
+func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) { + fw.mu.Lock() + defer fw.mu.Unlock() + stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter + if fw.consecutiveFailureSince == nil || stale { + fw.consecutiveFailureSince = &now + fw.lastTransientFailureAt = &now + return 0, true + } + fw.lastTransientFailureAt = &now + return now.Sub(*fw.consecutiveFailureSince), false } -func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleTerminalError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. 
Skipping...") + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, nil + } else if err != nil { + return ctrl.Result{}, err // transient } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil platformStatus is a terminal misconfiguration. + return ctrl.Result{}, reconcile.TerminalError(err) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -135,11 +192,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. 
klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) } platformType := infra.Status.PlatformStatus.Type @@ -161,14 +216,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil { managedConfigFound = true - } else if errors.IsNotFound(err) { + } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return ctrl.Result{}, err // transient } } @@ -179,7 +230,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) Name: infra.Spec.CloudConfig.Name, Namespace: OpenshiftConfigNamespace, } - if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { + if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) { klog.Warningf("cloud-config not found in either openshift-config-managed or openshift-config namespace") // For platforms we manage, create a minimal valid config that will be populated by the transformer if shouldManageManagedConfigMap(platformType, features) { @@ -200,23 +251,30 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config 
controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes. + return ctrl.Result{}, reconcile.TerminalError(err) } // Apply transformer if needed + if r.FeatureGateAccess == nil { + // Operator misconfiguration at startup: ermanent. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured")) + } + + features, err = r.FeatureGateAccess.CurrentFeatureGates() + if err != nil { + // The feature-gate informer may not have synced yet: transient. + klog.Errorf("unable to get feature gates: %v", err) + return ctrl.Result{}, err // transient + } if cloudConfigTransformerFn != nil { // We ignore stuff in sourceCM.BinaryData. This isn't allowed to // contain any key that overlaps with those found in sourceCM.Data and // we're not expecting users to put their data in the former. output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(err) } sourceCM.Data[defaultConfigKey] = output } @@ -229,17 +287,14 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.setDegradedCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } } // Sync the transformed config to the target configmap for CCM consumption if err := r.syncCloudConfigData(ctx, sourceCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } if err := r.setAvailableCondition(ctx); err != nil { @@ -249,6 +304,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. Called by the deferred +// dispatcher in Reconcile on every successful (nil-error) return path. +func (r *CloudConfigReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleTerminalError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") @@ -335,7 +429,7 @@ func (r *CloudConfigReconciler) syncConfigMapToTarget(ctx context.Context, sourc // Check if target exists err := r.Get(ctx, targetKey, targetCM) - if err != nil && !errors.IsNotFound(err) { + if err != nil && !apierrors.IsNotFound(err) { return fmt.Errorf("failed to get target configmap %s/%s: %w", targetNamespace, targetName, err) } diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 85cd2d189..ba217895f 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -141,7 +141,7 @@ var _ = Describe("isCloudConfigEqual reconciler method", func() { } It("should return 'true' if ConfigMaps content are equal", func() { - Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue()) + Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue(), "configmaps with identical content should be considered equal") }) It("should return 'false' if ConfigMaps content are not equal", func() { @@ -164,8 +164,7 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { managedCloudConfig := 
makeManagedCloudConfig(configv1.AzurePlatformType) It("not prepared config should be different with managed one", func() { - _, ok := infraCloudConfig.Data[infraCloudConfKey] - Expect(ok).Should(BeTrue()) + Expect(infraCloudConfig.Data).To(HaveKey(infraCloudConfKey)) Expect(reconciler.isCloudConfigEqual(infraCloudConfig, managedCloudConfig)).Should(BeFalse()) }) @@ -174,9 +173,8 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { Expect(err).Should(Succeed()) _, ok := preparedConfig.Data[infraCloudConfKey] Expect(ok).Should(BeFalse()) - _, ok = preparedConfig.Data[defaultConfigKey] - Expect(ok).Should(BeTrue()) - Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue()) + Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey)) + Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue(), "prepared config should have content equal to the managed cloud config") }) It("config preparation should not touch extra fields in infra ConfigMap", func() { @@ -184,8 +182,7 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { extendedInfraConfig.Data = map[string]string{infraCloudConfKey: "{}", "{}": "{}"} preparedConfig, err := reconciler.prepareSourceConfigMap(extendedInfraConfig, infra) Expect(err).Should(Succeed()) - _, ok := preparedConfig.Data[defaultConfigKey] - Expect(ok).Should(BeTrue()) + Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey)) Expect(len(preparedConfig.Data)).Should(BeEquivalentTo(2)) }) }) @@ -370,8 +367,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. 
+ // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. + freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) @@ -531,7 +546,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -540,8 +555,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if 
co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) It("should continue with reconcile when feature gates are available", func() { @@ -606,16 +632,40 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + 
break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index da94230bf..afa2c3d72 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,22 +19,25 @@ package controllers import ( "context" "crypto/tls" + "errors" "fmt" "sort" "strings" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/library-go/pkg/cloudprovider" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" "github.com/openshift/cluster-cloud-controller-manager-operator/pkg/cloud" @@ -48,6 +51,13 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. 
+ aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object @@ -58,6 +68,7 @@ type CloudOperatorReconciler struct { ImagesFile string FeatureGateAccess featuregates.FeatureGateAccess TLSConfig func(*tls.Config) + failures failureWindow } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -66,65 +77,68 @@ type CloudOperatorReconciler struct { // +kubebuilder:rbac:groups=config.openshift.io,resources=infrastructures,verbs=get;list;watch // Reconcile will process the cloud-controller-manager clusterOperator -func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (result ctrl.Result, retErr error) { conditionOverrides := []configv1.ClusterOperatorStatusCondition{} + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with permanent()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr) + } else { + result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, err } - - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } + // Known limitation: when provisioningAllowed internally calls setStatusDegraded + // (e.g. a sub-controller has Degraded=True, or IsCloudProviderExternal errors), + // it returns a non-nil error. Reconcile passes that error to handleTransientError, + // which starts the 2m30s window. After the threshold, handleTransientError calls + // setStatusDegraded again — redundant but harmless, since status is already degraded. + // This is a consequence of keeping status-setting inside provisioningAllowed rather + // than pushing it into Reconcile. 
allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return ctrl.Result{}, err // transient; status already set inside provisioningAllowed } else if !allowedToProvision { - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } clusterProxy := &configv1.Proxy{} - if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return 
ctrl.Result{}, err // transient } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -137,7 +151,44 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger } func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..43f5c8c53 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -53,12 +54,15 @@ var _ = Describe("Cluster Operator status controller", func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), operator) - return err == nil || apierrors.IsNotFound(err) - }).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) }) type testCase struct { @@ -89,14 +93,17 @@ var _ = Describe("Cluster Operator status controller", func() { Expect(err).To(Succeed()) getOp := &configv1.ClusterOperator{} - Eventually(func() (bool, error) { + Eventually(func() error { err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, getOp) if err != nil { - return false, err + return err } // Successful sync means CO exists and the status is not empty - return getOp != nil && len(getOp.Status.Versions) > 0, nil - }, timeout).Should(BeTrue()) + if getOp == nil || len(getOp.Status.Versions) == 0 { + return fmt.Errorf("ClusterOperator status versions not yet populated") + } + return nil + }, timeout).Should(Succeed()) // check version. Expect(getOp.Status.Versions).To(HaveLen(1)) @@ -104,13 +111,13 @@ var _ = Describe("Cluster Operator status controller", func() { Expect(getOp.Status.Versions[0].Version).To(Equal(expectedVersion)) // check conditions. 
- Expect(v1helpers.IsStatusConditionTrue(getOp.Status.Conditions, configv1.OperatorAvailable)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorAvailable).Status).To(Equal(configv1.ConditionTrue)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorAvailable).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionTrue(getOp.Status.Conditions, configv1.OperatorUpgradeable)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorUpgradeable).Status).To(Equal(configv1.ConditionTrue)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorUpgradeable).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionFalse(getOp.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionFalse)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorDegraded).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionFalse(getOp.Status.Conditions, configv1.OperatorProgressing)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorProgressing).Status).To(Equal(configv1.ConditionFalse)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorProgressing).Reason).To(Equal(ReasonAsExpected)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, cloudControllerOwnershipCondition)).To(BeNil()) @@ -292,7 +299,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") // two resources should report successful update, deployment and pdb 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) @@ -315,14 +322,14 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) dep.Spec.Replicas = ptr.To[int32](20) updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // No update as resource didn't change @@ -353,7 +360,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") // three resources should report successful update, deployment, pdb and service Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) @@ -382,7 +389,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually changing the port number @@ -415,7 +422,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the port has been reverted back and there is only one item in the list @@ -442,7 +449,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually adding another port @@ -474,7 +481,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the port list has been reverted back and there is only one item in the list @@ -505,7 +512,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually inserting a new label @@ -531,7 +538,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the new label is still there @@ -558,7 +565,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Now the deployment has just one label "k8s-app: aws-cloud-controller-manager" @@ -587,7 +594,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the label value has been reverted and there is only one item in the map @@ -601,20 +608,103 @@ var _ = Describe("Apply resources should", func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), co) - return err == nil || 
apierrors.IsNotFound(err) - }, timeout).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }, timeout).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) for _, operand := range resources { Expect(cl.Delete(context.Background(), operand)).To(Succeed()) - Eventually(func() bool { - return apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKeyFromObject(operand), operand)) - }, timeout).Should(BeTrue()) + Eventually(func() error { + err := cl.Get(context.Background(), client.ObjectKeyFromObject(operand), operand) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected operand %s to be deleted", operand.GetName()) + }, timeout).Should(Succeed()) + } + }) + +}) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() error { + err := cl.Delete(ctx, co) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) + } + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: 
scheme.Scheme, } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionTrue)) }) + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionTrue)) + }) }) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..97ca33baa 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/x509" + "errors" "fmt" "os" @@ -43,6 +44,7 @@ type TrustedCABundleReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme trustBundlePath string + failures failureWindow } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. @@ -50,9 +52,32 @@ func isSpecTrustedCASet(proxyConfig *configv1.ProxySpec) bool { return len(proxyConfig.TrustedCA.Name) > 0 } -func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("%s emitted event, syncing %s ConfigMap", req, trustedCAConfigMapName) + // partialRun is set to true on the early-exit path where the event is for + // an unrelated ConfigMap. That path returns available=true but should NOT + // reset an ongoing transient failure window from a previous full reconcile. + partialRun := false + + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with terminal()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // Nil-error paths clear the failure window unless partialRun is set. 
+ defer func() { + if retErr == nil { + if !partialRun { + r.clearFailureWindow() + } + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + proxyConfig := &configv1.Proxy{} if err := r.Get(ctx, types.NamespacedName{Name: proxyResourceName}, proxyConfig); err != nil { if apierrors.IsNotFound(err) { @@ -62,62 +87,89 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - return reconcile.Result{}, nil + return reconcile.Result{}, nil // defer clears failure window } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. - // If not, return early + // If not, return early without resetting the failure window (partialRun=true). 
if req.Namespace == OpenshiftConfigNamespace && proxyConfig.Spec.TrustedCA.Name != req.Name { + partialRun = true + klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - - klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient } if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + // Pass transientDegradedThreshold as the stale-window threshold to detect gaps + // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). 
+ elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold) + if started { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } return ctrl.Result{}, nil } diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..8ad46dc6c 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -164,27 +165,26 @@ var _ = Describe("Trusted CA bundle sync controller", func() { co := &v1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), co) - return err == nil || apierrors.IsNotFound(err) - }).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) if proxyResource != nil { Expect(cl.Delete(ctx, proxyResource, deleteOptions)).To(Succeed()) - Eventually( - apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(proxyResource), &v1.Proxy{})), - ).Should(BeTrue()) + Expect(cl.Get(ctx, 
client.ObjectKeyFromObject(proxyResource), &v1.Proxy{})).To(MatchError(apierrors.IsNotFound, "Proxy should have been deleted")) } allCMs := &corev1.ConfigMapList{} Expect(cl.List(ctx, allCMs)).To(Succeed()) for _, cm := range allCMs.Items { Expect(cl.Delete(ctx, cm.DeepCopy(), deleteOptions)).To(Succeed()) - Eventually( - apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(cm.DeepCopy()), &corev1.ConfigMap{})), - ).Should(BeTrue()) + Expect(cl.Get(ctx, client.ObjectKeyFromObject(cm.DeepCopy()), &corev1.ConfigMap{})).To(MatchError(apierrors.IsNotFound, "ConfigMap should have been deleted")) } proxyResource = nil @@ -294,9 +294,16 @@ var _ = Describe("Trusted CA bundle sync controller", func() { It("merged bundle should be generated without cloud-config at all", func() { Expect(cl.Delete(ctx, syncedCloudConfigConfigMap)).To(Succeed()) - Eventually(func() bool { - return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(syncedCloudConfigConfigMap), &corev1.ConfigMap{})) - }).Should(BeTrue()) + Eventually(func() error { + err := cl.Get(ctx, client.ObjectKeyFromObject(syncedCloudConfigConfigMap), &corev1.ConfigMap{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected synced cloud config ConfigMap to be deleted") + }).Should(Succeed()) syncedCloudConfigConfigMap = nil mergedTrustedCA := &corev1.ConfigMap{} @@ -309,6 +316,160 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() error { + err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected 
ClusterOperator to be deleted") + }).Should(Succeed()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("stale failure window should be restarted when gap since last error exceeds threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the reconcile progresses to getSystemTrustBundle. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // Step 1: First transient error; failure window opens at T0. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + + // Step 2: Advance clock past the threshold — simulates a gap with no reconciles + // (e.g., system recovered, no events fired for a long time). 
+ fakeClock.Step(transientDegradedThreshold + time.Second) + + // Step 3: New transient error arrives. The stale-window logic should detect that + // lastTransientFailureAt is more than the threshold ago and restart the window + // from 'now', NOT degrade immediately. + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), + "should not degrade immediately after a gap — window should restart") + } + } + } + // consecutiveFailureSince should now be 'now', not the original T0. + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + Expect(fakeClock.Now().Sub(*reconciler.failures.consecutiveFailureSince)).To(BeNumerically("<", time.Second), + "window should have been restarted to ~now, not retained from original T0") + }) + + It("should not degrade on transient error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile at T0: transient failure starts; error is returned but no degraded condition set. 
+ _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock to mid-window (half the threshold) and reconcile again to simulate + // continuous failures. This updates lastTransientFailureAt, keeping it fresh. + fakeClock.Step(transientDegradedThreshold / 2) + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock so total elapsed from T0 exceeds the threshold, but the gap since the + // most recent failure (lastTransientFailureAt) is less than the threshold. This ensures + // the degradation path is taken rather than the stale-window restart path. + fakeClock.Step(transientDegradedThreshold/2 + time.Second) + + // Final reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{ From e8ad7903e99b89f0777a8a54e16697e4fcf888e8 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Tue, 28 Apr 2026 12:57:55 -0400 Subject: [PATCH 2/3] Maintain error context Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 24 +++++++++---------- pkg/controllers/clusteroperator_controller.go | 20 ++++++++-------- .../trusted_ca_bundle_controller.go | 14 +++++------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 0d193ad51..2425e553a 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -151,18 +151,18 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } return ctrl.Result{}, nil } else if err != nil { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err) } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get cluster Network: %w", err) } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { // nil 
platformStatus is a terminal misconfiguration. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to check cloud config sync requirements: %w", err)) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -194,7 +194,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: %w", err)) } platformType := infra.Status.PlatformStatus.Type @@ -219,7 +219,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") } else { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get managed cloud config: %w", err) } } @@ -252,12 +252,12 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to prepare source cloud config: %w", err)) } // Apply transformer if needed if r.FeatureGateAccess == nil { - // Operator misconfiguration at startup: ermanent. + // Operator misconfiguration at startup: Permanent. 
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured")) } @@ -265,7 +265,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get feature gates: %w", err) } if cloudConfigTransformerFn != nil { // We ignore stuff in sourceCM.BinaryData. This isn't allowed to @@ -274,7 +274,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { // Platform-specific transform failed on the current config data: terminal. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to transform cloud config: %w", err)) } sourceCM.Data[defaultConfigKey] = output } @@ -294,7 +294,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Sync the transformed config to the target configmap for CCM consumption if err := r.syncCloudConfigData(ctx, sourceCM); err != nil { klog.Errorf("unable to sync cloud config") - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to sync cloud config: %w", err) } if err := r.setAvailableCondition(ctx); err != nil { @@ -326,7 +326,7 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er } klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, err } @@ -338,7 +338,7 @@ func 
(r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, nil } diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index afa2c3d72..94f95d1ff 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -101,12 +101,12 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) klog.Infof("Infrastructure cluster does not exist. Skipping...") if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) } return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err) } // Known limitation: when provisioningAllowed internally calls setStatusDegraded @@ -119,7 +119,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err // transient; status already set inside provisioningAllowed + return ctrl.Result{}, fmt.Errorf("failed to check if provisioning is allowed: %w", err) } else if 
!allowedToProvision { return ctrl.Result{}, nil // defer clears failure window } @@ -127,28 +127,28 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Proxy: %w", err) } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to build operator config: %w", err)) } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to sync operands: %w", err) } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) } if err := r.clearCloudControllerOwnerCondition(ctx); err != nil { klog.Errorf("Unable to clear CloudControllerOwner condition: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("failed to clear CloudControllerOwner condition: %w", err) } return ctrl.Result{}, nil // defer clears failure window @@ -174,7 +174,7 @@ func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, cond } klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); 
setErr != nil { - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", setErr) } return ctrl.Result{}, err } @@ -186,7 +186,7 @@ func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, cond func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", setErr) } return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger } diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 97ca33baa..34632bd6b 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -90,7 +90,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return reconcile.Result{}, nil // defer clears failure window } // Non-NotFound: transient API error. - return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %w", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. @@ -107,24 +107,24 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { // Node cert store may be updating during upgrade: transient. 
- return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %w", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { // Combined cert bundle is corrupt: terminal. - return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %w", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { // Combined cert bundle is corrupt: terminal. - return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %w", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %w", err) // transient } if err := r.setAvailableCondition(ctx); err != nil { @@ -156,7 +156,7 @@ func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, er } klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, err } @@ -168,7 +168,7 @@ func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, er func (r 
*TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, nil } From 05b8726d7c36bf46e8dfdcedd324397e42805cf0 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Mon, 4 May 2026 12:28:21 -0400 Subject: [PATCH 3/3] Consolidate failure handling for controller Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 81 +++++++++---------- .../trusted_ca_bundle_controller.go | 49 ++--------- 2 files changed, 46 insertions(+), 84 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 2425e553a..bced4fc37 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -123,6 +123,38 @@ func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elaps return now.Sub(*fw.consecutiveFailureSince), false } +// handleTransient records a transient failure and degrades only after threshold has elapsed. +// name labels log messages. staleAfter controls stale-window restart (pass 0 to disable). +// setDegraded is invoked only when the threshold is exceeded. +// Always returns a non-nil error so controller-runtime requeues with exponential backoff. 
+func (fw *failureWindow) handleTransient(now time.Time, staleAfter, threshold time.Duration, name string, err error, setDegraded func() error) (ctrl.Result, error) { + elapsed, started := fw.observe(now, staleAfter) + if started { + klog.V(4).Infof("%s: transient failure started (%v), will degrade after %s", name, err, threshold) + return ctrl.Result{}, err + } + if elapsed < threshold { + klog.V(4).Infof("%s: transient failure ongoing for %s (threshold %s): %v", name, elapsed, threshold, err) + return ctrl.Result{}, err + } + klog.Warningf("%s: transient failure exceeded threshold (%s), setting degraded: %v", name, elapsed, err) + if setErr := setDegraded(); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) + } + return ctrl.Result{}, err +} + +// handleTerminal degrades immediately and returns nil so controller-runtime does not requeue. +// An existing watch on the relevant resource will re-trigger reconciliation when fixed. +// name labels log messages. +func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) { + klog.Errorf("%s: terminal error, setting degraded: %v", name, err) + if setErr := setDegraded(); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") @@ -132,13 +164,17 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // All nil-error paths clear the failure window. 
defer func() { if retErr == nil { - r.clearFailureWindow() + r.failures.clear() return } if errors.Is(retErr, reconcile.TerminalError(nil)) { - result, retErr = r.handleTerminalError(ctx, retErr) + result, retErr = r.failures.handleTerminal("CloudConfigReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } else { - result, retErr = r.handleTransientError(ctx, retErr) + result, retErr = r.failures.handleTransient(r.Clock.Now(), 0, transientDegradedThreshold, "CloudConfigReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } }() @@ -304,45 +340,6 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } -// clearFailureWindow resets the transient-error tracking. Called by the deferred -// dispatcher in Reconcile on every successful (nil-error) return path. -func (r *CloudConfigReconciler) clearFailureWindow() { - r.failures.clear() -} - -// handleTransientError records the start of a failure window and degrades the -// controller only after transientDegradedThreshold has elapsed. It always -// returns a non-nil error so controller-runtime requeues with exponential backoff. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - elapsed, started := r.failures.observe(r.Clock.Now(), 0) - if started { - klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) - return ctrl.Result{}, err - } - if elapsed < transientDegradedThreshold { - klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) - return ctrl.Result{}, err - } - klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, err -} - -// handleTerminalError sets CloudConfigControllerDegraded=True immediately and -// returns nil so controller-runtime does NOT requeue. An existing watch on the -// relevant resource will re-trigger reconciliation when the problem is fixed. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { - klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, nil -} - func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 34632bd6b..1a5e01050 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -67,14 +67,18 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ defer func() { if retErr == nil { if !partialRun { - r.clearFailureWindow() + r.failures.clear() } return } if errors.Is(retErr, reconcile.TerminalError(nil)) { - result, retErr = r.handleDegradeError(ctx, retErr) + result, retErr = r.failures.handleTerminal("TrustedCABundleReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } else { - result, retErr = r.handleTransientError(ctx, retErr) + result, retErr = r.failures.handleTransient(r.Clock.Now(), transientDegradedThreshold, transientDegradedThreshold, "TrustedCABundleReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } }() @@ -134,45 +138,6 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil // defer clears failure window } -func (r *TrustedCABundleReconciler) clearFailureWindow() { - r.failures.clear() -} - -// handleTransientError records the start of a failure window and degrades the -// controller only after transientDegradedThreshold has elapsed. 
It always -// returns a non-nil error so controller-runtime requeues with exponential backoff. -// Called only from the deferred dispatcher in Reconcile. -func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - // Pass transientDegradedThreshold as the stale-window threshold to detect gaps - // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). - elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold) - if started { - klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) - return ctrl.Result{}, err - } - if elapsed < transientDegradedThreshold { - klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) - return ctrl.Result{}, err - } - klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, err -} - -// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and -// returns nil so controller-runtime does NOT requeue. An existing watch on the -// relevant resource will re-trigger reconciliation when the problem is fixed. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { - klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, nil -} - // addProxyCABundle checks ca bundle referred by Proxy resource and adds it to passed bundle // in case if proxy one is valid. // This function returns added bundle as first value, result as second and an error if it was occurred.