From faa2cd446cf4d3c3b296e44b0344952df30367e0 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Mon, 27 Apr 2026 17:41:23 -0400 Subject: [PATCH 1/3] Add transient error handling across all three controllers All three reconcilers (CloudConfigReconciler, TrustedCABundleReconciler, CloudOperatorReconciler) now distinguish between transient errors (API blips) and permanent errors (corrupt config, unsupported platform): - Transient errors are silently requeued and only set Degraded=True after a threshold has elapsed (2m for sub-controllers, 2m30s for the main controller). lastTransientFailureAt is tracked so a stale failure window is restarted rather than causing immediate degradation when errors resume after a long gap. - Permanent errors set Degraded=True immediately and return nil so controller-runtime does not requeue; existing watches re-trigger reconciliation when the underlying data changes. - Infrastructure NotFound returns Available (nil) rather than Degraded, matching the main controller's existing behaviour. Reconcile methods are refactored to use a deferred dispatcher pattern with permanent()/isPermanent() helpers. Each error site returns ctrl.Result{}, err (transient) or ctrl.Result{}, permanent(err) (permanent); a single deferred function routes to the correct handler and ensures clearFailureWindow() is called on every nil-error return path. Additional fixes and improvements: - Use controller-runtime's TerminalError instead of a local type and errors.Is instead of type casting; consolidate failure-tracking fields into their own struct. - Fix race in cloud config sync test by using a separate reconciler instance for direct Reconcile() calls rather than sharing the manager's instance across goroutines. - Fix aggregatedTransientDegradedThreshold mismatch in the main controller test that prevented the degraded condition from ever being set. 
- Document known limitations identified in QC review (status-write failures bypassing the transient threshold, redundant setStatusDegraded call in provisioningAllowed path). - Use specific Gomega matchers or provide failure descriptions per project conventions. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 174 ++++++++++++---- .../cloud_config_sync_controller_test.go | 82 ++++++-- pkg/controllers/clusteroperator_controller.go | 115 ++++++++--- .../clusteroperator_controller_test.go | 152 +++++++++++--- .../trusted_ca_bundle_controller.go | 104 +++++++--- .../trusted_ca_bundle_controller_test.go | 187 ++++++++++++++++-- 6 files changed, 656 insertions(+), 158 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index a3c694c3a..0d193ad51 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -2,11 +2,14 @@ package controllers import ( "context" + "errors" "fmt" "reflect" + "sync" + "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -14,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/api/features" @@ -30,6 +34,12 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. 
This prevents brief + // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. + transientDegradedThreshold = 2 * time.Minute ) // shouldManageManagedConfigMap returns true if CCCMO should manage the @@ -78,34 +88,81 @@ type CloudConfigReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme FeatureGateAccess featuregates.FeatureGateAccess + failures failureWindow +} + +// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use. +type failureWindow struct { + mu sync.Mutex + consecutiveFailureSince *time.Time + lastTransientFailureAt *time.Time +} + +// clear resets the failure window. Call this on every successful reconcile. +func (fw *failureWindow) clear() { + fw.mu.Lock() + defer fw.mu.Unlock() + fw.consecutiveFailureSince = nil + fw.lastTransientFailureAt = nil +} + +// observe records a transient failure at now and returns the elapsed time since +// the window started plus a boolean indicating whether the window was just opened +// or restarted. staleAfter controls stale-window detection: if the gap since the +// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable. 
+func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) { + fw.mu.Lock() + defer fw.mu.Unlock() + stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter + if fw.consecutiveFailureSince == nil || stale { + fw.consecutiveFailureSince = &now + fw.lastTransientFailureAt = &now + return 0, true + } + fw.lastTransientFailureAt = &now + return now.Sub(*fw.consecutiveFailureSince), false } -func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleTerminalError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. 
Skipping...") + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, nil + } else if err != nil { + return ctrl.Result{}, err // transient } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil platformStatus is a terminal misconfiguration. + return ctrl.Result{}, reconcile.TerminalError(err) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -135,11 +192,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. 
klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) } platformType := infra.Status.PlatformStatus.Type @@ -161,14 +216,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil { managedConfigFound = true - } else if errors.IsNotFound(err) { + } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return ctrl.Result{}, err // transient } } @@ -179,7 +230,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) Name: infra.Spec.CloudConfig.Name, Namespace: OpenshiftConfigNamespace, } - if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { + if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) { klog.Warningf("cloud-config not found in either openshift-config-managed or openshift-config namespace") // For platforms we manage, create a minimal valid config that will be populated by the transformer if shouldManageManagedConfigMap(platformType, features) { @@ -200,23 +251,30 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config 
controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes. + return ctrl.Result{}, reconcile.TerminalError(err) } // Apply transformer if needed + if r.FeatureGateAccess == nil { + // Operator misconfiguration at startup: ermanent. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured")) + } + + features, err = r.FeatureGateAccess.CurrentFeatureGates() + if err != nil { + // The feature-gate informer may not have synced yet: transient. + klog.Errorf("unable to get feature gates: %v", err) + return ctrl.Result{}, err // transient + } if cloudConfigTransformerFn != nil { // We ignore stuff in sourceCM.BinaryData. This isn't allowed to // contain any key that overlaps with those found in sourceCM.Data and // we're not expecting users to put their data in the former. output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(err) } sourceCM.Data[defaultConfigKey] = output } @@ -229,17 +287,14 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.setDegradedCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } } // Sync the transformed config to the target configmap for CCM consumption if err := r.syncCloudConfigData(ctx, sourceCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } if err := r.setAvailableCondition(ctx); err != nil { @@ -249,6 +304,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. Called by the deferred +// dispatcher in Reconcile on every successful (nil-error) return path. +func (r *CloudConfigReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleTerminalError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") @@ -335,7 +429,7 @@ func (r *CloudConfigReconciler) syncConfigMapToTarget(ctx context.Context, sourc // Check if target exists err := r.Get(ctx, targetKey, targetCM) - if err != nil && !errors.IsNotFound(err) { + if err != nil && !apierrors.IsNotFound(err) { return fmt.Errorf("failed to get target configmap %s/%s: %w", targetNamespace, targetName, err) } diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 85cd2d189..ba217895f 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -141,7 +141,7 @@ var _ = Describe("isCloudConfigEqual reconciler method", func() { } It("should return 'true' if ConfigMaps content are equal", func() { - Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue()) + Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue(), "configmaps with identical content should be considered equal") }) It("should return 'false' if ConfigMaps content are not equal", func() { @@ -164,8 +164,7 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { managedCloudConfig := 
makeManagedCloudConfig(configv1.AzurePlatformType) It("not prepared config should be different with managed one", func() { - _, ok := infraCloudConfig.Data[infraCloudConfKey] - Expect(ok).Should(BeTrue()) + Expect(infraCloudConfig.Data).To(HaveKey(infraCloudConfKey)) Expect(reconciler.isCloudConfigEqual(infraCloudConfig, managedCloudConfig)).Should(BeFalse()) }) @@ -174,9 +173,8 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { Expect(err).Should(Succeed()) _, ok := preparedConfig.Data[infraCloudConfKey] Expect(ok).Should(BeFalse()) - _, ok = preparedConfig.Data[defaultConfigKey] - Expect(ok).Should(BeTrue()) - Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue()) + Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey)) + Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue(), "prepared config should have content equal to the managed cloud config") }) It("config preparation should not touch extra fields in infra ConfigMap", func() { @@ -184,8 +182,7 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() { extendedInfraConfig.Data = map[string]string{infraCloudConfKey: "{}", "{}": "{}"} preparedConfig, err := reconciler.prepareSourceConfigMap(extendedInfraConfig, infra) Expect(err).Should(Succeed()) - _, ok := preparedConfig.Data[defaultConfigKey] - Expect(ok).Should(BeTrue()) + Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey)) Expect(len(preparedConfig.Data)).Should(BeEquivalentTo(2)) }) }) @@ -370,8 +367,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. 
+ // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. + freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) @@ -531,7 +546,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -540,8 +555,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if 
co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) It("should continue with reconcile when feature gates are available", func() { @@ -606,16 +632,40 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + 
break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index da94230bf..afa2c3d72 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,22 +19,25 @@ package controllers import ( "context" "crypto/tls" + "errors" "fmt" "sort" "strings" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/library-go/pkg/cloudprovider" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" "github.com/openshift/cluster-cloud-controller-manager-operator/pkg/cloud" @@ -48,6 +51,13 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. 
+ aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object @@ -58,6 +68,7 @@ type CloudOperatorReconciler struct { ImagesFile string FeatureGateAccess featuregates.FeatureGateAccess TLSConfig func(*tls.Config) + failures failureWindow } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -66,65 +77,68 @@ type CloudOperatorReconciler struct { // +kubebuilder:rbac:groups=config.openshift.io,resources=infrastructures,verbs=get;list;watch // Reconcile will process the cloud-controller-manager clusterOperator -func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (result ctrl.Result, retErr error) { conditionOverrides := []configv1.ClusterOperatorStatusCondition{} + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with permanent()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr) + } else { + result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, err } - - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } + // Known limitation: when provisioningAllowed internally calls setStatusDegraded + // (e.g. a sub-controller has Degraded=True, or IsCloudProviderExternal errors), + // it returns a non-nil error. Reconcile passes that error to handleTransientError, + // which starts the 2m30s window. After the threshold, handleTransientError calls + // setStatusDegraded again — redundant but harmless, since status is already degraded. + // This is a consequence of keeping status-setting inside provisioningAllowed rather + // than pushing it into Reconcile. 
allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return ctrl.Result{}, err // transient; status already set inside provisioningAllowed } else if !allowedToProvision { - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } clusterProxy := &configv1.Proxy{} - if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return 
ctrl.Result{}, err // transient } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -137,7 +151,44 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger } func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..43f5c8c53 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -53,12 +54,15 @@ var _ = Describe("Cluster Operator status controller", func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), operator) - return err == nil || apierrors.IsNotFound(err) - }).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) }) type testCase struct { @@ -89,14 +93,17 @@ var _ = Describe("Cluster Operator status controller", func() { Expect(err).To(Succeed()) getOp := &configv1.ClusterOperator{} - Eventually(func() (bool, error) { + Eventually(func() error { err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, getOp) if err != nil { - return false, err + return err } // Successful sync means CO exists and the status is not empty - return getOp != nil && len(getOp.Status.Versions) > 0, nil - }, timeout).Should(BeTrue()) + if getOp == nil || len(getOp.Status.Versions) == 0 { + return fmt.Errorf("ClusterOperator status versions not yet populated") + } + return nil + }, timeout).Should(Succeed()) // check version. Expect(getOp.Status.Versions).To(HaveLen(1)) @@ -104,13 +111,13 @@ var _ = Describe("Cluster Operator status controller", func() { Expect(getOp.Status.Versions[0].Version).To(Equal(expectedVersion)) // check conditions. 
- Expect(v1helpers.IsStatusConditionTrue(getOp.Status.Conditions, configv1.OperatorAvailable)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorAvailable).Status).To(Equal(configv1.ConditionTrue)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorAvailable).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionTrue(getOp.Status.Conditions, configv1.OperatorUpgradeable)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorUpgradeable).Status).To(Equal(configv1.ConditionTrue)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorUpgradeable).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionFalse(getOp.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionFalse)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorDegraded).Reason).To(Equal(ReasonAsExpected)) - Expect(v1helpers.IsStatusConditionFalse(getOp.Status.Conditions, configv1.OperatorProgressing)).To(BeTrue()) + Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorProgressing).Status).To(Equal(configv1.ConditionFalse)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, configv1.OperatorProgressing).Reason).To(Equal(ReasonAsExpected)) Expect(v1helpers.FindStatusCondition(getOp.Status.Conditions, cloudControllerOwnershipCondition)).To(BeNil()) @@ -292,7 +299,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") // two resources should report successful update, deployment and pdb 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) @@ -315,14 +322,14 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) dep.Spec.Replicas = ptr.To[int32](20) updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // No update as resource didn't change @@ -353,7 +360,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") // three resources should report successful update, deployment, pdb and service Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) @@ -382,7 +389,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually changing the port number @@ -415,7 +422,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the port has been reverted back and there is only one item in the list @@ -442,7 +449,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually adding another port @@ -474,7 +481,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the port list has been reverted back and there is only one item in the list @@ -505,7 +512,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") 
Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Manually inserting a new label @@ -531,7 +538,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the new label is still there @@ -558,7 +565,7 @@ var _ = Describe("Apply resources should", func() { updated, err := reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully created"))) // Now the deployment has just one label "k8s-app: aws-cloud-controller-manager" @@ -587,7 +594,7 @@ var _ = Describe("Apply resources should", func() { updated, err = reconciler.applyResources(context.TODO(), resources) Expect(err).ShouldNot(HaveOccurred()) - Expect(updated).To(BeTrue()) + Expect(updated).To(BeTrue(), "expected applyResources to report that at least one resource was created or updated") Eventually(recorder.Events).Should(Receive(ContainSubstring("Resource was successfully updated"))) // Checking that the label value has been reverted and there is only one item in the map @@ -601,20 +608,103 @@ var _ = Describe("Apply resources should", func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), co) - return err == nil || 
apierrors.IsNotFound(err) - }, timeout).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }, timeout).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) for _, operand := range resources { Expect(cl.Delete(context.Background(), operand)).To(Succeed()) - Eventually(func() bool { - return apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKeyFromObject(operand), operand)) - }, timeout).Should(BeTrue()) + Eventually(func() error { + err := cl.Get(context.Background(), client.ObjectKeyFromObject(operand), operand) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected operand %s to be deleted", operand.GetName()) + }, timeout).Should(Succeed()) + } + }) + +}) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() error { + err := cl.Delete(ctx, co) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) + } + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: 
scheme.Scheme, } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionTrue)) }) + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionTrue)) + }) }) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..97ca33baa 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/x509" + "errors" "fmt" "os" @@ -43,6 +44,7 @@ type TrustedCABundleReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme trustBundlePath string + failures failureWindow } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. @@ -50,9 +52,32 @@ func isSpecTrustedCASet(proxyConfig *configv1.ProxySpec) bool { return len(proxyConfig.TrustedCA.Name) > 0 } -func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("%s emitted event, syncing %s ConfigMap", req, trustedCAConfigMapName) + // partialRun is set to true on the early-exit path where the event is for + // an unrelated ConfigMap. That path returns available=true but should NOT + // reset an ongoing transient failure window from a previous full reconcile. + partialRun := false + + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with terminal()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // Nil-error paths clear the failure window unless partialRun is set. 
+ defer func() { + if retErr == nil { + if !partialRun { + r.clearFailureWindow() + } + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + proxyConfig := &configv1.Proxy{} if err := r.Get(ctx, types.NamespacedName{Name: proxyResourceName}, proxyConfig); err != nil { if apierrors.IsNotFound(err) { @@ -62,62 +87,89 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - return reconcile.Result{}, nil + return reconcile.Result{}, nil // defer clears failure window } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. - // If not, return early + // If not, return early without resetting the failure window (partialRun=true). 
if req.Namespace == OpenshiftConfigNamespace && proxyConfig.Spec.TrustedCA.Name != req.Name { + partialRun = true + klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - - klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient } if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + // Pass transientDegradedThreshold as the stale-window threshold to detect gaps + // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). 
+ elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold) + if started { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } return ctrl.Result{}, nil } diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..8ad46dc6c 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -164,27 +165,26 @@ var _ = Describe("Trusted CA bundle sync controller", func() { co := &v1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() bool { + Eventually(func() error { err := cl.Delete(context.Background(), co) - return err == nil || apierrors.IsNotFound(err) - }).Should(BeTrue()) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) } - Eventually(apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) if proxyResource != nil { Expect(cl.Delete(ctx, proxyResource, deleteOptions)).To(Succeed()) - Eventually( - apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(proxyResource), &v1.Proxy{})), - ).Should(BeTrue()) + Expect(cl.Get(ctx, 
client.ObjectKeyFromObject(proxyResource), &v1.Proxy{})).To(MatchError(apierrors.IsNotFound, "Proxy should have been deleted")) } allCMs := &corev1.ConfigMapList{} Expect(cl.List(ctx, allCMs)).To(Succeed()) for _, cm := range allCMs.Items { Expect(cl.Delete(ctx, cm.DeepCopy(), deleteOptions)).To(Succeed()) - Eventually( - apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(cm.DeepCopy()), &corev1.ConfigMap{})), - ).Should(BeTrue()) + Expect(cl.Get(ctx, client.ObjectKeyFromObject(cm.DeepCopy()), &corev1.ConfigMap{})).To(MatchError(apierrors.IsNotFound, "ConfigMap should have been deleted")) } proxyResource = nil @@ -294,9 +294,16 @@ var _ = Describe("Trusted CA bundle sync controller", func() { It("merged bundle should be generated without cloud-config at all", func() { Expect(cl.Delete(ctx, syncedCloudConfigConfigMap)).To(Succeed()) - Eventually(func() bool { - return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKeyFromObject(syncedCloudConfigConfigMap), &corev1.ConfigMap{})) - }).Should(BeTrue()) + Eventually(func() error { + err := cl.Get(ctx, client.ObjectKeyFromObject(syncedCloudConfigConfigMap), &corev1.ConfigMap{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected synced cloud config ConfigMap to be deleted") + }).Should(Succeed()) syncedCloudConfigConfigMap = nil mergedTrustedCA := &corev1.ConfigMap{} @@ -309,6 +316,160 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() error { + err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected 
ClusterOperator to be deleted") + }).Should(Succeed()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("stale failure window should be restarted when gap since last error exceeds threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the reconcile progresses to getSystemTrustBundle. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // Step 1: First transient error; failure window opens at T0. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + + // Step 2: Advance clock past the threshold — simulates a gap with no reconciles + // (e.g., system recovered, no events fired for a long time). 
+ fakeClock.Step(transientDegradedThreshold + time.Second) + + // Step 3: New transient error arrives. The stale-window logic should detect that + // lastTransientFailureAt is more than the threshold ago and restart the window + // from 'now', NOT degrade immediately. + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), + "should not degrade immediately after a gap — window should restart") + } + } + } + // consecutiveFailureSince should now be 'now', not the original T0. + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + Expect(fakeClock.Now().Sub(*reconciler.failures.consecutiveFailureSince)).To(BeNumerically("<", time.Second), + "window should have been restarted to ~now, not retained from original T0") + }) + + It("should not degrade on transient error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile at T0: transient failure starts; error is returned but no degraded condition set. 
+ _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock to mid-window (half the threshold) and reconcile again to simulate + // continuous failures. This updates lastTransientFailureAt, keeping it fresh. + fakeClock.Step(transientDegradedThreshold / 2) + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock so total elapsed from T0 exceeds the threshold, but the gap since the + // most recent failure (lastTransientFailureAt) is less than the threshold. This ensures + // the degradation path is taken rather than the stale-window restart path. + fakeClock.Step(transientDegradedThreshold/2 + time.Second) + + // Final reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{ From e8ad7903e99b89f0777a8a54e16697e4fcf888e8 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Tue, 28 Apr 2026 12:57:55 -0400 Subject: [PATCH 2/3] Maintain error context Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 24 +++++++++---------- pkg/controllers/clusteroperator_controller.go | 20 ++++++++-------- .../trusted_ca_bundle_controller.go | 14 +++++------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 0d193ad51..2425e553a 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -151,18 +151,18 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } return ctrl.Result{}, nil } else if err != nil { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err) } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get cluster Network: %w", err) } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { // nil 
platformStatus is a terminal misconfiguration. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to check cloud config sync requirements: %w", err)) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -194,7 +194,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: %w", err)) } platformType := infra.Status.PlatformStatus.Type @@ -219,7 +219,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") } else { - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get managed cloud config: %w", err) } } @@ -252,12 +252,12 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to prepare source cloud config: %w", err)) } // Apply transformer if needed if r.FeatureGateAccess == nil { - // Operator misconfiguration at startup: ermanent. + // Operator misconfiguration at startup: Permanent. 
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured")) } @@ -265,7 +265,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get feature gates: %w", err) } if cloudConfigTransformerFn != nil { // We ignore stuff in sourceCM.BinaryData. This isn't allowed to @@ -274,7 +274,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { // Platform-specific transform failed on the current config data: terminal. - return ctrl.Result{}, reconcile.TerminalError(err) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to transform cloud config: %w", err)) } sourceCM.Data[defaultConfigKey] = output } @@ -294,7 +294,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Sync the transformed config to the target configmap for CCM consumption if err := r.syncCloudConfigData(ctx, sourceCM); err != nil { klog.Errorf("unable to sync cloud config") - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to sync cloud config: %w", err) } if err := r.setAvailableCondition(ctx); err != nil { @@ -326,7 +326,7 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er } klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, err } @@ -338,7 +338,7 @@ func 
(r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, nil } diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index afa2c3d72..94f95d1ff 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -101,12 +101,12 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) klog.Infof("Infrastructure cluster does not exist. Skipping...") if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) } return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err) } // Known limitation: when provisioningAllowed internally calls setStatusDegraded @@ -119,7 +119,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err // transient; status already set inside provisioningAllowed + return ctrl.Result{}, fmt.Errorf("failed to check if provisioning is allowed: %w", err) } else if 
!allowedToProvision { return ctrl.Result{}, nil // defer clears failure window } @@ -127,28 +127,28 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to get Proxy: %w", err) } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to build operator config: %w", err)) } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - return ctrl.Result{}, err // transient + return ctrl.Result{}, fmt.Errorf("failed to sync operands: %w", err) } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) } if err := r.clearCloudControllerOwnerCondition(ctx); err != nil { klog.Errorf("Unable to clear CloudControllerOwner condition: %s", err) - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("failed to clear CloudControllerOwner condition: %w", err) } return ctrl.Result{}, nil // defer clears failure window @@ -174,7 +174,7 @@ func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, cond } klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); 
setErr != nil { - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", setErr) } return ctrl.Result{}, err } @@ -186,7 +186,7 @@ func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, cond func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", setErr) } return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger } diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 97ca33baa..34632bd6b 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -90,7 +90,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return reconcile.Result{}, nil // defer clears failure window } // Non-NotFound: transient API error. - return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %w", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. @@ -107,24 +107,24 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { // Node cert store may be updating during upgrade: transient. 
- return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %w", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { // Combined cert bundle is corrupt: terminal. - return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %w", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { // Combined cert bundle is corrupt: terminal. - return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %w", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %w", err) // transient } if err := r.setAvailableCondition(ctx); err != nil { @@ -156,7 +156,7 @@ func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, er } klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, err } @@ -168,7 +168,7 @@ func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, er func (r 
*TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) } return ctrl.Result{}, nil } From 05b8726d7c36bf46e8dfdcedd324397e42805cf0 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Mon, 4 May 2026 12:28:21 -0400 Subject: [PATCH 3/3] Consolidate failure handling for controller Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 81 +++++++++---------- .../trusted_ca_bundle_controller.go | 49 ++--------- 2 files changed, 46 insertions(+), 84 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 2425e553a..bced4fc37 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -123,6 +123,38 @@ func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elaps return now.Sub(*fw.consecutiveFailureSince), false } +// handleTransient records a transient failure and degrades only after threshold has elapsed. +// name labels log messages. staleAfter controls stale-window restart (pass 0 to disable). +// setDegraded is invoked only when the threshold is exceeded. +// Always returns a non-nil error so controller-runtime requeues with exponential backoff. 
+func (fw *failureWindow) handleTransient(now time.Time, staleAfter, threshold time.Duration, name string, err error, setDegraded func() error) (ctrl.Result, error) { + elapsed, started := fw.observe(now, staleAfter) + if started { + klog.V(4).Infof("%s: transient failure started (%v), will degrade after %s", name, err, threshold) + return ctrl.Result{}, err + } + if elapsed < threshold { + klog.V(4).Infof("%s: transient failure ongoing for %s (threshold %s): %v", name, elapsed, threshold, err) + return ctrl.Result{}, err + } + klog.Warningf("%s: transient failure exceeded threshold (%s), setting degraded: %v", name, elapsed, err) + if setErr := setDegraded(); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) + } + return ctrl.Result{}, err +} + +// handleTerminal degrades immediately and returns nil so controller-runtime does not requeue. +// An existing watch on the relevant resource will re-trigger reconciliation when fixed. +// name labels log messages. +func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) { + klog.Errorf("%s: terminal error, setting degraded: %v", name, err) + if setErr := setDegraded(); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") @@ -132,13 +164,17 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // All nil-error paths clear the failure window. 
defer func() { if retErr == nil { - r.clearFailureWindow() + r.failures.clear() return } if errors.Is(retErr, reconcile.TerminalError(nil)) { - result, retErr = r.handleTerminalError(ctx, retErr) + result, retErr = r.failures.handleTerminal("CloudConfigReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } else { - result, retErr = r.handleTransientError(ctx, retErr) + result, retErr = r.failures.handleTransient(r.Clock.Now(), 0, transientDegradedThreshold, "CloudConfigReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } }() @@ -304,45 +340,6 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } -// clearFailureWindow resets the transient-error tracking. Called by the deferred -// dispatcher in Reconcile on every successful (nil-error) return path. -func (r *CloudConfigReconciler) clearFailureWindow() { - r.failures.clear() -} - -// handleTransientError records the start of a failure window and degrades the -// controller only after transientDegradedThreshold has elapsed. It always -// returns a non-nil error so controller-runtime requeues with exponential backoff. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - elapsed, started := r.failures.observe(r.Clock.Now(), 0) - if started { - klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) - return ctrl.Result{}, err - } - if elapsed < transientDegradedThreshold { - klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) - return ctrl.Result{}, err - } - klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, err -} - -// handleTerminalError sets CloudConfigControllerDegraded=True immediately and -// returns nil so controller-runtime does NOT requeue. An existing watch on the -// relevant resource will re-trigger reconciliation when the problem is fixed. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { - klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, nil -} - func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 34632bd6b..1a5e01050 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -67,14 +67,18 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ defer func() { if retErr == nil { if !partialRun { - r.clearFailureWindow() + r.failures.clear() } return } if errors.Is(retErr, reconcile.TerminalError(nil)) { - result, retErr = r.handleDegradeError(ctx, retErr) + result, retErr = r.failures.handleTerminal("TrustedCABundleReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } else { - result, retErr = r.handleTransientError(ctx, retErr) + result, retErr = r.failures.handleTransient(r.Clock.Now(), transientDegradedThreshold, transientDegradedThreshold, "TrustedCABundleReconciler", retErr, func() error { + return r.setDegradedCondition(ctx) + }) } }() @@ -134,45 +138,6 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil // defer clears failure window } -func (r *TrustedCABundleReconciler) clearFailureWindow() { - r.failures.clear() -} - -// handleTransientError records the start of a failure window and degrades the -// controller only after transientDegradedThreshold has elapsed. 
It always -// returns a non-nil error so controller-runtime requeues with exponential backoff. -// Called only from the deferred dispatcher in Reconcile. -func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - // Pass transientDegradedThreshold as the stale-window threshold to detect gaps - // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). - elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold) - if started { - klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) - return ctrl.Result{}, err - } - if elapsed < transientDegradedThreshold { - klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) - return ctrl.Result{}, err - } - klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, err -} - -// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and -// returns nil so controller-runtime does NOT requeue. An existing watch on the -// relevant resource will re-trigger reconciliation when the problem is fixed. -// Called only from the deferred dispatcher in Reconcile. 
-func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { - klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) - if setErr := r.setDegradedCondition(ctx); setErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr) - } - return ctrl.Result{}, nil -} - // addProxyCABundle checks ca bundle referred by Proxy resource and adds it to passed bundle // in case if proxy one is valid. // This function returns added bundle as first value, result as second and an error if it was occurred.