OCPBUGS-42837: Do not set Degraded=True on transient errors #436
Open

nrb wants to merge 3 commits into openshift:main from nrb:OCPBUGS-42837
```diff
@@ -2,18 +2,22 @@ package controllers

 import (
 	"context"
+	"errors"
 	"fmt"
 	"reflect"
+	"sync"
+	"time"

 	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/errors"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/handler"
 	"sigs.k8s.io/controller-runtime/pkg/predicate"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"

 	configv1 "github.com/openshift/api/config/v1"
 	"github.com/openshift/api/features"
```
```diff
@@ -30,6 +34,12 @@ const (
 	// Controller conditions for the Cluster Operator resource
 	cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable"
 	cloudConfigControllerDegradedCondition  = "CloudConfigControllerDegraded"
+
+	// transientDegradedThreshold is how long transient errors must persist before
+	// the controller sets Degraded=True. This prevents brief
+	// API server blips during upgrades from immediately degrading the operator.
+	// Applies to both CloudConfigController and TrustedCAController.
+	transientDegradedThreshold = 2 * time.Minute
 )

 // shouldManageManagedConfigMap returns true if CCCMO should manage the
```
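For a sense of scale (this is not part of the patch, and it assumes controller-runtime's default per-item rate limiter, which backs off exponentially from a 5ms base toward a 1000s cap; the operator may configure a different limiter): the cumulative retry delay after k consecutive failed reconciles is roughly 5ms * (2^k - 1), so a two-minute window absorbs about fifteen back-to-back requeues before Degraded=True is even considered. A quick sketch of that arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

// Back-of-the-envelope check: how many consecutive requeues fit inside the
// 2-minute transientDegradedThreshold, assuming the default per-item
// exponential backoff (5ms base, doubling, capped at 1000s)?
func main() {
	const threshold = 2 * time.Minute
	delay := 5 * time.Millisecond
	elapsed := time.Duration(0)
	for n := 1; ; n++ {
		elapsed += delay
		if elapsed >= threshold {
			fmt.Printf("threshold crossed on requeue #%d (cumulative %s)\n", n, elapsed)
			return
		}
		if delay < 1000*time.Second {
			delay *= 2 // exponential backoff between requeues
		}
	}
}
```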
```diff
@@ -78,34 +88,117 @@ type CloudConfigReconciler struct {
 	ClusterOperatorStatusClient
 	Scheme            *runtime.Scheme
 	FeatureGateAccess featuregates.FeatureGateAccess
+	failures          failureWindow
 }

+// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use.
+type failureWindow struct {
+	mu                      sync.Mutex
+	consecutiveFailureSince *time.Time
+	lastTransientFailureAt  *time.Time
+}
+
+// clear resets the failure window. Call this on every successful reconcile.
+func (fw *failureWindow) clear() {
+	fw.mu.Lock()
+	defer fw.mu.Unlock()
+	fw.consecutiveFailureSince = nil
+	fw.lastTransientFailureAt = nil
+}
+
+// observe records a transient failure at now and returns the elapsed time since
+// the window started plus a boolean indicating whether the window was just opened
+// or restarted. staleAfter controls stale-window detection: if the gap since the
+// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable.
+func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) {
+	fw.mu.Lock()
+	defer fw.mu.Unlock()
+	stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter
+	if fw.consecutiveFailureSince == nil || stale {
+		fw.consecutiveFailureSince = &now
+		fw.lastTransientFailureAt = &now
+		return 0, true
+	}
+	fw.lastTransientFailureAt = &now
+	return now.Sub(*fw.consecutiveFailureSince), false
+}
+
+// handleTransient records a transient failure and degrades only after threshold has elapsed.
+// name labels log messages. staleAfter controls stale-window restart (pass 0 to disable).
+// setDegraded is invoked only when the threshold is exceeded.
+// Always returns a non-nil error so controller-runtime requeues with exponential backoff.
+func (fw *failureWindow) handleTransient(now time.Time, staleAfter, threshold time.Duration, name string, err error, setDegraded func() error) (ctrl.Result, error) {
+	elapsed, started := fw.observe(now, staleAfter)
+	if started {
+		klog.V(4).Infof("%s: transient failure started (%v), will degrade after %s", name, err, threshold)
+		return ctrl.Result{}, err
+	}
+	if elapsed < threshold {
+		klog.V(4).Infof("%s: transient failure ongoing for %s (threshold %s): %v", name, elapsed, threshold, err)
+		return ctrl.Result{}, err
+	}
+	klog.Warningf("%s: transient failure exceeded threshold (%s), setting degraded: %v", name, elapsed, err)
+	if setErr := setDegraded(); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
+	}
+	return ctrl.Result{}, err
+}
+
-func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+// handleTerminal degrades immediately and returns nil so controller-runtime does not requeue.
+// An existing watch on the relevant resource will re-trigger reconciliation when fixed.
+// name labels log messages.
+func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) {
+	klog.Errorf("%s: terminal error, setting degraded: %v", name, err)
+	if setErr := setDegraded(); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
+	}
+	return ctrl.Result{}, nil
+}
+
+func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
 	klog.V(1).Infof("Syncing cloud-conf ConfigMap")

+	// Deferred dispatcher: classifies the returned error and calls the right handler.
+	// Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue.
+	// Transient errors enter the failure window and only degrade after the threshold.
+	// All nil-error paths clear the failure window.
+	defer func() {
+		if retErr == nil {
+			r.failures.clear()
+			return
+		}
+		if errors.Is(retErr, reconcile.TerminalError(nil)) {
+			result, retErr = r.failures.handleTerminal("CloudConfigReconciler", retErr, func() error {
+				return r.setDegradedCondition(ctx)
+			})
+		} else {
+			result, retErr = r.failures.handleTransient(r.Clock.Now(), 0, transientDegradedThreshold, "CloudConfigReconciler", retErr, func() error {
+				return r.setDegradedCondition(ctx)
+			})
+		}
+	}()
```
Review comment (Member) on lines +94 to +179:

> Is this generic enough common logic that it can be defined in a package?
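To make the window semantics concrete, here is a minimal test-style sketch (not part of this PR). It assumes it lives in the same controllers package, since failureWindow is unexported, and exercises observe and clear exactly as documented above:

```go
package controllers

import (
	"testing"
	"time"
)

// Sketch only: exercises the failureWindow semantics described in the patch.
func TestFailureWindowObserve(t *testing.T) {
	var fw failureWindow
	t0 := time.Now()

	// The first failure opens the window.
	elapsed, started := fw.observe(t0, 0)
	if !started || elapsed != 0 {
		t.Fatalf("expected a freshly opened window, got started=%v elapsed=%s", started, elapsed)
	}

	// A failure 3 minutes later reports elapsed time past the 2-minute threshold.
	elapsed, started = fw.observe(t0.Add(3*time.Minute), 0)
	if started || elapsed < transientDegradedThreshold {
		t.Fatalf("expected an ongoing window past the threshold, got started=%v elapsed=%s", started, elapsed)
	}

	// A successful reconcile clears the window; the next failure starts over.
	fw.clear()
	if _, started = fw.observe(t0.Add(4*time.Minute), 0); !started {
		t.Fatal("expected the window to restart after clear()")
	}
}
```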
```diff
 	infra := &configv1.Infrastructure{}
-	if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil {
-		klog.Errorf("infrastructure resource not found")
-		if err := r.setDegradedCondition(ctx); err != nil {
+	if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) {
+		// No cloud platform: mirror the main controller's behaviour of returning Available.
+		klog.Infof("Infrastructure cluster does not exist. Skipping...")
+		if err := r.setAvailableCondition(ctx); err != nil {
 			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
 		}
-		return ctrl.Result{}, err
+		return ctrl.Result{}, nil
+	} else if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err)
 	}

 	network := &configv1.Network{}
 	if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil {
-		if err := r.setDegradedCondition(ctx); err != nil {
-			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err)
-		}
-		return ctrl.Result{}, err
+		return ctrl.Result{}, fmt.Errorf("failed to get cluster Network: %w", err)
 	}

 	syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
 	if err != nil {
-		if err := r.setDegradedCondition(ctx); err != nil {
-			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-		}
-		return ctrl.Result{}, err
+		// nil platformStatus is a terminal misconfiguration.
+		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to check cloud config sync requirements: %w", err))
 	}
 	if !syncNeeded {
 		if err := r.setAvailableCondition(ctx); err != nil {
```
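The classification in the deferred dispatcher relies on controller-runtime's terminal-error support rather than anything bespoke. A small standalone illustration of the wrap-and-detect idiom (the error messages are hypothetical):

```go
package main

import (
	"errors"
	"fmt"

	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func main() {
	// A permanent misconfiguration, wrapped the same way the patch wraps it.
	terminal := reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: unsupported platform"))
	// A transient failure, returned unwrapped.
	transient := fmt.Errorf("failed to get cluster Network: connection refused")

	// errors.Is against reconcile.TerminalError(nil) is how the deferred
	// dispatcher tells the two classes apart.
	fmt.Println(errors.Is(terminal, reconcile.TerminalError(nil)))  // true
	fmt.Println(errors.Is(transient, reconcile.TerminalError(nil))) // false
}
```

Because handleTerminal returns a nil error, a terminal failure is never requeued by the workqueue; recovery depends on one of the controller's existing watches firing when the offending resource changes.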
```diff
@@ -135,11 +228,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

 	cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus)
 	if err != nil {
-		klog.Errorf("unable to get cloud config transformer function; unsupported platform")
-		if err := r.setDegradedCondition(ctx); err != nil {
-			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-		}
-		return ctrl.Result{}, err
+		// Unsupported platform won't change without a cluster reconfigure.
+		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: %w", err))
 	}

 	platformType := infra.Status.PlatformStatus.Type

@@ -161,14 +252,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		}
 		if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil {
 			managedConfigFound = true
-		} else if errors.IsNotFound(err) {
+		} else if apierrors.IsNotFound(err) {
 			klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
-		} else if err != nil {
-			klog.Errorf("unable to get managed cloud-config for sync")
-			if err := r.setDegradedCondition(ctx); err != nil {
-				return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-			}
-			return ctrl.Result{}, err
+		} else {
+			return ctrl.Result{}, fmt.Errorf("failed to get managed cloud config: %w", err)
 		}
 	}

@@ -179,7 +266,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			Name:      infra.Spec.CloudConfig.Name,
 			Namespace: OpenshiftConfigNamespace,
 		}
-		if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
+		if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) {
 			klog.Warningf("cloud-config not found in either openshift-config-managed or openshift-config namespace")
 			// For platforms we manage, create a minimal valid config that will be populated by the transformer
 			if shouldManageManagedConfigMap(platformType, features) {

@@ -200,23 +287,30 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

 	sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
 	if err != nil {
-		if err := r.setDegradedCondition(ctx); err != nil {
-			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-		}
-		return ctrl.Result{}, err
+		// User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
+		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to prepare source cloud config: %w", err))
 	}

 	// Apply transformer if needed
+	if r.FeatureGateAccess == nil {
+		// Operator misconfiguration at startup: Permanent.
+		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured"))
+	}
+
+	features, err = r.FeatureGateAccess.CurrentFeatureGates()
+	if err != nil {
+		// The feature-gate informer may not have synced yet: transient.
+		klog.Errorf("unable to get feature gates: %v", err)
+		return ctrl.Result{}, fmt.Errorf("failed to get feature gates: %w", err)
+	}
 	if cloudConfigTransformerFn != nil {
 		// We ignore stuff in sourceCM.BinaryData. This isn't allowed to
 		// contain any key that overlaps with those found in sourceCM.Data and
 		// we're not expecting users to put their data in the former.
 		output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
 		if err != nil {
-			if err := r.setDegradedCondition(ctx); err != nil {
-				return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-			}
-			return ctrl.Result{}, err
+			// Platform-specific transform failed on the current config data: terminal.
+			return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to transform cloud config: %w", err))
 		}
 		sourceCM.Data[defaultConfigKey] = output
 	}

@@ -229,17 +323,14 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			if err := r.setDegradedCondition(ctx); err != nil {
 				return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
 			}
-			return ctrl.Result{}, err
+			return ctrl.Result{}, err // transient
 		}
 	}

 	// Sync the transformed config to the target configmap for CCM consumption
 	if err := r.syncCloudConfigData(ctx, sourceCM); err != nil {
-		klog.Errorf("unable to sync cloud config")
-		if err := r.setDegradedCondition(ctx); err != nil {
-			return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
-		}
-		return ctrl.Result{}, err
+		return ctrl.Result{}, fmt.Errorf("failed to sync cloud config: %w", err)
 	}

 	if err := r.setAvailableCondition(ctx); err != nil {

@@ -335,7 +426,7 @@ func (r *CloudConfigReconciler) syncConfigMapToTarget(ctx context.Context, sourc

 	// Check if target exists
 	err := r.Get(ctx, targetKey, targetCM)
-	if err != nil && !errors.IsNotFound(err) {
+	if err != nil && !apierrors.IsNotFound(err) {
 		return fmt.Errorf("failed to get target configmap %s/%s: %w", targetNamespace, targetName, err)
 	}
```
Review thread on the transientDegradedThreshold value:

> How did we come up with this value? Have we tested it in upgrades and checked that it is sufficient?

Reply:

> I haven't tested this across upgrades yet, but I will. The value was taken from a couple of other repos doing similar readiness work. I'll run a few upgrades and see whether the window causes issues and needs to be adjusted.

Follow-up:

> TY!