Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 131 additions & 40 deletions pkg/controllers/cloud_config_sync_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,22 @@ package controllers

import (
"context"
"errors"
"fmt"
"reflect"
"sync"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/api/features"
Expand All @@ -30,6 +34,12 @@ const (
// Controller conditions for the Cluster Operator resource
cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable"
cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"

// transientDegradedThreshold is how long transient errors must persist before
// the controller sets Degraded=True. This prevents brief
// API server blips during upgrades from immediately degrading the operator.
// Applies to both CloudConfigController and TrustedCAController.
transientDegradedThreshold = 2 * time.Minute
Comment on lines +38 to +42
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did we come up with this value? Have we tested it in upgrades and checked if it is sufficient?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't tested this across upgrades, but I will.

This was taken from a couple of other repos doing similar readiness work. I'll run a few upgrades and see whether the window causes issues and the value needs to be adjusted.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TY!

)

// shouldManageManagedConfigMap returns true if CCCMO should manage the
Expand Down Expand Up @@ -78,34 +88,117 @@ type CloudConfigReconciler struct {
ClusterOperatorStatusClient
Scheme *runtime.Scheme
FeatureGateAccess featuregates.FeatureGateAccess
failures failureWindow
}

// failureWindow tracks a run of consecutive transient failures so callers can
// decide when a persistent problem has lasted long enough to be reported as
// Degraded. All methods are safe for concurrent use.
type failureWindow struct {
	mu sync.Mutex
	// consecutiveFailureSince is when the current failure window opened, or
	// nil when no window is open (the last reconcile succeeded or clear was
	// called). Set and cleared together with lastTransientFailureAt.
	consecutiveFailureSince *time.Time
	// lastTransientFailureAt is when the most recent failure was observed;
	// observe uses it to detect stale windows (long gaps between failures).
	lastTransientFailureAt *time.Time
}

// clear discards any open failure window so the next transient failure starts
// a fresh one. Call this on every successful reconcile.
func (fw *failureWindow) clear() {
	fw.mu.Lock()
	fw.consecutiveFailureSince, fw.lastTransientFailureAt = nil, nil
	fw.mu.Unlock()
}

// observe records a transient failure seen at now. It returns how long the
// current window has been open, plus a boolean that is true when this
// observation opened (or restarted) the window. When staleAfter > 0 and the
// gap since the previous failure exceeds staleAfter, the old window is
// treated as stale and a new one is started. Pass staleAfter == 0 to disable
// stale-window detection.
func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) {
	fw.mu.Lock()
	defer fw.mu.Unlock()

	// The two timestamps are always set and cleared together, so an open
	// window implies lastTransientFailureAt is non-nil.
	windowOpen := fw.consecutiveFailureSince != nil
	if windowOpen && staleAfter > 0 && now.Sub(*fw.lastTransientFailureAt) > staleAfter {
		windowOpen = false
	}

	fw.lastTransientFailureAt = &now
	if !windowOpen {
		fw.consecutiveFailureSince = &now
		return 0, true
	}
	return now.Sub(*fw.consecutiveFailureSince), false
}

// handleTransient records a transient failure and sets the Degraded condition
// only once the failure window has been open for at least threshold. This
// keeps brief API blips (e.g. during upgrades) from degrading the operator.
//
// name labels log messages. staleAfter controls stale-window restart (pass 0
// to disable). setDegraded is invoked only when the threshold is exceeded.
//
// It always returns a non-nil error so controller-runtime requeues the
// request with exponential backoff.
func (fw *failureWindow) handleTransient(now time.Time, staleAfter, threshold time.Duration, name string, err error, setDegraded func() error) (ctrl.Result, error) {
	elapsed, started := fw.observe(now, staleAfter)
	if started {
		klog.V(4).Infof("%s: transient failure started (%v), will degrade after %s", name, err, threshold)
		return ctrl.Result{}, err
	}
	if elapsed < threshold {
		klog.V(4).Infof("%s: transient failure ongoing for %s (threshold %s): %v", name, elapsed, threshold, err)
		return ctrl.Result{}, err
	}
	klog.Warningf("%s: transient failure exceeded threshold (%s), setting degraded: %v", name, elapsed, err)
	if setErr := setDegraded(); setErr != nil {
		// Keep the original reconcile error visible alongside the condition
		// update failure so the root cause is not lost from the returned error.
		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w (while handling transient error: %v)", setErr, err)
	}
	return ctrl.Result{}, err
}

func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
// handleTerminal sets the Degraded condition immediately and returns a nil
// error so controller-runtime does not requeue: a terminal error will not be
// fixed by retrying, and an existing watch on the relevant resource will
// re-trigger reconciliation once the underlying object changes.
//
// name labels log messages. The failureWindow receiver is not read or
// written here; the method hangs off failureWindow so callers dispatch
// terminal and transient errors through one surface.
// NOTE(review): a terminal error leaves any open transient window untouched —
// confirm that is intended, since a later transient failure would inherit the
// old window's start time.
func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) {
	klog.Errorf("%s: terminal error, setting degraded: %v", name, err)
	if setErr := setDegraded(); setErr != nil {
		// Surface the original terminal error too, so the requeue caused by
		// the failed condition update still reports the root cause.
		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w (while handling terminal error: %v)", setErr, err)
	}
	return ctrl.Result{}, nil
}

func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
klog.V(1).Infof("Syncing cloud-conf ConfigMap")

// Deferred dispatcher: classifies the returned error and calls the right handler.
// Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue.
// Transient errors enter the failure window and only degrade after the threshold.
// All nil-error paths clear the failure window.
defer func() {
if retErr == nil {
r.failures.clear()
return
}
if errors.Is(retErr, reconcile.TerminalError(nil)) {
result, retErr = r.failures.handleTerminal("CloudConfigReconciler", retErr, func() error {
return r.setDegradedCondition(ctx)
})
} else {
result, retErr = r.failures.handleTransient(r.Clock.Now(), 0, transientDegradedThreshold, "CloudConfigReconciler", retErr, func() error {
return r.setDegradedCondition(ctx)
})
}
}()
Comment on lines +94 to +179
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this generic enough common logic that it can be defined in a package?


infra := &configv1.Infrastructure{}
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil {
klog.Errorf("infrastructure resource not found")
if err := r.setDegradedCondition(ctx); err != nil {
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) {
// No cloud platform: mirror the main controller's behaviour of returning Available.
klog.Infof("Infrastructure cluster does not exist. Skipping...")
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, nil
} else if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err)
}

network := &configv1.Network{}
if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, fmt.Errorf("failed to get cluster Network: %w", err)
}

syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// nil platformStatus is a terminal misconfiguration.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to check cloud config sync requirements: %w", err))
}
if !syncNeeded {
if err := r.setAvailableCondition(ctx); err != nil {
Expand Down Expand Up @@ -135,11 +228,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus)
if err != nil {
// Unsupported platform won't change without a cluster reconfigure.
klog.Errorf("unable to get cloud config transformer function; unsupported platform")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: %w", err))
}

platformType := infra.Status.PlatformStatus.Type
Expand All @@ -161,14 +252,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil {
managedConfigFound = true
} else if errors.IsNotFound(err) {
} else if apierrors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
} else if err != nil {
klog.Errorf("unable to get managed cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
} else {
return ctrl.Result{}, fmt.Errorf("failed to get managed cloud config: %w", err)
}
}

Expand All @@ -179,7 +266,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
Name: infra.Spec.CloudConfig.Name,
Namespace: OpenshiftConfigNamespace,
}
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) {
klog.Warningf("cloud-config not found in either openshift-config-managed or openshift-config namespace")
// For platforms we manage, create a minimal valid config that will be populated by the transformer
if shouldManageManagedConfigMap(platformType, features) {
Expand All @@ -200,23 +287,30 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to prepare source cloud config: %w", err))
}

// Apply transformer if needed
if r.FeatureGateAccess == nil {
// Operator misconfiguration at startup: Permanent.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured"))
}

features, err = r.FeatureGateAccess.CurrentFeatureGates()
if err != nil {
// The feature-gate informer may not have synced yet: transient.
klog.Errorf("unable to get feature gates: %v", err)
return ctrl.Result{}, fmt.Errorf("failed to get feature gates: %w", err)
}
if cloudConfigTransformerFn != nil {
// We ignore stuff in sourceCM.BinaryData. This isn't allowed to
// contain any key that overlaps with those found in sourceCM.Data and
// we're not expecting users to put their data in the former.
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// Platform-specific transform failed on the current config data: terminal.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to transform cloud config: %w", err))
}
sourceCM.Data[defaultConfigKey] = output
}
Expand All @@ -229,17 +323,14 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}
}

// Sync the transformed config to the target configmap for CCM consumption
if err := r.syncCloudConfigData(ctx, sourceCM); err != nil {
klog.Errorf("unable to sync cloud config")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, fmt.Errorf("failed to sync cloud config: %w", err)
}

if err := r.setAvailableCondition(ctx); err != nil {
Expand Down Expand Up @@ -335,7 +426,7 @@ func (r *CloudConfigReconciler) syncConfigMapToTarget(ctx context.Context, sourc

// Check if target exists
err := r.Get(ctx, targetKey, targetCM)
if err != nil && !errors.IsNotFound(err) {
if err != nil && !apierrors.IsNotFound(err) {
return fmt.Errorf("failed to get target configmap %s/%s: %w", targetNamespace, targetName, err)
}

Expand Down
Loading