Skip to content

Commit 096694f

Browse files
matheuscscpgithub-actions[bot]
authored andcommitted
Introduce DefaultToRetryOnFailure feature gate
Signed-off-by: Matheus Pimenta <[email protected]> (cherry picked from commit b40c798)
1 parent 2447764 commit 096694f

File tree

13 files changed

+99
-54
lines changed

13 files changed

+99
-54
lines changed

api/v2/helmrelease_types.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ type Remediation interface {
489489
// UpgradeStrategy.
490490
// +kubebuilder:object:generate=false
491491
type Strategy interface {
492-
GetRetry() Retry
492+
GetRetry(defaultToRetryOnFailure bool) Retry
493493
}
494494

495495
// Retry defines a consistent interface for retry strategies from
@@ -511,7 +511,8 @@ type Install struct {
511511
Timeout *metav1.Duration `json:"timeout,omitempty"`
512512

513513
// Strategy defines the install strategy to use for this HelmRelease.
514-
// Defaults to 'RemediateOnFailure'.
514+
// Defaults to 'RemediateOnFailure', or 'RetryOnFailure' when the
515+
// DefaultToRetryOnFailure feature gate is enabled.
515516
// +optional
516517
Strategy *InstallStrategy `json:"strategy,omitempty"`
517518

@@ -615,8 +616,14 @@ func (in Install) GetRemediation() Remediation {
615616

616617
// GetRetry returns the configured retry strategy for the Helm install
617618
// action.
618-
func (in Install) GetRetry() Retry {
619-
if in.Strategy == nil || in.Strategy.Name != string(ActionStrategyRetryOnFailure) {
619+
func (in Install) GetRetry(defaultToRetryOnFailure bool) Retry {
620+
if in.Strategy == nil {
621+
if defaultToRetryOnFailure {
622+
return &InstallStrategy{Name: string(ActionStrategyRetryOnFailure)}
623+
}
624+
return nil
625+
}
626+
if in.Strategy.Name != string(ActionStrategyRetryOnFailure) {
620627
return nil
621628
}
622629
return in.Strategy
@@ -764,7 +771,8 @@ type Upgrade struct {
764771
Timeout *metav1.Duration `json:"timeout,omitempty"`
765772

766773
// Strategy defines the upgrade strategy to use for this HelmRelease.
767-
// Defaults to 'RemediateOnFailure'.
774+
// Defaults to 'RemediateOnFailure', or 'RetryOnFailure' when the
775+
// DefaultToRetryOnFailure feature gate is enabled.
768776
// +optional
769777
Strategy *UpgradeStrategy `json:"strategy,omitempty"`
770778

@@ -866,8 +874,14 @@ func (in Upgrade) GetRemediation() Remediation {
866874

867875
// GetRetry returns the configured retry strategy for the Helm upgrade
868876
// action.
869-
func (in Upgrade) GetRetry() Retry {
870-
if in.Strategy == nil || in.Strategy.Name != string(ActionStrategyRetryOnFailure) {
877+
func (in Upgrade) GetRetry(defaultToRetryOnFailure bool) Retry {
878+
if in.Strategy == nil {
879+
if defaultToRetryOnFailure {
880+
return &UpgradeStrategy{Name: string(ActionStrategyRetryOnFailure)}
881+
}
882+
return nil
883+
}
884+
if in.Strategy.Name != string(ActionStrategyRetryOnFailure) {
871885
return nil
872886
}
873887
return in.Strategy
@@ -1437,13 +1451,14 @@ func (in HelmRelease) GetActiveRemediation() Remediation {
14371451
}
14381452

14391453
// GetActiveRetry returns the active retry configuration for the
1440-
// HelmRelease.
1441-
func (in HelmRelease) GetActiveRetry() Retry {
1454+
// HelmRelease. When defaultToRetryOnFailure is true and no strategy
1455+
// is explicitly configured, it defaults to RetryOnFailure.
1456+
func (in HelmRelease) GetActiveRetry(defaultToRetryOnFailure bool) Retry {
14421457
switch in.Status.LastAttemptedReleaseAction {
14431458
case ReleaseActionInstall:
1444-
return in.GetInstall().GetRetry()
1459+
return in.GetInstall().GetRetry(defaultToRetryOnFailure)
14451460
case ReleaseActionUpgrade:
1446-
return in.GetUpgrade().GetRetry()
1461+
return in.GetUpgrade().GetRetry(defaultToRetryOnFailure)
14471462
default:
14481463
return nil
14491464
}

config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,8 @@ spec:
492492
strategy:
493493
description: |-
494494
Strategy defines the install strategy to use for this HelmRelease.
495-
Defaults to 'RemediateOnFailure'.
495+
Defaults to 'RemediateOnFailure', or 'RetryOnFailure' when the
496+
DefaultToRetryOnFailure feature gate is enabled.
496497
properties:
497498
name:
498499
description: Name of the install strategy.
@@ -1013,7 +1014,8 @@ spec:
10131014
strategy:
10141015
description: |-
10151016
Strategy defines the upgrade strategy to use for this HelmRelease.
1016-
Defaults to 'RemediateOnFailure'.
1017+
Defaults to 'RemediateOnFailure', or 'RetryOnFailure' when the
1018+
DefaultToRetryOnFailure feature gate is enabled.
10171019
properties:
10181020
name:
10191021
description: Name of the upgrade strategy.

docs/api/v2/helm.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,7 +2029,8 @@ InstallStrategy
20292029
<td>
20302030
<em>(Optional)</em>
20312031
<p>Strategy defines the install strategy to use for this HelmRelease.
2032-
Defaults to &lsquo;RemediateOnFailure&rsquo;.</p>
2032+
Defaults to &lsquo;RemediateOnFailure&rsquo;, or &lsquo;RetryOnFailure&rsquo; when the
2033+
DefaultToRetryOnFailure feature gate is enabled.</p>
20332034
</td>
20342035
</tr>
20352036
<tr>
@@ -3154,7 +3155,8 @@ UpgradeStrategy
31543155
<td>
31553156
<em>(Optional)</em>
31563157
<p>Strategy defines the upgrade strategy to use for this HelmRelease.
3157-
Defaults to &lsquo;RemediateOnFailure&rsquo;.</p>
3158+
Defaults to &lsquo;RemediateOnFailure&rsquo;, or &lsquo;RetryOnFailure&rsquo; when the
3159+
DefaultToRetryOnFailure feature gate is enabled.</p>
31583160
</td>
31593161
</tr>
31603162
<tr>

docs/spec/v2/helmreleases.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,8 @@ The field offers the following subfields:
576576
`RemediateOnFailure` or `RetryOnFailure`.
577577
If the `.spec.install.strategy` field is not specified, the HelmRelease
578578
reconciliation behaves as if `.spec.install.strategy.name` was set to
579-
`RemediateOnFailure`.
579+
`RemediateOnFailure`, or `RetryOnFailure` when the
580+
`DefaultToRetryOnFailure` feature gate is enabled.
580581
- `.retryInterval` (Optional): The time to wait between retries of failed
581582
releases when the install strategy is set to `RetryOnFailure`. Defaults
582583
to `5m`. Cannot be used with `RemediateOnFailure`.
@@ -654,7 +655,9 @@ The field offers the following subfields:
654655
- `.name` (Required): The name of the upgrade strategy to use. One of
655656
`RemediateOnFailure` or `RetryOnFailure`. If the `.spec.upgrade.strategy`
656657
field is not specified, the HelmRelease reconciliation behaves as if
657-
`.spec.upgrade.strategy.name` was set to `RemediateOnFailure`.
658+
`.spec.upgrade.strategy.name` was set to `RemediateOnFailure`, or
659+
`RetryOnFailure` when the `DefaultToRetryOnFailure` feature gate is
660+
enabled.
658661
- `.retryInterval` (Optional): The time to wait between retries of failed
659662
releases when the upgrade strategy is set to `RetryOnFailure`. Defaults
660663
to `5m`. Cannot be used with `RemediateOnFailure`.

internal/controller/helmrelease_controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ type HelmReleaseReconciler struct {
109109

110110
AdditiveCELDependencyCheck bool
111111
AllowExternalArtifact bool
112+
DefaultToRetryOnFailure bool
112113
DirectSourceFetch bool
113114
DisableChartDigestTracking bool
114115
}
@@ -415,7 +416,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context,
415416
}
416417

417418
// Off we go!
418-
if err = intreconcile.NewAtomicRelease(patchHelper, cfg, r.EventRecorder, r.FieldManager, r.DisallowedFieldManagers).Reconcile(ctx, &intreconcile.Request{
419+
if err = intreconcile.NewAtomicRelease(patchHelper, cfg, r.EventRecorder, r.FieldManager, r.DisallowedFieldManagers, r.DefaultToRetryOnFailure).Reconcile(ctx, &intreconcile.Request{
419420
Object: obj,
420421
Chart: loadedChart,
421422
Values: values,
@@ -433,7 +434,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context,
433434
}
434435
switch {
435436
case errors.Is(err, intreconcile.ErrRetryAfterInterval):
436-
return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetActiveRetry().GetRetryInterval()}), nil
437+
return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetActiveRetry(r.DefaultToRetryOnFailure).GetRetryInterval()}), nil
437438
case errors.Is(err, intreconcile.ErrMustRequeue):
438439
return ctrl.Result{Requeue: true}, nil
439440
case interrors.IsOneOf(err, intreconcile.ErrExceededMaxRetries, intreconcile.ErrMissingRollbackTarget):

internal/features/features.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ const (
7979
// to allow immediate processing of the new reconciliation request. This can
8080
// help avoid getting stuck on failing deployments when fixes are available.
8181
CancelHealthCheckOnNewRevision = "CancelHealthCheckOnNewRevision"
82+
83+
// DefaultToRetryOnFailure changes the default install/upgrade strategy
84+
// from RemediateOnFailure to RetryOnFailure when the user has not
85+
// explicitly configured a strategy. Unlike RemediateOnFailure, which
86+
// has a retry budget, RetryOnFailure retries indefinitely and
87+
// auto-clears failures on success, providing better UX especially
88+
// when CancelHealthCheckOnNewRevision is enabled.
89+
DefaultToRetryOnFailure = "DefaultToRetryOnFailure"
8290
)
8391

8492
var features = map[string]bool{
@@ -121,6 +129,9 @@ var features = map[string]bool{
121129
// CancelHealthCheckOnNewRevision
122130
// opt-in from v1.5.0
123131
CancelHealthCheckOnNewRevision: false,
132+
// DefaultToRetryOnFailure
133+
// opt-in from v1.5.2
134+
DefaultToRetryOnFailure: false,
124135
}
125136

126137
func init() {

internal/reconcile/atomic_release.go

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,20 @@ type AtomicRelease struct {
117117
strategy releaseStrategy
118118
fieldManager string
119119
disallowedFieldManagers []string
120+
defaultToRetryOnFailure bool
120121
}
121122

122123
// NewAtomicRelease returns a new AtomicRelease reconciler configured with the
123124
// provided values.
124-
func NewAtomicRelease(patchHelper *patch.SerialPatcher, cfg *action.ConfigFactory, recorder record.EventRecorder, fieldManager string, disallowedFieldManagers []string) *AtomicRelease {
125+
func NewAtomicRelease(patchHelper *patch.SerialPatcher, cfg *action.ConfigFactory, recorder record.EventRecorder, fieldManager string, disallowedFieldManagers []string, defaultToRetryOnFailure bool) *AtomicRelease {
125126
return &AtomicRelease{
126127
patchHelper: patchHelper,
127128
eventRecorder: recorder,
128129
configFactory: cfg,
129130
strategy: &cleanReleaseStrategy{},
130131
fieldManager: fieldManager,
131132
disallowedFieldManagers: disallowedFieldManagers,
133+
defaultToRetryOnFailure: defaultToRetryOnFailure,
132134
}
133135
}
134136

@@ -230,7 +232,7 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error {
230232
fmt.Sprintf("instructed to stop before running %s action reconciler %s", next.Type(), next.Name()),
231233
)
232234

233-
if retry := req.Object.GetActiveRetry(); retry != nil {
235+
if retry := req.Object.GetActiveRetry(r.defaultToRetryOnFailure); retry != nil {
234236
conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "retrying after %s", retry.GetRetryInterval().String())
235237
return ErrRetryAfterInterval
236238
}
@@ -270,7 +272,7 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error {
270272
fmt.Sprintf("action reconciler %s of type %s returned error: %s", next.Name(), next.Type(), err),
271273
)
272274

273-
if retry := req.Object.GetActiveRetry(); retry != nil {
275+
if retry := req.Object.GetActiveRetry(r.defaultToRetryOnFailure); retry != nil {
274276
log.Error(err, fmt.Sprintf("failed to run '%s' action", next.Name()))
275277
conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "retrying after %s", retry.GetRetryInterval().String())
276278
return ErrRetryAfterInterval
@@ -288,7 +290,7 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error {
288290
"instructed to stop after running %s action reconciler %s", next.Type(), next.Name()),
289291
)
290292

291-
if retry := req.Object.GetActiveRetry(); retry != nil {
293+
if retry := req.Object.GetActiveRetry(r.defaultToRetryOnFailure); retry != nil {
292294
conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "retrying after %s", retry.GetRetryInterval().String())
293295
return ErrRetryAfterInterval
294296
}
@@ -331,7 +333,7 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
331333
case ReleaseStatusInSync:
332334
log.Info("release in-sync with desired state")
333335

334-
if retry := req.Object.GetActiveRetry(); retry != nil {
336+
if retry := req.Object.GetActiveRetry(r.defaultToRetryOnFailure); retry != nil {
335337
req.Object.Status.History.TruncateIgnoringPreviousSnapshots()
336338
} else {
337339
// Remove all history up to the previous release action.
@@ -347,7 +349,7 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
347349

348350
if forceRequested {
349351
log.Info(msgWithReason("forcing upgrade for in-sync release", "force requested through annotation"))
350-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
352+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
351353
}
352354

353355
// Since the release is in-sync, remove any remediated condition if
@@ -389,33 +391,33 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
389391
if req.Object.GetInstall().GetRemediation().RetriesExhausted(req.Object) {
390392
if forceRequested {
391393
log.Info(msgWithReason("forcing install while out of retries", "force requested through annotation"))
392-
return NewInstall(r.configFactory, r.eventRecorder), nil
394+
return NewInstall(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
393395
}
394396

395397
return nil, fmt.Errorf("%w: cannot install release", ErrExceededMaxRetries)
396398
}
397399

398-
return NewInstall(r.configFactory, r.eventRecorder), nil
400+
return NewInstall(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
399401
case ReleaseStatusUnmanaged:
400402
log.Info(msgWithReason("release not managed by controller", state.Reason))
401403

402404
// Clear the history as we can no longer rely on it.
403405
req.Object.Status.ClearHistory()
404406

405-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
407+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
406408
case ReleaseStatusOutOfSync:
407409
log.Info(msgWithReason("release out-of-sync with desired state", state.Reason))
408410

409411
if req.Object.GetUpgrade().GetRemediation().RetriesExhausted(req.Object) {
410412
if forceRequested {
411413
log.Info(msgWithReason("forcing upgrade while out of retries", "force requested through annotation"))
412-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
414+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
413415
}
414416

415417
return nil, fmt.Errorf("%w: cannot upgrade release", ErrExceededMaxRetries)
416418
}
417419

418-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
420+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
419421
case ReleaseStatusDrifted:
420422
log.Info(msgWithReason("detected changes in cluster state", diff.SummarizeDiffSetBrief(state.Diff)))
421423
for _, change := range state.Diff {
@@ -467,11 +469,11 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
467469

468470
// If the action strategy is to retry (and not remediate), we behave just like
469471
// "flux reconcile hr --force" and .spec.<action>.remediation.retries set to 0.
470-
if req.Object.GetActiveRetry() != nil {
472+
if req.Object.GetActiveRetry(r.defaultToRetryOnFailure) != nil {
471473
req.Object.Status.History.TruncateIgnoringPreviousSnapshots()
472474

473475
log.V(logger.DebugLevel).Info("retrying upgrade for failed release")
474-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
476+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
475477
}
476478

477479
remediation := req.Object.GetActiveRemediation()
@@ -480,7 +482,7 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
480482
// upgrade the release to see if that fixes the problem.
481483
if remediation == nil {
482484
log.V(logger.DebugLevel).Info("no active remediation strategy")
483-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
485+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
484486
}
485487

486488
// If there is no failure count, the conditions under which the failure
@@ -490,14 +492,14 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
490492
// attempted again.
491493
if remediation.GetFailureCount(req.Object) <= 0 {
492494
log.Info("release conditions have changed since last failure")
493-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
495+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
494496
}
495497

496498
// If the force annotation is set, we can attempt to upgrade the release
497499
// without any further checks.
498500
if forceRequested {
499501
log.Info(msgWithReason("forcing upgrade for failed release", "force requested through annotation"))
500-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
502+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
501503
}
502504

503505
// We have exhausted the number of retries for the remediation
@@ -526,7 +528,7 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state
526528
// If the rollback target is in any way corrupt,
527529
// the most correct remediation is to reattempt the upgrade.
528530
log.Info(msgWithReason("unable to verify previous release in storage to roll back to", err.Error()))
529-
return NewUpgrade(r.configFactory, r.eventRecorder), nil
531+
return NewUpgrade(r.configFactory, r.eventRecorder, r.defaultToRetryOnFailure), nil
530532
}
531533

532534
// This may be a temporary error, return it to retry.

internal/reconcile/atomic_release_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ func TestAtomicRelease_Reconcile(t *testing.T) {
178178
Chart: testutil.BuildChart(testutil.ChartWithTestHook()),
179179
Values: nil,
180180
}
181-
g.Expect(NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil).Reconcile(context.TODO(), req)).ToNot(HaveOccurred())
181+
g.Expect(NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil, false).Reconcile(context.TODO(), req)).ToNot(HaveOccurred())
182182

183183
g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{
184184
{
@@ -1230,7 +1230,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
12301230
Values: tt.values,
12311231
}
12321232

1233-
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil).Reconcile(context.TODO(), req)
1233+
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil, false).Reconcile(context.TODO(), req)
12341234
wantErr := BeNil()
12351235
if tt.wantErr != nil {
12361236
wantErr = MatchError(tt.wantErr)
@@ -1462,7 +1462,7 @@ func TestAtomicRelease_Reconcile_PostRenderers_Scenarios(t *testing.T) {
14621462
Values: tt.values,
14631463
}
14641464

1465-
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil).Reconcile(context.TODO(), req)
1465+
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil, false).Reconcile(context.TODO(), req)
14661466
g.Expect(err).ToNot(HaveOccurred())
14671467

14681468
g.Expect(obj.Status.ObservedPostRenderersDigest).To(Equal(tt.wantDigest))
@@ -2444,7 +2444,7 @@ func TestAtomicRelease_Reconcile_CommonMetadata_Scenarios(t *testing.T) {
24442444
Values: tt.values,
24452445
}
24462446

2447-
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil).Reconcile(context.TODO(), req)
2447+
err = NewAtomicRelease(patchHelper, cfg, recorder, testFieldManager, nil, false).Reconcile(context.TODO(), req)
24482448
g.Expect(err).ToNot(HaveOccurred())
24492449

24502450
g.Expect(obj.Status.ObservedCommonMetadataDigest).To(Equal(tt.wantDigest))

0 commit comments

Comments
 (0)