Skip to content

Commit 4cfb628

Browse files
gcs278 and claude committed
OCPBUGS-83281: Wait for Gateway deletion before removing GatewayClass in test cleanup
The Gateway API test cleanup was deleting Gateways and immediately proceeding to delete the GatewayClass and istiod without waiting for the Gateway resources to be fully removed. Since the gateway deployment and pods have an owner reference to the Gateway, they are cascade-deleted by Kubernetes GC only after the Gateway is gone. If the GatewayClass and istiod are removed first, the gateway pods lose their control plane and crash-loop, generating pathological "Back-off restarting failed container" events that fail CI invariant monitors. Add a shared waitForGatewayDeletion helper that both the upgrade test Teardown and the controller test AfterEach use to ensure Gateways are fully deleted before proceeding with GatewayClass cleanup. https://redhat.atlassian.net/browse/OCPBUGS-83281 Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 9df27cd commit 4cfb628

2 files changed

Lines changed: 19 additions & 15 deletions

File tree

test/extended/router/gatewayapi_upgrade.go

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313

1414
apierrors "k8s.io/apimachinery/pkg/api/errors"
1515
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16-
"k8s.io/apimachinery/pkg/util/wait"
1716
e2e "k8s.io/kubernetes/test/e2e/framework"
1817
"k8s.io/kubernetes/test/e2e/upgrades"
1918
gatewayv1 "sigs.k8s.io/gateway-api/apis/v1"
@@ -240,21 +239,8 @@ func (t *GatewayAPIUpgradeTest) Teardown(ctx context.Context, f *e2e.Framework)
240239
e2e.Logf("Failed to delete Gateway %q: %v", t.gatewayName, err)
241240
}
242241

243-
// Wait for Gateway to be fully deleted before removing GatewayClass
244-
// This prevents orphaned resources if the controller (defined by GatewayClass) is removed
245-
// before Istiod completes cleanup
246242
g.By("Waiting for Gateway to be fully deleted")
247-
err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, false, func(ctx context.Context) (bool, error) {
248-
_, err := t.oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Get(ctx, t.gatewayName, metav1.GetOptions{})
249-
if apierrors.IsNotFound(err) {
250-
e2e.Logf("Gateway %q successfully deleted", t.gatewayName)
251-
return true, nil
252-
}
253-
return false, nil
254-
})
255-
if err != nil {
256-
e2e.Logf("Gateway %q still exists after 2 minutes, continuing cleanup anyway", t.gatewayName)
257-
}
243+
waitForGatewayDeletion(t.oc, t.gatewayName)
258244

259245
g.By("Deleting the GatewayClass")
260246
err = t.oc.AdminGatewayApiClient().GatewayV1().GatewayClasses().Delete(ctx, gatewayClassName, metav1.DeleteOptions{})

test/extended/router/gatewayapicontroller.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ var _ = g.Describe("[sig-network-edge][OCPFeatureGate:GatewayAPIController][Feat
152152
o.Expect(err).NotTo(o.HaveOccurred(), "Gateway %s could not be deleted", name)
153153
}
154154

155+
g.By("Waiting for gateways to be fully deleted")
156+
for _, name := range gateways {
157+
waitForGatewayDeletion(oc, name)
158+
}
159+
155160
g.By("Deleting the GatewayClass")
156161

157162
if err := oc.AdminGatewayApiClient().GatewayV1().GatewayClasses().Delete(context.Background(), gatewayClassName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
@@ -1324,6 +1329,19 @@ func waitForIstiodPodDeletion(oc *exutil.CLI) {
13241329
}).WithTimeout(10 * time.Minute).WithPolling(10 * time.Second).Should(o.Succeed())
13251330
}
13261331

1332+
// waitForGatewayDeletion waits for a Gateway resource to be fully deleted.
1333+
// The gateway deployment and pods have an owner reference to the Gateway, so
1334+
// Kubernetes garbage collection will cascade-delete them once the Gateway is gone.
1335+
// This must complete before removing the GatewayClass or istiod, otherwise
1336+
// orphaned gateway pods will crash-loop without their control plane.
1337+
func waitForGatewayDeletion(oc *exutil.CLI, gatewayName string) {
1338+
e2e.Logf("Waiting for Gateway %q to be fully deleted", gatewayName)
1339+
o.Eventually(func(g o.Gomega) {
1340+
_, err := oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Get(context.Background(), gatewayName, metav1.GetOptions{})
1341+
g.Expect(apierrors.IsNotFound(err)).To(o.BeTrue(), "Gateway %q still exists", gatewayName)
1342+
}).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).Should(o.Succeed())
1343+
}
1344+
13271345
// validateOLMBasedOSSM validates that Gateway API is using OLM-based provisioning.
13281346
func validateOLMBasedOSSM(oc *exutil.CLI, timeout time.Duration) {
13291347
pollInterval := 5 * time.Second

0 commit comments

Comments
 (0)