diff --git a/pkg/monitor/backenddisruption/disruption_backend_sampler.go b/pkg/monitor/backenddisruption/disruption_backend_sampler.go index a8200b040ec1..f66279a40af9 100644 --- a/pkg/monitor/backenddisruption/disruption_backend_sampler.go +++ b/pkg/monitor/backenddisruption/disruption_backend_sampler.go @@ -231,6 +231,12 @@ func (b *BackendSampler) WithSamplerHooks(samplerHooks []SamplerHook) *BackendSa return b } +// WithTimeout sets a custom timeout for HTTP requests by storing a pointer to timeout in b.timeout, and returns b so the call can be chained. It is intended to include DNS resolution and connection establishment; NOTE(review): confirm against the code that reads b.timeout, which this hunk does not show. +func (b *BackendSampler) WithTimeout(timeout time.Duration) *BackendSampler { + b.timeout = &timeout + return b +} + // bodyMatches checks the body content and returns an error if it doesn't match the expected. func (b *BackendSampler) bodyMatches(body []byte) error { switch { diff --git a/pkg/monitortests/network/disruptionserviceloadbalancer/monitortest.go b/pkg/monitortests/network/disruptionserviceloadbalancer/monitortest.go index c6cad8e760c5..9931ed963e6c 100644 --- a/pkg/monitortests/network/disruptionserviceloadbalancer/monitortest.go +++ b/pkg/monitortests/network/disruptionserviceloadbalancer/monitortest.go @@ -256,7 +256,7 @@ func (w *availability) PrepareCollection(ctx context.Context, adminRESTConfig *r // Hit it once before considering ourselves ready fmt.Fprintf(os.Stderr, "hitting pods through the service's LoadBalancer\n") - timeout := 10 * time.Minute + timeout := 20 * time.Minute // require thirty seconds of passing requests to continue (in case the SLB becomes available and then degrades) // TODO this seems weird to @deads2k, why is status not trustworthy baseURL := fmt.Sprintf("http://%s", net.JoinHostPort(tcpIngressIP, strconv.Itoa(svcPort))) @@ -266,18 +266,24 @@ func (w *availability) PrepareCollection(ctx context.Context, adminRESTConfig *r return fmt.Errorf("could not reach %v reliably: %w", url, err) } + // Use longer timeout to accommodate slow DNS resolution. 
AWS ELB DNS zones may have an SOA TTL of 900s, + // which means DNS propagation can take up to 15 minutes. Set the request timeout high enough to allow + // DNS retries during this propagation period. NOTE(review): 120s is much shorter than the 15-minute window described here; confirm it is meant as a per-request budget rather than a bound on the whole propagation period. + connectionTimeout := 120 * time.Second newConnectionDisruptionSampler := backenddisruption.NewSimpleBackendFromOpenshiftTests( baseURL, "service-load-balancer-with-pdb-new-connections", path, monitorapi.NewConnectionType). - WithExpectedBody("hello") + WithExpectedBody("hello"). + WithTimeout(connectionTimeout) reusedConnectionDisruptionSampler := backenddisruption.NewSimpleBackendFromOpenshiftTests( baseURL, "service-load-balancer-with-pdb-reused-connections", path, monitorapi.ReusedConnectionType). - WithExpectedBody("hello") + WithExpectedBody("hello"). + WithTimeout(connectionTimeout) w.disruptionChecker = disruptionlibrary.NewAvailabilityInvariant( newConnectionTestName, reusedConnectionTestName,