diff --git a/clouddriver-google/src/main/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandler.groovy b/clouddriver-google/src/main/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandler.groovy index ee1e12b1355..b3c53ccdf60 100644 --- a/clouddriver-google/src/main/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandler.groovy +++ b/clouddriver-google/src/main/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandler.groovy @@ -29,6 +29,7 @@ import com.google.api.services.compute.model.InstanceGroupManagerAutoHealingPoli import com.google.api.services.compute.model.InstanceProperties import com.google.api.services.compute.model.InstanceTemplate import com.google.api.services.compute.model.NamedPort +import com.google.api.services.compute.model.Operation import com.netflix.frigga.Names import com.netflix.spectator.api.Registry import com.netflix.spinnaker.cats.cache.Cache @@ -64,6 +65,7 @@ import com.netflix.spinnaker.moniker.Namer import groovy.util.logging.Slf4j import org.springframework.beans.factory.annotation.Autowired import org.springframework.stereotype.Component +import javax.annotation.PostConstruct import static com.google.common.base.Preconditions.checkArgument import static com.netflix.spinnaker.clouddriver.google.deploy.GCEUtil.BACKEND_SERVICE_NAMES @@ -587,14 +589,33 @@ class BasicGoogleDeployHandler implements DeployHandler - safeRetry.doRetry( + // Execute backend service update with retry logic for transient errors + def operation = safeRetry.doRetry( updateBackendServices(compute, project, backendService.name, backendService), "Load balancer backend service", task, - [400, 412], + [400, 412], // Retry on Bad Request (400) and Precondition Failed (412) [], [action: "update", phase: BASE_PHASE, operation: "updateBackendServices", (TAG_SCOPE): SCOPE_GLOBAL], registry ) + + if (operation) { + // Wait for the backend service update operation to complete + // + // Per Google Cloud documentation: + // "Backend service operations are asynchronous and return an Operation resource. + // You can use an operation resource to manage asynchronous API requests. + // For global operations, use the globalOperations resource." + // + // Without waiting for backend service updates to complete, + // subsequent health checks and traffic routing may execute before the backend service + // knows about the new instance group, causing deployment failures. + // + // This ensures that health checks and WaitForUpInstancesTask execute only after + // backend services have been fully updated with the new instance group. + task.updateStatus BASE_PHASE, "Waiting for backend service ${backendService.name} update to complete..." + googleOperationPoller.waitForGlobalOperation(compute, project, operation.getName(), + null, task, "backend service ${backendService.name}", BASE_PHASE) + } + task.updateStatus BASE_PHASE, "Done associating server group $serverGroupName with backend service ${backendService.name}." } } + // Update regional backend services and wait for operation completion if (willUpdateRegionalBackendServices) { regionBackendServicesToUpdate.each { BackendService backendService -> - safeRetry.doRetry( + // Execute regional backend service update with retry logic for transient errors + def operation = safeRetry.doRetry( updateRegionBackendServices(compute, project, region, backendService.name, backendService), "Internal load balancer backend service", task, - [400, 412], + [400, 412], // Retry on Bad Request (400) and Precondition Failed (412) [], [action: "update", phase: BASE_PHASE, operation: "updateRegionBackendServices", (TAG_SCOPE): SCOPE_REGIONAL, (TAG_REGION): region], registry ) + + if (operation) { + // Wait for the regional backend service update operation to complete + // + // Per Google Cloud documentation: + // "Regional backend service operations are asynchronous and return an Operation resource. + // You can use an operation resource to manage asynchronous API requests. + // For regional operations, use the regionOperations resource." + // + // Similar to global backend services, regional backend services require explicit waiting to ensure the new instance group + // is properly registered before health checks and traffic routing decisions are made. + task.updateStatus BASE_PHASE, "Waiting for regional backend service ${backendService.name} update to complete..." + googleOperationPoller.waitForRegionalOperation(compute, project, region, operation.getName(), + null, task, "regional backend service ${backendService.name}", BASE_PHASE) + } + task.updateStatus BASE_PHASE, "Done associating server group $serverGroupName with backend service ${backendService.name}." } } @@ -667,6 +746,24 @@ class BasicGoogleDeployHandler implements DeployHandler backend.group } - timeExecute( + return timeExecute( compute.regionBackendServices().update(project, region, backendServiceName, serviceToUpdate), "compute.regionBackendServices.update", TAG_SCOPE, SCOPE_REGIONAL, TAG_REGION, region) - null } } + /** + * Creates a closure to update global backend services with new instance groups. + * + * Per Google Cloud documentation: + * "Backend service operations are asynchronous and return an Operation resource. + * You can use an operation resource to manage asynchronous API requests. + * Operations can be global, regional or zonal. For global operations, use the globalOperations resource." + * + * The returned Operation object must be polled until completion to ensure the backend service + * update has been fully applied before proceeding with subsequent deployment steps. This prevents + * race conditions where health checks or traffic routing occurs before the backend service knows + * about the new instance group. + * + * @param compute GCP Compute API client + * @param project GCP project ID + * @param backendServiceName Name of the backend service to update + * @param backendService Backend service configuration with new backends to add + * @return Closure that returns an Operation object for the update request + */ private Closure updateBackendServices(Compute compute, String project, String backendServiceName, BackendService backendService) { return { BackendService serviceToUpdate = timeExecute( @@ -697,11 +812,10 @@ class BasicGoogleDeployHandler implements DeployHandler backend.group } - timeExecute( + return timeExecute( compute.backendServices().update(project, backendServiceName, serviceToUpdate), "compute.backendServices.update", TAG_SCOPE, SCOPE_GLOBAL) - null } } @@ -731,6 +845,14 @@ class BasicGoogleDeployHandler implements DeployHandler labels } diff --git a/clouddriver-google/src/main/groovy/com/netflix/spinnaker/config/GoogleConfiguration.groovy b/clouddriver-google/src/main/groovy/com/netflix/spinnaker/config/GoogleConfiguration.groovy index 2e506f7c82a..ffc3c6d8a6a 100644 --- a/clouddriver-google/src/main/groovy/com/netflix/spinnaker/config/GoogleConfiguration.groovy +++ b/clouddriver-google/src/main/groovy/com/netflix/spinnaker/config/GoogleConfiguration.groovy @@ -68,6 +68,17 @@ class GoogleConfiguration { List fallbackInstanceTypeDisks = [] List instanceTypeDisks = [] + /** + * Feature flag: when true (default) Clouddriver will actively poll GCP asynchronous operations + * that mutate backend services and autoscalers. + * Setting this to false restores the legacy behaviour where Clouddriver does not wait for + * completion of those operations, which can re-introduce race conditions during red/black + * deployments but matches the historical behaviour. + * + * YAML path: google.defaults.enableAsyncOperationWait + */ + boolean enableAsyncOperationWait = true + GoogleInstanceTypeDisk determineInstanceTypeDisk(String instanceType) { GoogleInstanceTypeDisk instanceTypeDisk = instanceTypeDisks.find { it.instanceType == instanceType diff --git a/clouddriver-google/src/test/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandlerSpec.groovy b/clouddriver-google/src/test/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandlerSpec.groovy index 3098a3a4e3c..ae3c3029ae1 100644 --- a/clouddriver-google/src/test/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandlerSpec.groovy +++ b/clouddriver-google/src/test/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandlerSpec.groovy @@ -18,6 +18,9 @@ package com.netflix.spinnaker.clouddriver.google.deploy.handlers import com.google.api.services.compute.Compute import com.google.api.services.compute.ComputeRequest +import com.google.api.services.compute.model.Autoscaler +import com.google.api.services.compute.model.Backend +import com.google.api.services.compute.model.BackendService import com.google.api.services.compute.model.Image import com.google.api.services.compute.model.ImageList import com.google.api.services.compute.model.Instance @@ -26,10 +29,21 @@ import com.google.api.services.compute.model.MachineType import com.google.api.services.compute.model.MachineTypeList import com.google.api.services.compute.model.Network import com.google.api.services.compute.model.NetworkList +import com.google.api.services.compute.model.Operation +import com.netflix.spectator.api.Clock +import com.netflix.spectator.api.Id +import com.netflix.spectator.api.Registry +import com.netflix.spectator.api.Timer import com.netflix.spinnaker.clouddriver.data.task.Task import com.netflix.spinnaker.clouddriver.data.task.TaskRepository +import com.netflix.spinnaker.clouddriver.google.deploy.GoogleOperationPoller +import com.netflix.spinnaker.clouddriver.google.deploy.SafeRetry import com.netflix.spinnaker.clouddriver.google.deploy.description.BasicGoogleDeployDescription +import com.netflix.spinnaker.clouddriver.google.model.GoogleAutoscalingPolicy import com.netflix.spinnaker.clouddriver.google.security.GoogleCredentials +import com.netflix.spinnaker.clouddriver.google.security.GoogleNamedAccountCredentials +import com.netflix.spinnaker.clouddriver.names.NamerRegistry +import com.netflix.spinnaker.moniker.Namer import spock.lang.Ignore import spock.lang.Shared import spock.lang.Specification @@ -87,4 +101,90 @@ class BasicGoogleDeployHandlerSpec extends Specification { mock.list(_, _) >> list mock } + + def "backend service update creates closure that returns operation"() { + given: + def mockCompute = Mock(Compute) + def mockBackendServices = Mock(Compute.BackendServices) + def mockGet = Mock(Compute.BackendServices.Get) + def mockUpdate = Mock(Compute.BackendServices.Update) + def mockOperation = new Operation(name: "operation-123", status: "DONE") + + // Setup registry mocks for timeExecute + def mockClock = Mock(Clock) + def mockTimer = Mock(Timer) + def mockId = Mock(Id) + def mockRegistry = Mock(Registry) + mockRegistry.clock() >> mockClock + mockRegistry.createId(_, _) >> mockId + mockId.withTags(_) >> mockId + mockRegistry.timer(_) >> mockTimer + mockClock.monotonicTime() >> 1000L + + def handler = new BasicGoogleDeployHandler() + handler.registry = mockRegistry + + // Mock backend service data + def existingBackendService = new BackendService(name: "test-backend-service", backends: []) + def newBackendService = new BackendService( + name: "test-backend-service", + backends: [new Backend(group: "projects/test/zones/us-central1-a/instanceGroups/new-group")] + ) + + when: + def updateClosure = handler.updateBackendServices(mockCompute, "test-project", "test-backend-service", newBackendService) + def result = updateClosure.call() + + then: + // Verify GCP API calls + 2 * mockCompute.backendServices() >> mockBackendServices + 1 * mockBackendServices.get("test-project", "test-backend-service") >> mockGet + 1 * mockGet.execute() >> existingBackendService + 1 * mockBackendServices.update("test-project", "test-backend-service", _) >> mockUpdate + 1 * mockUpdate.execute() >> mockOperation + + result == mockOperation + } + + def "autoscaler operations return operations for waiting"() { + setup: + def mockRegistry = Mock(Registry) + def mockClock = Mock(Clock) + def mockTimer = Mock(Timer) + def mockId = Mock(Id) + def mockCompute = Mock(Compute) + def mockAutoscalers = Mock(Compute.Autoscalers) + def mockAutoscalerInsert = Mock(Compute.Autoscalers.Insert) + def mockRegionAutoscalers = Mock(Compute.RegionAutoscalers) + def mockRegAutoscalerInsert = Mock(Compute.RegionAutoscalers.Insert) + + mockRegistry.clock() >> mockClock + mockRegistry.createId(_, _) >> mockId + mockId.withTags(_) >> mockId + mockRegistry.timer(_) >> mockTimer + mockClock.monotonicTime() >> 1000L + + def zonalOperation = new Operation(name: "zonal-op", status: "DONE") + def regionalOperation = new Operation(name: "regional-op", status: "DONE") + + mockCompute.autoscalers() >> mockAutoscalers + mockCompute.regionAutoscalers() >> mockRegionAutoscalers + mockAutoscalers.insert(_, _, _) >> mockAutoscalerInsert + mockAutoscalerInsert.execute() >> zonalOperation + mockRegionAutoscalers.insert(_, _, _) >> mockRegAutoscalerInsert + mockRegAutoscalerInsert.execute() >> regionalOperation + + def handler = new BasicGoogleDeployHandler(registry: mockRegistry) + + when: + def zonalResult = handler.timeExecute(mockAutoscalerInsert, "compute.autoscalers.insert", "TAG_SCOPE", "SCOPE_ZONAL") + def regionalResult = handler.timeExecute(mockRegAutoscalerInsert, "compute.regionAutoscalers.insert", "TAG_SCOPE", "SCOPE_REGIONAL") + + then: + 1 * mockAutoscalerInsert.execute() >> zonalOperation + 1 * mockRegAutoscalerInsert.execute() >> regionalOperation + + zonalResult.getName() == "zonal-op" + regionalResult.getName() == "regional-op" + } }