
Commit 704a7df

[release-21.0] Fix deadlock between health check and topology watcher (#16995) (#17010)
Signed-off-by: Manan Gupta <[email protected]>
Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com>
1 parent 89f3707 commit 704a7df

2 files changed, +73 -2 lines changed

go/vt/discovery/healthcheck.go

Lines changed: 8 additions & 2 deletions
@@ -360,7 +360,7 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur
 		healthy: make(map[KeyspaceShardTabletType][]*TabletHealth),
 		subscribers: make(map[chan *TabletHealth]struct{}),
 		cellAliases: make(map[string]string),
-		loadTabletsTrigger: make(chan struct{}),
+		loadTabletsTrigger: make(chan struct{}, 1),
 	}
 	var topoWatchers []*TopologyWatcher
 	cells := strings.Split(cellsToWatch, ",")
@@ -539,7 +539,13 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
 		if prevTarget.TabletType == topodata.TabletType_PRIMARY {
 			if primaries := hc.healthData[oldTargetKey]; len(primaries) == 0 {
 				log.Infof("We will have no health data for the next new primary tablet after demoting the tablet: %v, so start loading tablets now", topotools.TabletIdent(th.Tablet))
-				hc.loadTabletsTrigger <- struct{}{}
+				// We want to trigger a loadTablets call, but if the channel is not empty,
+				// then a trigger is already scheduled and we don't need to trigger another one.
+				// This also prevents the code from deadlocking as described in https://github.com/vitessio/vitess/issues/16994.
+				select {
+				case hc.loadTabletsTrigger <- struct{}{}:
+				default:
+				}
 			}
 		}
 	}
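
The shape of the fix is a common Go pattern: a trigger channel with capacity one, written to with a select/default so the sender never blocks, and a consumer that performs one refresh per received token; triggers that arrive while one is already pending are simply coalesced. Below is a minimal, self-contained sketch of that pattern under illustrative names (trigger, loadTrigger, and the refresh loop are not Vitess APIs):

package main

import (
	"fmt"
	"time"
)

// trigger requests a refresh without ever blocking the caller. If a request
// is already pending in the buffered channel, the new one is coalesced with it.
func trigger(ch chan struct{}) {
	select {
	case ch <- struct{}{}:
	default: // a refresh is already scheduled; nothing to do
	}
}

func main() {
	// Capacity 1: at most one pending refresh request at any time.
	loadTrigger := make(chan struct{}, 1)

	// Consumer: one refresh per received token.
	go func() {
		for range loadTrigger {
			fmt.Println("refreshing tablets")
			time.Sleep(50 * time.Millisecond) // simulate work
		}
	}()

	// Many callers can trigger concurrently; none of them block,
	// even while the consumer is busy.
	for i := 0; i < 5; i++ {
		trigger(loadTrigger)
	}
	time.Sleep(200 * time.Millisecond)
}

Because the send can never block, a caller that holds a lock while triggering can no longer be wedged by a slow or busy consumer, which is the property the commit needs from loadTabletsTrigger.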

go/vt/discovery/topology_watcher_test.go

Lines changed: 65 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	"google.golang.org/protobuf/proto"

 	"vitess.io/vitess/go/test/utils"
+	querypb "vitess.io/vitess/go/vt/proto/query"

 	"vitess.io/vitess/go/vt/logutil"
 	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
@@ -709,3 +710,67 @@ func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) {
 	assert.True(t, proto.Equal(tablet1, allTablets[key1]))
 	assert.True(t, proto.Equal(tablet2, allTablets[key2]))
 }
+
+// TestDeadlockBetweenTopologyWatcherAndHealthCheck tests the possibility of a deadlock
+// between the topology watcher and the health check.
+// The issue https://github.com/vitessio/vitess/issues/16994 has more details on the deadlock.
+func TestDeadlockBetweenTopologyWatcherAndHealthCheck(t *testing.T) {
+	ctx := utils.LeakCheckContext(t)
+
+	// Create a new memory topo server and a health check instance.
+	ts, _ := memorytopo.NewServerAndFactory(ctx, "zone-1")
+	hc := NewHealthCheck(ctx, time.Hour, time.Hour, ts, "zone-1", "", nil)
+	defer hc.Close()
+	defer hc.topoWatchers[0].Stop()
+
+	// Add a tablet to the topology.
+	tablet1 := &topodatapb.Tablet{
+		Alias: &topodatapb.TabletAlias{
+			Cell: "zone-1",
+			Uid:  100,
+		},
+		Type:     topodatapb.TabletType_REPLICA,
+		Hostname: "host1",
+		PortMap: map[string]int32{
+			"grpc": 123,
+		},
+		Keyspace: "keyspace",
+		Shard:    "shard",
+	}
+	err := ts.CreateTablet(ctx, tablet1)
+	// Run the first loadTablets call to ensure the tablet is present in the topology watcher.
+	hc.topoWatchers[0].loadTablets()
+	require.NoError(t, err)
+
+	// We want to run updateHealth with arguments that always
+	// make it trigger loadTablets.
+	th := &TabletHealth{
+		Tablet: tablet1,
+		Target: &querypb.Target{
+			Keyspace:   "keyspace",
+			Shard:      "shard",
+			TabletType: topodatapb.TabletType_REPLICA,
+		},
+	}
+	prevTarget := &querypb.Target{
+		Keyspace:   "keyspace",
+		Shard:      "shard",
+		TabletType: topodatapb.TabletType_PRIMARY,
+	}
+
+	// If we run the updateHealth function often enough, then we
+	// will see the deadlock where the topology watcher is trying to replace
+	// the tablet in the health check, but the health check has the mutex acquired
+	// already because it is calling updateHealth.
+	// updateHealth itself will be stuck trying to send on the shared channel.
+	for i := 0; i < 10; i++ {
+		// Update the port of the tablet so that when updateHealth asks the topo watcher to
+		// refresh the tablets, it finds an update and tries to replace it.
+		_, err = ts.UpdateTabletFields(ctx, tablet1.Alias, func(t *topodatapb.Tablet) error {
+			t.PortMap["testing_port"] = int32(i + 1)
+			return nil
+		})
+		require.NoError(t, err)
+		hc.updateHealth(th, prevTarget, false, false)
+	}
+}
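
The cycle the test provokes can be reduced to two goroutines: one holds a mutex and performs a blocking send on an unbuffered channel, while the only receiver needs that same mutex before it ever reaches the receive. The sketch below is a deliberately simplified model of that cycle (it collapses the topology watcher and the trigger consumer into one goroutine, and none of the names are Vitess APIs); it reports the stuck state after a timeout instead of hanging:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var mu sync.Mutex
	trigger := make(chan struct{}) // unbuffered, like loadTabletsTrigger before the fix

	// "Topology watcher" side: needs the mutex (e.g. to replace a tablet in
	// the health check) before it ever reads from the trigger channel.
	go func() {
		mu.Lock()
		defer mu.Unlock()
		<-trigger
	}()

	done := make(chan struct{})
	// "updateHealth" side: holds the mutex while sending on the trigger channel.
	go func() {
		mu.Lock()
		defer mu.Unlock()
		trigger <- struct{}{} // whichever goroutine wins the lock, the other can never unblock it
		close(done)
	}()

	select {
	case <-done:
		fmt.Println("no deadlock")
	case <-time.After(time.Second):
		fmt.Println("deadlocked: the sender holds the mutex its receiver needs")
	}
}

With the buffered channel and the select/default send from this commit, the "updateHealth" side returns from the trigger immediately and the cycle never forms, which is why the loop in the test completes instead of hanging.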
