Skip to content

Commit bccb019

Browse files
GuptaManan100frouioui
authored andcommitted
Fix deadlock between health check and topology watcher (#16995)
Signed-off-by: Manan Gupta <[email protected]> Signed-off-by: Florent Poinsard <[email protected]>
1 parent 1ae91e3 commit bccb019

File tree

2 files changed

+73
-2
lines changed

2 files changed

+73
-2
lines changed

go/vt/discovery/healthcheck.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur
323323
healthy: make(map[KeyspaceShardTabletType][]*TabletHealth),
324324
subscribers: make(map[chan *TabletHealth]struct{}),
325325
cellAliases: make(map[string]string),
326-
loadTabletsTrigger: make(chan struct{}),
326+
loadTabletsTrigger: make(chan struct{}, 1),
327327
}
328328
var topoWatchers []*TopologyWatcher
329329
var filter TabletFilter
@@ -516,7 +516,13 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
516516
if prevTarget.TabletType == topodata.TabletType_PRIMARY {
517517
if primaries := hc.healthData[oldTargetKey]; len(primaries) == 0 {
518518
log.Infof("We will have no health data for the next new primary tablet after demoting the tablet: %v, so start loading tablets now", topotools.TabletIdent(th.Tablet))
519-
hc.loadTabletsTrigger <- struct{}{}
519+
// We want to trigger a loadTablets call, but if the channel is not empty
520+
// then a trigger is already scheduled, we don't need to trigger another one.
521+
// This also prevents the code from deadlocking as described in https://github.com/vitessio/vitess/issues/16994.
522+
select {
523+
case hc.loadTabletsTrigger <- struct{}{}:
524+
default:
525+
}
520526
}
521527
}
522528
}

go/vt/discovery/topology_watcher_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"google.golang.org/protobuf/proto"
2828

2929
"vitess.io/vitess/go/test/utils"
30+
querypb "vitess.io/vitess/go/vt/proto/query"
3031

3132
"vitess.io/vitess/go/vt/logutil"
3233
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
@@ -630,3 +631,67 @@ func TestFilterByKeypsaceSkipsIgnoredTablets(t *testing.T) {
630631

631632
tw.Stop()
632633
}
634+
635+
// TestDeadlockBetweenTopologyWatcherAndHealthCheck tests the possibility of a deadlock
636+
// between the topology watcher and the health check.
637+
// The issue https://github.com/vitessio/vitess/issues/16994 has more details on the deadlock.
638+
func TestDeadlockBetweenTopologyWatcherAndHealthCheck(t *testing.T) {
639+
ctx := utils.LeakCheckContext(t)
640+
641+
// create a new memory topo server and an health check instance.
642+
ts, _ := memorytopo.NewServerAndFactory(ctx, "zone-1")
643+
hc := NewHealthCheck(ctx, time.Hour, time.Hour, ts, "zone-1", "")
644+
defer hc.Close()
645+
defer hc.topoWatchers[0].Stop()
646+
647+
// Add a tablet to the topology.
648+
tablet1 := &topodatapb.Tablet{
649+
Alias: &topodatapb.TabletAlias{
650+
Cell: "zone-1",
651+
Uid: 100,
652+
},
653+
Type: topodatapb.TabletType_REPLICA,
654+
Hostname: "host1",
655+
PortMap: map[string]int32{
656+
"grpc": 123,
657+
},
658+
Keyspace: "keyspace",
659+
Shard: "shard",
660+
}
661+
err := ts.CreateTablet(ctx, tablet1)
662+
// Run the first loadTablets call to ensure the tablet is present in the topology watcher.
663+
hc.topoWatchers[0].loadTablets()
664+
require.NoError(t, err)
665+
666+
// We want to run updateHealth with arguments that always
667+
// make it trigger load Tablets.
668+
th := &TabletHealth{
669+
Tablet: tablet1,
670+
Target: &querypb.Target{
671+
Keyspace: "keyspace",
672+
Shard: "shard",
673+
TabletType: topodatapb.TabletType_REPLICA,
674+
},
675+
}
676+
prevTarget := &querypb.Target{
677+
Keyspace: "keyspace",
678+
Shard: "shard",
679+
TabletType: topodatapb.TabletType_PRIMARY,
680+
}
681+
682+
// If we run the updateHealth function often enough, then we
683+
// will see the deadlock where the topology watcher is trying to replace
684+
// the tablet in the health check, but health check has the mutex acquired
685+
// already because it is calling updateHealth.
686+
// updateHealth itself will be stuck trying to send on the shared channel.
687+
for i := 0; i < 10; i++ {
688+
// Update the port of the tablet so that when update Health asks topo watcher to
689+
// refresh the tablets, it finds an update and tries to replace it.
690+
_, err = ts.UpdateTabletFields(ctx, tablet1.Alias, func(t *topodatapb.Tablet) error {
691+
t.PortMap["testing_port"] = int32(i + 1)
692+
return nil
693+
})
694+
require.NoError(t, err)
695+
hc.updateHealth(th, prevTarget, false, false)
696+
}
697+
}

0 commit comments

Comments
 (0)