Skip to content

Commit a0010f2

Browse files
committed
storeliveness: rememeber when support was withdrawn from a store
This patch adds some book keeping to ensure we keep track of when a store withdraws support from a remote store. We don't need to/want to persist this across node restarts. Moreover, this piece of information doesn't generalize to requesting support. For these reasons, we decide not to add a field on slpb.SupportState. Instead, we add indirection to the supportFor map instead. In an upcoming commit, we'll use this withdrawl timestamp to decide whether a store is suspect or not. Informs #158513 Epic: none Release note: None
1 parent 4c9b844 commit a0010f2

File tree

7 files changed

+174
-42
lines changed

7 files changed

+174
-42
lines changed

pkg/kv/kvserver/storeliveness/store_liveness_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,11 @@ func TestStoreLiveness(t *testing.T) {
120120
case "debug-supporter-state":
121121
var sortedSupportMap []string
122122
for _, support := range sm.supporterStateHandler.supporterState.supportFor {
123-
sortedSupportMap = append(sortedSupportMap, fmt.Sprintf("%+v", support))
123+
sortedSupportMap = append(sortedSupportMap, fmt.Sprintf(
124+
"%+v lastSupportWithdrawnTime:%s",
125+
support.SupportState,
126+
support.lastSupportWithdrawnTime,
127+
))
124128
}
125129
slices.Sort(sortedSupportMap)
126130
return fmt.Sprintf(

pkg/kv/kvserver/storeliveness/supporter_state.go

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,25 @@ import (
1717
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
1818
)
1919

20+
// supportState contains state pertaining to support provided to a single remote
21+
// store.
22+
type supportState struct {
23+
// SupportState contains all fields that must be persisted to disk.
24+
slpb.SupportState
25+
26+
// lastSupportWithdrawnTime is the timestamp at which support was last
27+
// withdrawn for the remote store.
28+
lastSupportWithdrawnTime hlc.Timestamp
29+
}
30+
2031
// supporterState stores the core data structures for providing support.
2132
type supporterState struct {
2233
// meta stores the SupporterMeta, including the max timestamp at which this
2334
// store has withdrawn support.
2435
meta slpb.SupporterMeta
25-
// supportFor stores the SupportState for each remote store for which this
36+
// supportFor stores the supportState for each remote store for which this
2637
// store has provided support.
27-
supportFor map[slpb.StoreIdent]slpb.SupportState
38+
supportFor map[slpb.StoreIdent]supportState
2839
}
2940

3041
// supporterStateHandler is the main interface for handling support for other
@@ -64,15 +75,15 @@ func newSupporterStateHandler() *supporterStateHandler {
6475
ssh := &supporterStateHandler{
6576
supporterState: supporterState{
6677
meta: slpb.SupporterMeta{},
67-
supportFor: make(map[slpb.StoreIdent]slpb.SupportState),
78+
supportFor: make(map[slpb.StoreIdent]supportState),
6879
},
6980
}
7081
ssh.update.Store(
7182
&supporterStateForUpdate{
7283
checkedIn: &ssh.supporterState,
7384
inProgress: supporterState{
7485
meta: slpb.SupporterMeta{},
75-
supportFor: make(map[slpb.StoreIdent]slpb.SupportState),
86+
supportFor: make(map[slpb.StoreIdent]supportState),
7687
},
7788
},
7889
)
@@ -101,7 +112,7 @@ type supporterStateForUpdate struct {
101112
func (ssh *supporterStateHandler) getSupportFor(id slpb.StoreIdent) slpb.SupportState {
102113
ssh.mu.RLock()
103114
defer ssh.mu.RUnlock()
104-
return ssh.supporterState.supportFor[id]
115+
return ssh.supporterState.supportFor[id].SupportState
105116
}
106117

107118
// getNumSupportFor returns the size of the supporterState.supportFor map.
@@ -116,9 +127,9 @@ func (ssh *supporterStateHandler) getNumSupportFor() int {
116127
func (ssh *supporterStateHandler) exportAllSupportFor() []slpb.SupportState {
117128
ssh.mu.RLock()
118129
defer ssh.mu.RUnlock()
119-
supportStates := make([]slpb.SupportState, len(ssh.supporterState.supportFor))
130+
supportStates := make([]slpb.SupportState, 0, len(ssh.supporterState.supportFor))
120131
for _, ss := range ssh.supporterState.supportFor {
121-
supportStates = append(supportStates, ss)
132+
supportStates = append(supportStates, ss.SupportState)
122133
}
123134
return supportStates
124135
}
@@ -146,13 +157,11 @@ func (ssfu *supporterStateForUpdate) getMeta() slpb.SupporterMeta {
146157
return ssfu.checkedIn.meta
147158
}
148159

149-
// getSupportFor returns the SupportState from the inProgress view; if not
150-
// present, it falls back to the SupportState from the checkedIn view.
160+
// getSupportFor returns the supportState from the inProgress view; if not
161+
// present, it falls back to the supportState from the checkedIn view.
151162
// The returned boolean indicates whether the store is present in the supportFor
152163
// map; it does NOT indicate whether support is provided.
153-
func (ssfu *supporterStateForUpdate) getSupportFor(
154-
storeID slpb.StoreIdent,
155-
) (slpb.SupportState, bool) {
164+
func (ssfu *supporterStateForUpdate) getSupportFor(storeID slpb.StoreIdent) (supportState, bool) {
156165
ss, ok := ssfu.inProgress.supportFor[storeID]
157166
if !ok {
158167
ss, ok = ssfu.checkedIn.supportFor[storeID]
@@ -180,7 +189,8 @@ func (ssfu *supporterStateForUpdate) write(ctx context.Context, b storage.Batch)
180189
}
181190
}
182191
for _, ss := range ssfu.inProgress.supportFor {
183-
if err := writeSupportForState(ctx, b, ss); err != nil {
192+
// Only write the persistent SupportState to disk.
193+
if err := writeSupportForState(ctx, b, ss.SupportState); err != nil {
184194
return err
185195
}
186196
}
@@ -201,9 +211,9 @@ func (ssh *supporterStateHandler) read(ctx context.Context, r storage.Reader) er
201211
ssh.mu.Lock()
202212
defer ssh.mu.Unlock()
203213
ssh.supporterState.meta = meta
204-
ssh.supporterState.supportFor = make(map[slpb.StoreIdent]slpb.SupportState, len(supportFor))
214+
ssh.supporterState.supportFor = make(map[slpb.StoreIdent]supportState, len(supportFor))
205215
for _, s := range supportFor {
206-
ssh.supporterState.supportFor[s.Target] = s
216+
ssh.supporterState.supportFor[s.Target] = supportState{SupportState: s}
207217
}
208218
return nil
209219
}
@@ -256,12 +266,15 @@ func (ssfu *supporterStateForUpdate) handleHeartbeat(
256266
from := msg.From
257267
ss, ok := ssfu.getSupportFor(from)
258268
if !ok {
259-
ss = slpb.SupportState{Target: from}
269+
ss = supportState{SupportState: slpb.SupportState{Target: from}}
260270
}
261271
ssNew := handleHeartbeat(ss, msg)
272+
assert(ss.lastSupportWithdrawnTime == ssNew.lastSupportWithdrawnTime,
273+
"lastSupportWithdrawnTime changed on successful heartbeat",
274+
)
262275
if ss != ssNew {
263276
ssfu.inProgress.supportFor[from] = ssNew
264-
logSupportForChange(ctx, ss, ssNew)
277+
logSupportForChange(ctx, ss.SupportState, ssNew.SupportState)
265278
}
266279
return slpb.Message{
267280
Type: slpb.MsgHeartbeatResp,
@@ -274,7 +287,7 @@ func (ssfu *supporterStateForUpdate) handleHeartbeat(
274287

275288
// handleHeartbeat contains the core logic for updating the epoch and expiration
276289
// of a support requester upon receiving a heartbeat.
277-
func handleHeartbeat(ss slpb.SupportState, msg *slpb.Message) slpb.SupportState {
290+
func handleHeartbeat(ss supportState, msg *slpb.Message) supportState {
278291
assert(!msg.Expiration.IsEmpty(), "requested support with zero expiration")
279292
if ss.Epoch == msg.Epoch {
280293
ss.Expiration.Forward(msg.Expiration)
@@ -315,10 +328,10 @@ func (ssfu *supporterStateForUpdate) withdrawSupport(
315328
)
316329
supportWithdrawnForStoreIDs = make(map[roachpb.StoreID]struct{})
317330
for id, ss := range ssfu.checkedIn.supportFor {
318-
ssNew := maybeWithdrawSupport(ss, now)
319-
if ss != ssNew {
331+
ssNew, withdrawn := maybeWithdrawSupport(ss, now)
332+
if withdrawn {
320333
ssfu.inProgress.supportFor[id] = ssNew
321-
log.KvExec.Infof(ctx, "withdrew support for %s", supportChangeStr(ss, ssNew))
334+
log.KvExec.Infof(ctx, "withdrew support for %s", supportChangeStr(ss.SupportState, ssNew.SupportState))
322335
meta := ssfu.getMeta()
323336
if meta.MaxWithdrawn.Forward(now) {
324337
ssfu.inProgress.meta = meta
@@ -330,11 +343,15 @@ func (ssfu *supporterStateForUpdate) withdrawSupport(
330343
}
331344

332345
// maybeWithdrawSupport contains the core logic for updating the epoch and
333-
// expiration of a support requester when withdrawing support.
334-
func maybeWithdrawSupport(ss slpb.SupportState, now hlc.ClockTimestamp) slpb.SupportState {
346+
// expiration of a support requester when withdrawing support. If support is
347+
// withdrawn, lastSupportWithdrawnTime is updated to the current timestamp and
348+
// a boolean indicating as such is returned.
349+
func maybeWithdrawSupport(ss supportState, now hlc.ClockTimestamp) (supportState, bool) {
335350
if !ss.Expiration.IsEmpty() && ss.Expiration.LessEq(now.ToTimestamp()) {
336351
ss.Epoch++
337352
ss.Expiration = hlc.Timestamp{}
353+
ss.lastSupportWithdrawnTime = now.ToTimestamp()
354+
return ss, true
338355
}
339-
return ss
356+
return ss, false
340357
}

pkg/kv/kvserver/storeliveness/testdata/basic

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ debug-supporter-state
4949
meta:
5050
{MaxWithdrawn:201.000000000,0}
5151
support for:
52-
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:0,0}
52+
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:0,0} lastSupportWithdrawnTime:201.000000000,0
5353

5454
debug-metrics
5555
----
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# -------------------------------------------------------------
2+
# This test verifies the behavior of lastSupportWithdrawnTime.
3+
# Specifically:
4+
# 1. It is set when support is first withdrawn.
5+
# 2. It is NOT updated on subsequent withdrawal attempts.
6+
# 3. It is preserved when handling successful heartbeats.
7+
# -------------------------------------------------------------
8+
9+
# -------------------------------------------------------------
10+
# Store (n1, s1) provides support to (n2, s2).
11+
# -------------------------------------------------------------
12+
13+
handle-messages
14+
msg type=MsgHeartbeat from-node-id=2 from-store-id=2 epoch=1 expiration=100
15+
----
16+
responses:
17+
{Type:MsgHeartbeatResp From:{NodeID:1 StoreID:1} To:{NodeID:2 StoreID:2} Epoch:1 Expiration:100.000000000,0}
18+
19+
debug-supporter-state
20+
----
21+
meta:
22+
{MaxWithdrawn:0,0}
23+
support for:
24+
{Target:{NodeID:2 StoreID:2} Epoch:1 Expiration:100.000000000,0} lastSupportWithdrawnTime:0,0
25+
26+
27+
# -------------------------------------------------------------
28+
# Store (n1, s1) withdraws support at time 200.
29+
# lastSupportWithdrawnTime should be set to 200.
30+
# -------------------------------------------------------------
31+
32+
withdraw-support now=200
33+
----
34+
35+
debug-supporter-state
36+
----
37+
meta:
38+
{MaxWithdrawn:200.000000000,0}
39+
support for:
40+
{Target:{NodeID:2 StoreID:2} Epoch:2 Expiration:0,0} lastSupportWithdrawnTime:200.000000000,0
41+
42+
43+
# -------------------------------------------------------------
44+
# Store (n1, s1) attempts to withdraw support again at time 300.
45+
# lastSupportWithdrawnTime should NOT be updated.
46+
# -------------------------------------------------------------
47+
48+
withdraw-support now=300
49+
----
50+
51+
debug-supporter-state
52+
----
53+
meta:
54+
{MaxWithdrawn:200.000000000,0}
55+
support for:
56+
{Target:{NodeID:2 StoreID:2} Epoch:2 Expiration:0,0} lastSupportWithdrawnTime:200.000000000,0
57+
58+
59+
# -------------------------------------------------------------
60+
# Store (n2, s2) re-establishes support via heartbeat.
61+
# lastSupportWithdrawnTime should be preserved.
62+
# -------------------------------------------------------------
63+
64+
handle-messages
65+
msg type=MsgHeartbeat from-node-id=2 from-store-id=2 epoch=2 expiration=400
66+
----
67+
responses:
68+
{Type:MsgHeartbeatResp From:{NodeID:1 StoreID:1} To:{NodeID:2 StoreID:2} Epoch:2 Expiration:400.000000000,0}
69+
70+
debug-supporter-state
71+
----
72+
meta:
73+
{MaxWithdrawn:200.000000000,0}
74+
support for:
75+
{Target:{NodeID:2 StoreID:2} Epoch:2 Expiration:400.000000000,0} lastSupportWithdrawnTime:200.000000000,0
76+
77+
78+
# -------------------------------------------------------------
79+
# Store (n2, s2) extends support with a new epoch.
80+
# lastSupportWithdrawnTime should still be preserved.
81+
# -------------------------------------------------------------
82+
83+
handle-messages
84+
msg type=MsgHeartbeat from-node-id=2 from-store-id=2 epoch=3 expiration=500
85+
----
86+
responses:
87+
{Type:MsgHeartbeatResp From:{NodeID:1 StoreID:1} To:{NodeID:2 StoreID:2} Epoch:3 Expiration:500.000000000,0}
88+
89+
debug-supporter-state
90+
----
91+
meta:
92+
{MaxWithdrawn:200.000000000,0}
93+
support for:
94+
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:500.000000000,0} lastSupportWithdrawnTime:200.000000000,0
95+
96+
97+
# -------------------------------------------------------------
98+
# Store (n1, s1) withdraws support again at time 600.
99+
# lastSupportWithdrawnTime should now be updated to 600.
100+
# -------------------------------------------------------------
101+
102+
withdraw-support now=600
103+
----
104+
105+
debug-supporter-state
106+
----
107+
meta:
108+
{MaxWithdrawn:600.000000000,0}
109+
support for:
110+
{Target:{NodeID:2 StoreID:2} Epoch:4 Expiration:0,0} lastSupportWithdrawnTime:600.000000000,0
111+

pkg/kv/kvserver/storeliveness/testdata/multi-store

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ debug-supporter-state
5151
meta:
5252
{MaxWithdrawn:0,0}
5353
support for:
54-
{Target:{NodeID:1 StoreID:2} Epoch:2 Expiration:102.000000000,0}
55-
{Target:{NodeID:2 StoreID:3} Epoch:3 Expiration:103.000000000,0}
56-
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0}
54+
{Target:{NodeID:1 StoreID:2} Epoch:2 Expiration:102.000000000,0} lastSupportWithdrawnTime:0,0
55+
{Target:{NodeID:2 StoreID:3} Epoch:3 Expiration:103.000000000,0} lastSupportWithdrawnTime:0,0
56+
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0} lastSupportWithdrawnTime:0,0
5757

5858
withdraw-support now=103
5959
----
@@ -72,9 +72,9 @@ debug-supporter-state
7272
meta:
7373
{MaxWithdrawn:103.000000000,0}
7474
support for:
75-
{Target:{NodeID:1 StoreID:2} Epoch:3 Expiration:0,0}
76-
{Target:{NodeID:2 StoreID:3} Epoch:4 Expiration:0,0}
77-
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0}
75+
{Target:{NodeID:1 StoreID:2} Epoch:3 Expiration:0,0} lastSupportWithdrawnTime:103.000000000,0
76+
{Target:{NodeID:2 StoreID:3} Epoch:4 Expiration:0,0} lastSupportWithdrawnTime:103.000000000,0
77+
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0} lastSupportWithdrawnTime:0,0
7878

7979

8080
# -------------------------------------------------------------
@@ -109,9 +109,9 @@ debug-supporter-state
109109
meta:
110110
{MaxWithdrawn:103.000000000,0}
111111
support for:
112-
{Target:{NodeID:1 StoreID:2} Epoch:3 Expiration:0,0}
113-
{Target:{NodeID:2 StoreID:3} Epoch:4 Expiration:0,0}
114-
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0}
112+
{Target:{NodeID:1 StoreID:2} Epoch:3 Expiration:0,0} lastSupportWithdrawnTime:103.000000000,0
113+
{Target:{NodeID:2 StoreID:3} Epoch:4 Expiration:0,0} lastSupportWithdrawnTime:103.000000000,0
114+
{Target:{NodeID:2 StoreID:4} Epoch:4 Expiration:104.000000000,0} lastSupportWithdrawnTime:0,0
115115

116116
send-heartbeats now=200 support-duration=10s
117117
----

pkg/kv/kvserver/storeliveness/testdata/restart

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ debug-supporter-state
4646
meta:
4747
{MaxWithdrawn:201.000000000,0}
4848
support for:
49-
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0}
49+
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0} lastSupportWithdrawnTime:201.000000000,0
5050

5151
# -------------------------------------------------------------
5252
# Store (n1, s1) restarts.
@@ -71,7 +71,7 @@ debug-supporter-state
7171
meta:
7272
{MaxWithdrawn:201.000000000,0}
7373
support for:
74-
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0}
74+
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0} lastSupportWithdrawnTime:0,0
7575

7676
debug-metrics
7777
----
@@ -109,7 +109,7 @@ debug-supporter-state
109109
meta:
110110
{MaxWithdrawn:201.000000000,0}
111111
support for:
112-
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0}
112+
{Target:{NodeID:2 StoreID:2} Epoch:3 Expiration:300.000000000,0} lastSupportWithdrawnTime:0,0
113113

114114
# -------------------------------------------------------------
115115
# Store (n1, s1) withdraws support after the grace period.
@@ -123,7 +123,7 @@ debug-supporter-state
123123
meta:
124124
{MaxWithdrawn:311.000000000,0}
125125
support for:
126-
{Target:{NodeID:2 StoreID:2} Epoch:4 Expiration:0,0}
126+
{Target:{NodeID:2 StoreID:2} Epoch:4 Expiration:0,0} lastSupportWithdrawnTime:311.000000000,0
127127

128128
# -------------------------------------------------------------
129129
# Store (n1, s1) sends heartbeats but it forgot about support

0 commit comments

Comments
 (0)