
Commit d85842e

craig[bot] and tbg committed
Merge #158373
158373: mmaprototype: filter ineligible lease targets before computing means r=wenyihu6 a=tbg

Add health and store-level disposition checks to lease target filtering, and move all eligibility filtering (including the existing per-replica disposition check) to happen before computing load means.

Computing means only over eligible targets is semantically correct: when asking "is store X above or below average?", we mean among stores that could actually receive the lease. Including ineligible stores (dead, unhealthy, refusing) distorts the average. Example: with eligible targets at 30/40/50 QPS and a dead store at 0 QPS:

- Including dead store: mean=30, so 50 QPS looks "significantly above average"
- Excluding dead store: mean=40, so 50 QPS is "slightly above average"

The latter correctly reflects reality among viable candidates.

Add a datadriven directive and testdata file to exercise the new retainReadyLeaseTargetStoresOnly function, covering health, store-level disposition, and per-replica disposition filtering. Also extend the test DSL to support lease-disposition on replica lines and allow set-store-status to set individual status fields without triggering validation assertions.

- [ ] TODO: dig up the pre/post-means-ness of the SMA and highlight differences to reviewers.

Part of #156776.

Epic: CRDB-55052

Co-authored-by: Tobias Grieger <[email protected]>
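To make the arithmetic behind the example concrete, here is a minimal standalone Go sketch (illustrative only, not code from this PR; the meanQPS helper and the literal QPS values are hypothetical):

package main

import "fmt"

// meanQPS returns the average QPS across the given stores.
func meanQPS(qps []float64) float64 {
	var sum float64
	for _, v := range qps {
		sum += v
	}
	return sum / float64(len(qps))
}

func main() {
	eligible := []float64{30, 40, 50}             // viable lease targets
	withDead := append([]float64{0}, eligible...) // plus a dead store at 0 QPS

	fmt.Printf("mean including dead store: %.0f\n", meanQPS(withDead)) // 30: 50 QPS looks far above average
	fmt.Printf("mean excluding dead store: %.0f\n", meanQPS(eligible)) // 40: 50 QPS is only slightly above
}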
2 parents ac9c647 + 29ebce2 commit d85842e

File tree

5 files changed (+318, -16 lines changed)


pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go

Lines changed: 72 additions & 6 deletions
@@ -112,6 +112,9 @@ func newRebalanceEnv(
 
 type sheddingStore struct {
 	roachpb.StoreID
+	// storeLoadSummary is relative to the entire cluster (not a set of valid
+	// replacement stores for some particular replica), see the comment where
+	// sheddingStores are constructed.
 	storeLoadSummary
 }
 
@@ -242,6 +245,49 @@ func (re *rebalanceEnv) rebalanceStores(
 	return re.changes
 }
 
+// rebalanceStore attempts to shed load from a single overloaded store via lease
+// transfers and/or replica rebalances.
+//
+// # Candidate Filtering Strategy
+//
+// When selecting rebalance targets, we filter candidates in two phases. The key
+// consideration is which stores should be included when computing load means,
+// since the means determine whether a store looks "underloaded" (good target)
+// or "overloaded" (bad target).
+//
+// **Pre-means filtering** excludes stores with a non-OK disposition from the
+// mean. The disposition serves as the source of truth for "can this store
+// accept work?" — whether due to drain, maintenance, disk capacity, or any
+// other reason. Note that unhealthy stores always have non-OK disposition, so
+// disposition indirectly reflects health.
+//
+// Ill-disposed stores cannot participate as targets and it is correct to
+// exclude them from the mean, as the mean helps us answer the question "among
+// viable targets, who is underloaded?". If ill-disposed stores were included in
+// the mean, they could skew it down, making viable targets look too overloaded
+// to be considered, and preventing resolution of a load imbalance. They could
+// also skew the mean up and make actually-overloaded targets look underloaded,
+// but this is less of an issue: since both source and target are evaluated
+// relative to the same mean before accepting a transfer, their relative
+// positions are preserved, and we wouldn't accept a transfer that leaves the
+// target worse off than the source.
+//
+// **Post-means filtering** prevents some of the remaining candidates from being
+// chosen as targets for tactical reasons:
+//   - Stores that would be worse off than the source (relative to the mean
+//     across the candidate set), to reduce thrashing. For example, if a store
+//     were overloaded due to a single very hot range, moving that range to
+//     another store would just shift the problem elsewhere and cause thrashing.
+//   - Stores with too much pending inflight work (reduced confidence in load arithmetic).
+//
+// NB: TestClusterState tests various scenarios and edge cases that demonstrate
+// filtering behavior and outcomes.
+//
+// TODO(tbg): The above describes the intended design. Lease transfers follow this
+// pattern (see retainReadyLeaseTargetStoresOnly). Replica transfers are still
+// being cleaned up: they currently compute means over all constraint-matching
+// stores and filter only afterward, which is not intentional.
+// Tracking issue: https://github.com/cockroachdb/cockroach/pull/158373
 func (re *rebalanceEnv) rebalanceStore(
 	ctx context.Context, store sheddingStore, localStoreID roachpb.StoreID,
 ) {
@@ -595,6 +641,9 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID(
 		if len(candsPL) <= 1 {
 			continue // leaseholder is the only candidate
 		}
+
+		candsPL = retainReadyLeaseTargetStoresOnly(ctx, candsPL, re.stores, rangeID)
+
 		clear(re.scratch.nodes)
 		means := computeMeansForStoreSet(re, candsPL, re.scratch.nodes, re.scratch.stores)
 		sls := re.computeLoadSummary(ctx, store.StoreID, &means.storeLoad, &means.nodeLoad)
@@ -607,12 +656,6 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID(
 		}
 		var candsSet candidateSet
 		for _, cand := range cands {
-			if disp := re.stores[cand.storeID].adjusted.replicas[rangeID].LeaseDisposition; disp != LeaseDispositionOK {
-				// Don't transfer lease to a store that is lagging.
-				log.KvDistribution.Infof(ctx, "skipping store s%d for lease transfer: lease disposition %v",
-					cand.storeID, disp)
-				continue
-			}
 			candSls := re.computeLoadSummary(ctx, cand.storeID, &means.storeLoad, &means.nodeLoad)
 			candsSet.candidates = append(candsSet.candidates, candidateInfo{
 				StoreID: cand.storeID,
@@ -685,3 +728,26 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID(
 	// We iterated through all top-K ranges without running into any limits.
 	return leaseTransferCount
 }
+
+// retainReadyLeaseTargetStoresOnly filters the input set to only those stores that
+// are ready to accept a lease for the given range. A store is not ready if it
+// is not healthy, or does not accept leases at either the store or replica level.
+//
+// The input storeSet is mutated (and used for the returned result).
+func retainReadyLeaseTargetStoresOnly(
+	ctx context.Context, in storeSet, stores map[roachpb.StoreID]*storeState, rangeID roachpb.RangeID,
+) storeSet {
+	out := in[:0]
+	for _, storeID := range in {
+		s := stores[storeID].status
+		switch {
+		case s.Disposition.Lease != LeaseDispositionOK:
+			log.KvDistribution.VEventf(ctx, 2, "skipping s%d for lease transfer: lease disposition %v (health %v)", storeID, s.Disposition.Lease, s.Health)
+		case stores[storeID].adjusted.replicas[rangeID].LeaseDisposition != LeaseDispositionOK:
+			log.KvDistribution.VEventf(ctx, 2, "skipping s%d for lease transfer: replica lease disposition %v (health %v)", storeID, stores[storeID].adjusted.replicas[rangeID].LeaseDisposition, s.Health)
+		default:
+			out = append(out, storeID)
+		}
+	}
+	return out
+}
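A side note on the implementation above: retainReadyLeaseTargetStoresOnly filters in place by reusing the input slice's backing array. The following is a standalone sketch of that idiom (the retainEligible helper and its predicate are hypothetical, not part of this PR), shown because the aliasing means the caller must not read the input slice afterwards:

package main

import "fmt"

// retainEligible keeps only the IDs for which eligible returns true. Like
// retainReadyLeaseTargetStoresOnly, it writes into the input slice's backing
// array (out := in[:0]), so the input is clobbered and the returned slice
// aliases it.
func retainEligible(in []int, eligible func(int) bool) []int {
	out := in[:0]
	for _, id := range in {
		if eligible(id) {
			out = append(out, id)
		}
	}
	return out
}

func main() {
	cands := []int{1, 2, 3}
	// Hypothetical predicate: pretend store 2 is shedding leases, so it is
	// dropped before load means are computed over the remaining candidates.
	ready := retainEligible(cands, func(id int) bool { return id != 2 })
	fmt.Println(ready) // [1 3]
}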

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 35 additions & 9 deletions
@@ -74,8 +74,7 @@ func parseSecondaryLoadVector(t *testing.T, in string) SecondaryLoadVector {
 	return vec
 }
 
-func parseStatusFromArgs(t *testing.T, d *datadriven.TestData) Status {
-	var status Status
+func parseStatusFromArgs(t *testing.T, d *datadriven.TestData, status *Status) {
 	if d.HasArg("health") {
 		healthStr := dd.ScanArg[string](t, d, "health")
 		found := false
@@ -118,7 +117,6 @@ func parseStatusFromArgs(t *testing.T, d *datadriven.TestData) Status {
 			t.Fatalf("unknown replica disposition: %s", replicaStr)
 		}
 	}
-	return status
 }
 
 func parseStoreLoadMsg(t *testing.T, in string) StoreLoadMsg {
@@ -215,6 +213,18 @@ func parseStoreLeaseholderMsg(t *testing.T, in string) StoreLeaseholderMsg {
 				replType, err := parseReplicaType(parts[1])
 				require.NoError(t, err)
 				repl.ReplicaType.ReplicaType = replType
+			case "lease-disposition":
+				found := false
+				for i := LeaseDisposition(0); i < leaseDispositionCount; i++ {
+					if i.String() == parts[1] {
+						repl.LeaseDisposition = i
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Fatalf("unknown lease disposition: %s", parts[1])
+				}
 			default:
 				t.Fatalf("unknown argument: %s", parts[0])
 			}
@@ -345,6 +355,9 @@ func TestClusterState(t *testing.T) {
 		func(t *testing.T, path string) {
 			ts := timeutil.NewManualTime(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
 			cs := newClusterState(ts, newStringInterner())
+			tr := tracing.NewTracer()
+			tr.SetRedactable(true)
+			defer tr.Close()
 
 			printNodeListMeta := func(t *testing.T) string {
 				nodeList := []int{}
@@ -401,6 +414,11 @@ func TestClusterState(t *testing.T) {
 			// Recursively invoked in `include` directive.
 			var invokeFn func(t *testing.T, d *datadriven.TestData) string
 			invokeFn = func(t *testing.T, d *datadriven.TestData) string {
+				// Start a recording span for each command. Commands that want to
+				// include the trace in their output can call finishAndGet().
+				ctx, finishAndGet := tracing.ContextWithRecordingSpan(
+					context.Background(), tr, d.Cmd,
+				)
 				switch d.Cmd {
 				case "include":
 					loc := dd.ScanArg[string](t, d, "path")
@@ -477,8 +495,11 @@ func TestClusterState(t *testing.T) {
 					if !ok {
 						t.Fatalf("store %d not found", storeID)
 					}
-					status := parseStatusFromArgs(t, d)
-					ss.status = MakeStatus(status.Health, status.Disposition.Lease, status.Disposition.Replica)
+					// NB: we intentionally bypass the assertion in MakeStatus
+					// here so that we can test all combinations of health,
+					// lease, and replica dispositions, even those that we never
+					// want to see in production.
+					parseStatusFromArgs(t, d, &ss.status)
 					return ss.status.String()
 
 				case "store-load-msg":
@@ -576,10 +597,6 @@ func TestClusterState(t *testing.T) {
 					storeID := dd.ScanArg[roachpb.StoreID](t, d, "store-id")
 					rng := rand.New(rand.NewSource(0))
 					dsm := newDiversityScoringMemo()
-					tr := tracing.NewTracer()
-					tr.SetRedactable(true)
-					defer tr.Close()
-					ctx, finishAndGet := tracing.ContextWithRecordingSpan(context.Background(), tr, "rebalance-stores")
 					re := newRebalanceEnv(cs, rng, dsm, cs.ts.Now())
 
 					if n, ok := dd.ScanArgOpt[int](t, d, "max-lease-transfer-count"); ok {
@@ -603,6 +620,15 @@ func TestClusterState(t *testing.T) {
 					ts.Advance(time.Second * time.Duration(seconds))
 					return fmt.Sprintf("t=%v", ts.Now().Sub(testingBaseTime))
 
+				case "retain-ready-lease-target-stores-only":
+					in := dd.ScanArg[[]roachpb.StoreID](t, d, "in")
+					rangeID := dd.ScanArg[roachpb.RangeID](t, d, "range-id")
+					out := retainReadyLeaseTargetStoresOnly(ctx, storeSet(in), cs.stores, rangeID)
+					rec := finishAndGet()
+					var sb redact.StringBuilder
+					rec.SafeFormatMinimal(&sb)
+					return fmt.Sprintf("%s%v\n", sb.String(), out)
+
 				default:
 					panic(fmt.Sprintf("unknown command: %v", d.Cmd))
 				}
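The lease-disposition parsing added above relies on the enum's String() form: the DSL scans every LeaseDisposition value and matches the user-supplied token. A self-contained sketch of that pattern follows (the Direction type is hypothetical and stands in for LeaseDisposition; it is not part of this PR):

package main

import (
	"fmt"
	"strings"
)

// Direction is a stand-in enum to illustrate the parsing pattern: iterate all
// enum values and match on their String() representation, as the test DSL does
// for LeaseDisposition(0) .. leaseDispositionCount.
type Direction int

const (
	North Direction = iota
	South
	directionCount
)

func (d Direction) String() string {
	return [...]string{"north", "south"}[d]
}

// parseDirection mirrors the DSL's approach: a linear scan over the enum's
// String() forms, returning an error on unknown input.
func parseDirection(s string) (Direction, error) {
	for d := Direction(0); d < directionCount; d++ {
		if d.String() == strings.ToLower(s) {
			return d, nil
		}
	}
	return 0, fmt.Errorf("unknown direction: %s", s)
}

func main() {
	d, err := parseDirection("south")
	fmt.Println(d, err) // south <nil>
}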
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
+# Test retainReadyLeaseTargetStoresOnly which filters stores based on:
+# 1. Store-level lease disposition (must be LeaseDispositionOK)
+# 2. Per-replica lease disposition (must be LeaseDispositionOK)
+#
+# Note: Health is handled indirectly via disposition. Unhealthy stores always
+# have non-OK disposition, so we don't need a separate health check.
+
+set-store
+store-id=1 node-id=1 locality-tiers=region=us
+store-id=2 node-id=2 locality-tiers=region=us
+store-id=3 node-id=3 locality-tiers=region=us
+----
+node-id=1 locality-tiers=region=us,node=1
+store-id=1 attrs=
+node-id=2 locality-tiers=region=us,node=2
+store-id=2 attrs=
+node-id=3 locality-tiers=region=us,node=3
+store-id=3 attrs=
+
+# Set up some stores. The specifics don't matter since we're only testing
+# the lease health checks.
+store-load-msg
+store-id=1 node-id=1 load=[100,0,0] capacity=[200,100,100] load-time=0s
+store-id=2 node-id=2 load=[50,0,0] capacity=[200,100,100] load-time=0s
+store-id=3 node-id=3 load=[50,0,0] capacity=[200,100,100] load-time=0s
+----
+
+store-leaseholder-msg
+store-id=1
+range-id=1 load=[10,0,0]
+config=(num_replicas=3)
+store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+store-id=2 replica-id=2 type=VOTER_FULL leaseholder=false
+store-id=3 replica-id=3 type=VOTER_FULL leaseholder=false
+----
+
+# All stores healthy and accepting leases - all retained.
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+[1 2 3]
+
+# Only input stores matter.
+retain-ready-lease-target-stores-only in=(1,3) range-id=1
+----
+[1 3]
+
+# Mark s2 as shedding replicas, which should have no effect
+# since we're looking at leases.
+set-store-status store-id=2 replicas=shedding
+----
+ok shedding=replicas
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+[1 2 3]
+
+# Make store 2 unhealthy - it should be filtered out via disposition.
+# Unhealthy stores must have non-OK disposition (invariant).
+set-store-status store-id=2 health=unhealthy leases=shedding replicas=shedding
+----
+unhealthy shedding=leases,replicas
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+skipping s2 for lease transfer: lease disposition shedding (health unhealthy)
+[1 3]
+
+# Different kind of unhealthy. Same result.
+set-store-status store-id=2 health=unknown leases=shedding replicas=shedding
+----
+unknown shedding=leases,replicas
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+skipping s2 for lease transfer: lease disposition shedding (health unknown)
+[1 3]
+
+# Restore store 2, make store 3 refuse leases at store level.
+set-store-status store-id=2 health=ok leases=ok replicas=ok
+----
+ok accepting all
+
+set-store-status store-id=3 leases=refusing
+----
+ok refusing=leases
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+skipping s3 for lease transfer: lease disposition refusing (health ok)
+[1 2]
+
+# Shedding and refusing are treated the same.
+set-store-status store-id=3 health=ok leases=shedding
+----
+ok shedding=leases
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+skipping s3 for lease transfer: lease disposition shedding (health ok)
+[1 2]
+
+# Restore store 3.
+set-store-status store-id=3 leases=ok
+----
+ok accepting all
+
+# All stores ready again.
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+[1 2 3]
+
+# Mark r1 as not accepting leases targeting s2 via per-replica disposition.
+store-leaseholder-msg
+store-id=1
+range-id=1 load=[10,0,0]
+config=(num_replicas=3)
+store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+store-id=2 replica-id=2 type=VOTER_FULL leaseholder=false lease-disposition=refusing
+store-id=3 replica-id=3 type=VOTER_FULL leaseholder=false
+----
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+skipping s2 for lease transfer: replica lease disposition refusing (health ok)
+[1 3]
+
+# Restore s2's replica disposition.
+store-leaseholder-msg
+store-id=1
+range-id=1 load=[10,0,0]
+config=(num_replicas=3)
+store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+store-id=2 replica-id=2 type=VOTER_FULL leaseholder=false lease-disposition=ok
+store-id=3 replica-id=3 type=VOTER_FULL leaseholder=false
+----
+
+retain-ready-lease-target-stores-only in=(1,2,3) range-id=1
+----
+[1 2 3]
