Commit a7621f7
roachtest: wait for disk space reclamation in clearrange

In the clearrange roachtests, adapt the completion condition to wait for
reclamation of disk space by waiting until the approximate disk size of the
key range for the dropped table falls to <1% of its original. Additionally,
slightly relax the range count condition to reduce the test time. The long
tail of reducing from a dozen ranges to just 1 can take significant wall time
and is not important to exercise.

Epic: none
Release note: none
1 parent 19b1cfd commit a7621f7
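
For reference, the arithmetic behind the new wait condition reduces to the following minimal sketch. This is a standalone illustration, not the test code itself: the spanStats fields mirror the two fields the test compares in the diff below, the sample numbers are made up, and the 0.975 threshold is the one used by the commit.

```go
package main

import "fmt"

// spanStats mirrors the two fields the test compares against its baseline.
type spanStats struct {
	rangeCount           int
	approximateDiskBytes int64
}

// progress returns 1 - (current/initial) for both range count and approximate
// disk bytes, clamped to [0, 1], gated on the slower of the two metrics. The
// range counts subtract 1 because one range always remains after the drop.
func progress(initial, cur spanStats) float64 {
	mergeProgress := 1 - float64(cur.rangeCount-1)/float64(initial.rangeCount-1)
	diskProgress := 1 - float64(cur.approximateDiskBytes)/float64(initial.approximateDiskBytes)
	return max(0, min(mergeProgress, diskProgress)) // Go 1.21+ builtins
}

func main() {
	initial := spanStats{rangeCount: 1200, approximateDiskBytes: 500 << 30} // hypothetical pre-drop stats
	cur := spanStats{rangeCount: 25, approximateDiskBytes: 10 << 30}        // hypothetical mid-wait sample
	p := progress(initial, cur)
	fmt.Printf("progress %.3f, done: %t\n", p, p >= 0.975) // the test's loop exits at >= 0.975
}
```

Gating on the minimum of the two fractions means the test now waits for both the range merges and the disk-space reclamation, rather than only the range count as before.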

File tree

1 file changed (+99, -44 lines)

pkg/cmd/roachtest/tests/clearrange.go

Lines changed: 99 additions & 44 deletions
@@ -15,8 +15,11 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"github.com/stretchr/testify/require"
 )

 func registerClearRange(r registry.Registry) {
@@ -82,32 +85,18 @@ func runClearRange(ctx context.Context, t test.Test, c cluster.Cluster, aggressi

 	t.Status()

-	// Set up a convenience function that we can call to learn the number of
-	// ranges for the bigbank.bank table (even after it's been dropped).
-	numBankRanges := func() func() int {
-		conn := c.Conn(ctx, t.L(), 1)
-		defer conn.Close()
-
-		var startHex string
-		if err := conn.QueryRow(
-			`SELECT to_hex(raw_start_key)
-FROM [SHOW RANGES FROM TABLE bigbank.bank WITH KEYS]
-ORDER BY raw_start_key ASC LIMIT 1`,
-		).Scan(&startHex); err != nil {
-			t.Fatal(err)
-		}
-		return func() int {
-			conn := c.Conn(ctx, t.L(), 1)
-			defer conn.Close()
-			var n int
-			if err := conn.QueryRow(
-				`SELECT count(*) FROM crdb_internal.ranges_no_leases WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex,
-			).Scan(&n); err != nil {
-				t.Fatal(err)
-			}
-			return n
-		}
-	}()
+	bigBankSpan, err := getKeyspanForTable(ctx, t, c, 1, "bigbank.bank")
+	require.NoError(t, err)
+	t.L().Printf("bigbank DB ID: %s (%x - %x)", bigBankSpan, bigBankSpan.Key, bigBankSpan.EndKey)
+	getBigBankStats := func() spanStats {
+		stats := getSpanStats(ctx, t, c, 1, bigBankSpan)
+		t.L().Printf("bigbank: %d ranges, %s disk, %s live, %s total",
+			stats.rangeCount,
+			humanizeutil.IBytes(stats.approximateDiskBytes),
+			humanizeutil.IBytes(stats.liveBytes),
+			humanizeutil.IBytes(stats.totalBytes))
+		return stats
+	}

 	m.Go(func(ctx context.Context) error {
 		c.Run(ctx, option.WithNodes(c.Node(1)), `./cockroach workload init kv {pgurl:1}`)
@@ -127,6 +116,10 @@ ORDER BY raw_start_key ASC LIMIT 1`,
 			return err
 		}

+		// Collect the stats before dropping the table. getBigBanksStats will
+		// print them out.
+		_ = getBigBankStats()
+
 		t.WorkerStatus("dropping table")
 		defer t.WorkerStatus()

@@ -136,26 +129,39 @@ ORDER BY raw_start_key ASC LIMIT 1`,
 			return err
 		}

-		t.WorkerStatus("computing number of ranges")
-		initialBankRanges := numBankRanges()
+		t.WorkerStatus("computing span stats")
+		preDropBankStats := getBigBankStats()

 		t.WorkerStatus("dropping bank table")
 		if _, err := conn.ExecContext(ctx, `DROP TABLE bigbank.bank`); err != nil {
 			return err
 		}

-		// Spend some time reading data with a timeout to make sure the
-		// DROP above didn't brick the cluster. At the time of writing,
-		// clearing all of the table data takes ~6min, so we want to run
-		// for at least a multiple of that duration.
-		const minDuration = 45 * time.Minute
-		deadline := timeutil.Now().Add(minDuration)
-		curBankRanges := numBankRanges()
-		t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)")
-		for timeutil.Now().Before(deadline) || curBankRanges > 1 {
-			after := time.After(5 * time.Minute)
-			curBankRanges = numBankRanges() // this call takes minutes, unfortunately
-			t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges))
+		curBankStats := getBigBankStats()
+		progressFn := func() float64 {
+			// Compute progress as a float [0, 1.0].
+			//
+			// We compute the progress in terms of the number of ranges and the
+			// amount of disk space, relative to the stats we computed
+			// immediately after dropping the table. That is:
+			//
+			//   1 - (current / initial)
+			//
+			// The range count progress subtracts 1 from each count to account
+			// for the expectation that one range will always remain.
+			//
+			// We compute the overall progress as the minimum of the two metrics'
+			// progress.
+			mergeProgress := 1 - (float64(curBankStats.rangeCount-1) /
+				float64(preDropBankStats.rangeCount-1))
+			diskProgress := 1 - (float64(curBankStats.approximateDiskBytes) /
+				float64(preDropBankStats.approximateDiskBytes))
+			return max(0, min(mergeProgress, diskProgress))
+		}
+		// Terminate when progress is 0.975 or greater. That is, we've reclaimed
+		// 97.5% of the disk space and merged 97.5% of the ranges.
+		for progress := progressFn(); progress < 0.975; progress = progressFn() {
+			t.WorkerProgress(progress)

 			var count int
 			// NB: context cancellation in QueryRowContext does not work as expected.
@@ -168,16 +174,65 @@ ORDER BY raw_start_key ASC LIMIT 1`,
 				return err
 			}

-			t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", timeutil.Until(deadline), " to pass)")
+			t.WorkerStatus("progress ", progress, " (", curBankStats.rangeCount, " ranges, ",
+				humanizeutil.IBytes(curBankStats.approximateDiskBytes), " disk usage)")
 			select {
-			case <-after:
+			case <-time.After(time.Minute):
 			case <-ctx.Done():
 				return ctx.Err()
 			}
+			curBankStats = getBigBankStats()
 		}
-		// TODO(tschottdorf): verify that disk space usage drops below to <some small amount>, but that
-		// may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290).
+		t.WorkerStatus("reclamation condition met")
 		return nil
 	})
 	m.Wait()
 }
+
+func getKeyspanForTable(
+	ctx context.Context, t test.Test, c cluster.Cluster, n int, tbl string,
+) (roachpb.Span, error) {
+	conn := c.Conn(ctx, t.L(), n)
+	defer conn.Close()
+	var startKey, endKey roachpb.Key
+	err := conn.QueryRow(`SELECT `+
+		`crdb_internal.table_span($1::regclass::oid::int)[1] AS start_key, `+
+		`crdb_internal.table_span($1::regclass::oid::int)[2] AS end_key`,
+		tbl).Scan(&startKey, &endKey)
+	return roachpb.Span{
+		Key:    startKey,
+		EndKey: endKey,
+	}, err
+}
+
+type spanStats struct {
+	rangeCount           int
+	approximateDiskBytes int64
+	liveBytes            int64
+	totalBytes           int64
+}
+
+func getSpanStats(
+	ctx context.Context, t test.Test, c cluster.Cluster, n int, span roachpb.Span,
+) spanStats {
+	conn := c.Conn(ctx, t.L(), n)
+	defer conn.Close()
+
+	var stats spanStats
+	err := conn.QueryRow(
+		`SELECT `+
+			`(stats->'range_count')::int AS range_count, `+
+			`(stats->'approximate_disk_bytes')::int AS approximate_disk_bytes, `+
+			`(stats->'approximate_total_stats'->'live_bytes')::int AS live_bytes, `+
+			`(stats->'approximate_total_stats'->'key_bytes')::int + (stats->'approximate_total_stats'->'val_bytes')::int AS total_bytes `+
+			`FROM crdb_internal.tenant_span_stats(ARRAY(SELECT($1::bytes, $2::bytes)))`,
+		span.Key, span.EndKey).
+		Scan(
+			&stats.rangeCount,
+			&stats.approximateDiskBytes,
+			&stats.liveBytes,
+			&stats.totalBytes,
+		)
+	require.NoError(t, err)
+	return stats
+}
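
The new helpers lean entirely on SQL that can also be exercised outside the roachtest harness. Below is a rough sketch of the same two queries issued from a plain Go client; the connection string, the lib/pq driver, and the assumption of a locally running single-node cluster with an admin user are all illustrative and not part of the commit.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // assumption: any Postgres-wire driver would do
)

func main() {
	// Assumed connection string; adjust for your cluster.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Resolve the table's keyspan, mirroring getKeyspanForTable above.
	var startKey, endKey []byte
	if err := db.QueryRow(
		`SELECT crdb_internal.table_span($1::regclass::oid::int)[1],
		        crdb_internal.table_span($1::regclass::oid::int)[2]`,
		"bigbank.bank",
	).Scan(&startKey, &endKey); err != nil {
		log.Fatal(err)
	}

	// Fetch range count and approximate disk usage for that span, mirroring
	// getSpanStats above.
	var rangeCount int
	var diskBytes int64
	if err := db.QueryRow(
		`SELECT (stats->'range_count')::int,
		        (stats->'approximate_disk_bytes')::int
		   FROM crdb_internal.tenant_span_stats(ARRAY(SELECT($1::bytes, $2::bytes)))`,
		startKey, endKey,
	).Scan(&rangeCount, &diskBytes); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("bigbank.bank: %d ranges, ~%d bytes on disk\n", rangeCount, diskBytes)
}
```

The point of the sketch is only that the reclamation signal the test now keys off of (approximate disk bytes for the dropped table's span) is observable from any SQL session, not just from within the test.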

Comments (0)