
Commit 867d77d

Migrate tests/cluster tests to tests/unit/cluster (#2297)
Migrate tests to use the same framework for all cluster tests. Convert old framework APIs (K, RI, etc.) to new APIs (R, srv, etc.). Add a process_is_alive check in cluster_util.tcl to fix an exception caused by running ps on dead processes during failover tests.

Signed-off-by: Jun Yeong Kim <[email protected]>
1 parent 6c329df commit 867d77d
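
To make the conversion concrete, the recurring substitutions in this commit look like the following (an illustrative side-by-side drawn from the diffs below, not part of the diff itself; node id $j maps to server offset -$j in the new framework):

    # Old framework (tests/cluster)            # New framework (tests/unit/cluster)
    RI $j role                                 s [expr -1*$j] role
    get_instance_attrib valkey $j port         srv [expr -1*$j] port
    kill_instance valkey $j                    catch {R $j shutdown nosave}
    restart_instance valkey $j                 restart_server [expr -1*$j] true false
    foreach_valkey_id id {...}                 for {set id 0} {$id < [llength $::servers]} {incr id} {...}
    assert_cluster_state ok                    wait_for_cluster_state ok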

7 files changed

Lines changed: 505 additions & 83 deletions


tests/support/cluster_util.tcl

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ proc wait_for_cluster_size {cluster_size} {
 # Check that cluster nodes agree about "state", or raise an error.
 proc wait_for_cluster_state {state} {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {![process_is_alive [srv -$j pid]]} continue
         if {[process_is_paused [srv -$j pid]]} continue
         wait_for_condition 1000 50 {
             [CI $j cluster_state] eq $state
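
The guard relies on a process_is_alive helper; a minimal sketch of what such a check could look like (the actual definition lives elsewhere in the test support code, so the exact ps invocation and zombie handling here are assumptions):

    proc process_is_alive {pid} {
        # ps -p exits non-zero when the PID no longer exists; catch turns
        # that error into a "not alive" result instead of an exception.
        if {[catch {exec ps -p $pid} err]} {
            return 0
        }
        # A zombie process still has a ps entry but is effectively dead.
        return [expr {![string match "*<defunct>*" $err]}]
    }

This is exactly the failure mode the commit message describes: without the guard, wait_for_cluster_state would run ps against a dead process during failover tests and raise.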

tests/unit/cluster/cluster-shards.tcl

Lines changed: 286 additions & 0 deletions
@@ -53,3 +53,289 @@ start_cluster 3 3 {tags {external:skip cluster}} {
         assert_equal $shard_0_slot_coverage [dict get [get_node_info_from_shard $node_0_id $validation_node "shard"] "slots"]
     }
 }
+
+# Initial slot distribution for split-slot cluster tests.
+set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
+set ::slot1 [list 5460 5460 5462 10922 10925 10925]
+set ::slot2 [list 10923 10924 10927 16383]
+set ::slot3 [list 1001 1001]
+
+# Slot allocator: assigns split slots to each master.
+proc split_slot_allocation {masters replicas} {
+    for {set j 0} {$j < $masters} {incr j} {
+        R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
+    }
+}
+
+# Replica allocator: allocates replicas for the masters only, leaving the
+# last server (R 8) as a standalone no-slot node for testing purposes.
+proc split_slot_replica_allocation {masters replicas} {
+    cluster_allocate_replicas $masters [expr {$replicas - 1}]
+}
+
+proc cluster_ensure_master {id} {
+    if { [regexp "master" [R $id role]] == 0 } {
+        assert_equal {OK} [R $id CLUSTER FAILOVER]
+        wait_for_condition 50 100 {
+            [regexp "master" [R $id role]] == 1
+        } else {
+            fail "instance $id is not master"
+        }
+    }
+}
+
+# start_cluster 4 masters + 5 nodes (4 replicas + 1 standalone R8)
+start_cluster 4 5 {tags {external:skip cluster}} {
+
+    # cluster_master_nodes and cluster_replica_nodes refer to the active cluster members.
+    set ::cluster_master_nodes 4
+    set ::cluster_replica_nodes 4
+
+    test "Cluster should start ok" {
+        wait_for_cluster_state ok
+    }
+
+    test "Set cluster hostnames and verify they are propagated" {
+        for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
+            R $j config set cluster-announce-hostname "host-$j.com"
+        }
+
+        # Wait for everyone to agree about the state
+        wait_for_cluster_propagation
+    }
+
+    test "Verify information about the shards" {
+        set ids {}
+        for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
+            lappend ids [R $j CLUSTER MYID]
+        }
+        set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
+
+        # Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
+        for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
+            for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
+                assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
+                assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
+                assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
+                # Default value of 'cluster-preferred-endpoint-type' is ip.
+                assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
+
+                if {$::tls} {
+                    assert_equal [srv [expr -1*$i] plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
+                    assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
+                } else {
+                    assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
+                }
+
+                if {$i < 4} {
+                    assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
+                    assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
+                } else {
+                    assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
+                    # Replica could be in online or loading
+                }
+            }
+        }
+    }
+
+    test "Verify no slot shard" {
+        # R 8 is a standalone node with no slots assigned (left standalone by split_slot_replica_allocation)
+        set node_8_id [R 8 CLUSTER MYID]
+        assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
+        assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
+    }
+
+    set node_0_id [R 0 CLUSTER MYID]
+
+    test "Kill a node and tell the replica to immediately takeover" {
+        pause_process [srv 0 pid]
+        R 4 cluster failover force
+    }
+
+    # Primary 0 node should report as fail, wait until the new primary acknowledges it.
+    test "Verify health as fail for killed node" {
+        wait_for_condition 1000 50 {
+            "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
+        } else {
+            fail "New primary never detected the node failed"
+        }
+    }
+
+    set primary_id 4
+    set replica_id 0
+
+    test "Restarting primary node" {
+        restart_server [expr -1*$replica_id] true false
+    }
+
+    test "Instance #0 gets converted into a replica" {
+        wait_for_condition 1000 50 {
+            [s [expr -1*$replica_id] role] eq {slave}
+        } else {
+            fail "Old primary was not converted into replica"
+        }
+    }
+
+    test "Test the replica reports a loading state while it's loading" {
+        # First verify everything moves to a happy state
+        set replica_cluster_id [R $replica_id CLUSTER MYID]
+        wait_for_condition 50 1000 {
+            [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
+        } else {
+            fail "Replica never transitioned to online"
+        }
+
+        # Set 1 MB of data, so there is something to load on full sync
+        R $primary_id debug populate 1000 key 1000
+
+        # Kill replica client for primary and load new data to the primary
+        R $primary_id config set repl-backlog-size 100
+
+        # Set the key load delay so that it will take at least
+        # 2 seconds to fully load the data.
+        R $replica_id config set key-load-delay 4000
+
+        # Trigger event loop processing every 1024 bytes; this trigger
+        # allows us to send and receive cluster messages, so we are setting
+        # it low so that the cluster messages are sent more frequently.
+        R $replica_id config set loading-process-events-interval-bytes 1024
+
+        R $primary_id multi
+        R $primary_id client kill type replica
+        # populate the correct data
+        set num 100
+        set value [string repeat A 1024]
+        for {set j 0} {$j < $num} {incr j} {
+            # Use hashtag valid for shard #0
+            set key "{ch3}$j"
+            R $primary_id set $key $value
+        }
+        R $primary_id exec
+
+        # The replica should reconnect and start a full sync; it will gossip about its health to the primary.
+        wait_for_condition 50 1000 {
+            "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
+        } else {
+            fail "Replica never transitioned to loading"
+        }
+
+        # Verify the CLUSTER SHARDS and CLUSTER SLOTS (deprecated) APIs respond while the node is loading data.
+        R $replica_id CLUSTER SHARDS
+        R $replica_id CLUSTER SLOTS
+
+        # Speed up the key loading and verify everything resumes
+        R $replica_id config set key-load-delay 0
+
+        wait_for_condition 50 1000 {
+            "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
+        } else {
+            fail "Replica never transitioned to online"
+        }
+
+        # Final sanity check: the replica agrees it is online.
+        assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
+    }
+
+    test "Regression test for a crash when calling SHARDS during handshake" {
+        # Use R 8 (standalone node) to establish handshaking connections
+        set id [R 8 CLUSTER MYID]
+        R 8 CLUSTER RESET HARD
+        for {set i 0} {$i < 8} {incr i} {
+            R $i CLUSTER FORGET $id
+        }
+        R 8 cluster meet 127.0.0.1 [srv 0 port]
+        # This line would previously crash, since all the outbound
+        # connections were in handshake state.
+        R 8 CLUSTER SHARDS
+    }
+
+    test "Cluster is up" {
+        wait_for_cluster_state ok
+    }
+
+    test "Shard ids are unique" {
+        set shard_ids {}
+        for {set i 0} {$i < 4} {incr i} {
+            set shard_id [R $i cluster myshardid]
+            assert_equal [dict exists $shard_ids $shard_id] 0
+            dict set shard_ids $shard_id 1
+        }
+    }
+
+    test "CLUSTER MYSHARDID reports same id for both primary and replica" {
+        for {set i 0} {$i < 4} {incr i} {
+            assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
+            assert_equal [string length [R $i cluster myshardid]] 40
+        }
+    }
+
+    test "New replica receives primary's shard id" {
+        # find a primary
+        set id 0
+        for {} {$id < 8} {incr id} {
+            if {[regexp "master" [R $id role]]} {
+                break
+            }
+        }
+        assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
+        assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
+        assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
+    }
+
+    test "CLUSTER MYSHARDID reports same shard id after shard restart" {
+        set node_ids {}
+        for {set i 0} {$i < 8} {incr i 4} {
+            dict set node_ids $i [R $i cluster myshardid]
+            pause_process [srv [expr -1*$i] pid]
+        }
+        for {set i 0} {$i < 8} {incr i 4} {
+            restart_server [expr -1*$i] true false
+        }
+        wait_for_cluster_state ok
+        for {set i 0} {$i < 8} {incr i 4} {
+            assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
+        }
+    }
+
+    test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
+        set node_ids {}
+        for {set i 0} {$i < 8} {incr i} {
+            dict set node_ids $i [R $i cluster myshardid]
+        }
+        for {set i 0} {$i < 8} {incr i} {
+            pause_process [srv [expr -1*$i] pid]
+        }
+        for {set i 0} {$i < 8} {incr i} {
+            restart_server [expr -1*$i] true false
+        }
+        wait_for_cluster_state ok
+        for {set i 0} {$i < 8} {incr i} {
+            assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
+        }
+    }
+
+    test "CLUSTER SHARDS id response validation" {
+        # For each node in the cluster
+        for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
+            # Get the CLUSTER SHARDS output from this node
+            set shards [R $i CLUSTER SHARDS]
+            set seen_shard_ids {}
+
+            # For each shard in the output
+            foreach shard $shards {
+                set shard_dict [dict create {*}$shard]
+
+                # 1. Verify 'id' key exists
+                assert {[dict exists $shard_dict id]}
+                set shard_id [dict get $shard_dict id]
+
+                # 2. Verify shard_id is a 40-char string
+                assert {[string length $shard_id] == 40}
+
+                # 3. Verify that for a given node's output, all shard IDs are unique
+                assert {[dict exists $seen_shard_ids $shard_id] == 0}
+                dict set seen_shard_ids $shard_id 1
+            }
+        }
+    }
+
+} split_slot_allocation split_slot_replica_allocation
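
For readers unfamiliar with CLUSTER ADDSLOTSRANGE: each ::slotN list above is consumed pairwise as start/end slot ranges, so the {*} expansion in split_slot_allocation issues commands like these (expansion shown for illustration; the ranges are taken verbatim from the lists above):

    R 0 cluster ADDSLOTSRANGE 0 1000 1002 5459 5461 5461 10926 10926
    R 3 cluster ADDSLOTSRANGE 1001 1001   ;# master 3 owns exactly slot 1001

This is what splits slot 1001 out of the otherwise contiguous 0-5459 range owned by shard 0, giving each shard a deliberately fragmented slot set.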
Lines changed: 15 additions & 19 deletions
@@ -3,31 +3,27 @@
 # iterations. The test checks that certain properties
 # are preserved across iterations.
 
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
+start_cluster 5 5 {tags {external:skip cluster}} {
 
 test "Cluster is up" {
-    assert_cluster_state ok
+    wait_for_cluster_state ok
 }
 
 set iterations 20
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
+set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
 
 while {[incr iterations -1]} {
     set tokill [randomInt 10]
     set other [expr {($tokill+1)%10}] ; # Some other instance.
     set key [randstring 20 20 alpha]
     set val [randstring 20 20 alpha]
-    set role [RI $tokill role]
+    set role [s [expr -1*$tokill] role]
     if {$role eq {master}} {
         set slave {}
-        set myid [dict get [get_myself $tokill] id]
-        foreach_valkey_id id {
+        set myid [dict get [cluster_get_myself $tokill] id]
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
             if {$id == $tokill} continue
-            if {[dict get [get_myself $id] slaveof] eq $myid} {
+            if {[dict get [cluster_get_myself $id] slaveof] eq $myid} {
                 set slave $id
             }
         }
@@ -41,7 +37,7 @@ while {[incr iterations -1]} {
     if {$role eq {master}} {
         test "Wait for slave of #$tokill to sync" {
             wait_for_condition 1000 50 {
-                [string match {*state=online*} [RI $tokill slave0]]
+                [string match {*state=online*} [s [expr -1*$tokill] slave0]]
             } else {
                 fail "Slave of node #$tokill is not ok"
             }
@@ -62,9 +58,7 @@ while {[incr iterations -1]} {
     }
 
     test "Terminating node #$tokill" {
-        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
-        R $tokill config set appendonly no
-        kill_instance valkey $tokill
+        catch {R $tokill shutdown nosave}
     }
 
     if {$role eq {master}} {
@@ -78,7 +72,7 @@ while {[incr iterations -1]} {
     }
 
     test "Cluster should eventually be up again" {
-        assert_cluster_state ok
+        wait_for_cluster_state ok
    }
 
     test "Cluster is writable again" {
@@ -89,12 +83,12 @@ while {[incr iterations -1]} {
     }
 
     test "Restarting node #$tokill" {
-        restart_instance valkey $tokill
+        restart_server [expr -1*$tokill] true false
    }
 
     test "Instance #$tokill is now a slave" {
         wait_for_condition 1000 50 {
-            [RI $tokill role] eq {slave}
+            [s [expr -1*$tokill] role] eq {slave}
        } else {
             fail "Restarted instance is not a slave"
        }
@@ -111,7 +105,9 @@ while {[incr iterations -1]} {
    }
 
     test "Post condition: current_epoch >= my_epoch everywhere" {
-        foreach_valkey_id id {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
            assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
        }
    }
+
+} ;# start_cluster
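
One convention worth calling out from the converted code above (inferred from this diff; a sketch, not framework documentation): with start_cluster, cluster node ids map to non-positive server offsets, so node 0 is srv 0 and node $j is srv -$j.

    # R $j <cmd>      run <cmd> on node $j
    # s -$j <field>   read an INFO field from node $j
    # srv -$j <attr>  read a per-server attribute (pid, port, ...) for node $j
    set tokill 3
    set role [s [expr -1*$tokill] role]   ;# INFO "role" of node 3

Hence the recurring [expr -1*$tokill] wherever a variable node id addresses a server.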
