Skip to content

Commit 0580abd

Browse files
committed
sql/schemachanger: deflake backup and restore tests
Previously, the backup and restore tests could flake locally or on CI because we relied on the second granularity of finished_time. This was not sufficient because tests could execute fast enough so that the status of the latest schema change could be confused. To address this, this patch uses the max job ID to detect newer jobs. Fixes: #158772 Release note: None
1 parent ebfcb56 commit 0580abd

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

pkg/sql/schemachanger/sctest/backup.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
"strings"
1616
"sync/atomic"
1717
"testing"
18-
"time"
1918

2019
"github.com/cockroachdb/cockroach/pkg/clusterversion"
2120
"github.com/cockroachdb/cockroach/pkg/sql/parser"
@@ -422,10 +421,10 @@ func backupRollbacks(t *testing.T, factory TestServerFactory, cs CumulativeTestC
422421
// Fetch the state of the cluster before the schema change kicks off.
423422
tdb.Exec(t, fmt.Sprintf("USE %q", cs.DatabaseName))
424423
expected := parserRoundTrip(t, tdb.QueryStr(t, fetchDescriptorStateQuery))
425-
// Store the start time, so we know which jobs are relevant.
426-
ts := tdb.QueryRow(t, "SELECT current_timestamp(0)::timestamp")
427-
var startTime time.Time
428-
ts.Scan(&startTime)
424+
// Store the highest job ID, so we know which jobs are relevant.
425+
jobID := tdb.QueryRow(t, "SELECT COALESCE(max(job_id), 0) FROM [SHOW JOBS]")
426+
var maxJobID int64
427+
jobID.Scan(&maxJobID)
429428
// Kick off the schema change, fail it at the prescribed stage,
430429
// and perform the backups during the rollback via the BeforeStage hook.
431430
require.Regexp(t, fmt.Sprintf("boom %d", cs.StageOrdinal),
@@ -437,7 +436,7 @@ func backupRollbacks(t *testing.T, factory TestServerFactory, cs CumulativeTestC
437436
// Note: Depending on the state of the schema objects, no job may be created
438437
// to finalize the rollback, since dependent objects were never backed
439438
// up.
440-
succeeded, jobExists := hasLatestSchemaChangeSucceededWithTimestamp(t, tdb, startTime)
439+
succeeded, jobExists := hasLatestSchemaChangeSucceededWithMaxJobID(t, tdb, maxJobID)
441440
require.False(t, succeeded && jobExists)
442441
tdb.Exec(t, fmt.Sprintf("USE %q", cs.DatabaseName))
443442
postRollback := parserRoundTrip(t, tdb.QueryStr(t, fetchDescriptorStateQuery))
@@ -579,10 +578,11 @@ func exerciseBackupRestore(
579578
tdb.Exec(t, cs.CreateDatabaseStmt)
580579
}
581580

582-
// Store the start time, so we know which jobs are relevant.
583-
ts := tdb.QueryRow(t, "SELECT current_timestamp(0)::timestamp")
584-
var startTime time.Time
585-
ts.Scan(&startTime)
581+
// Store the highest job ID, so we know which jobs are relevant.
582+
tdb.Exec(t, "USE system")
583+
jobID := tdb.QueryRow(t, "SELECT COALESCE(max(job_id), 0) FROM [SHOW JOBS]")
584+
var maxJobID int64
585+
jobID.Scan(&maxJobID)
586586

587587
// RESTORE.
588588
pe.RequestPlan()
@@ -598,7 +598,7 @@ func exerciseBackupRestore(
598598
skipComparison := false
599599
// If the only thing left is to clean up then this restore may have no
600600
// job so the startTime is important.
601-
if succeeded, jobExists := hasLatestSchemaChangeSucceededWithTimestamp(t, tdb, startTime); (succeeded && jobExists) || (b.maySucceed && !jobExists) {
601+
if succeeded, jobExists := hasLatestSchemaChangeSucceededWithMaxJobID(t, tdb, maxJobID); (succeeded && jobExists) || (b.maySucceed && !jobExists) {
602602
expected = b.expectedOnSuccess
603603
skipComparison = b.skipComparisonOnSuccess
604604
if !b.maySucceed {

pkg/sql/schemachanger/sctest/framework.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,14 +1015,19 @@ func waitForSchemaChangesToFinish(t *testing.T, tdb *sqlutils.SQLRunner) {
10151015
)
10161016
}
10171017

1018-
func hasLatestSchemaChangeSucceededWithTimestamp(
1019-
t *testing.T, tdb *sqlutils.SQLRunner, startTime time.Time,
1018+
// hasLatestSchemaChangeSucceededWithMaxJobID detects if the latest schema change
1019+
// has changed. The function requires the last observed maximum job ID and takes
1020+
// advantage of the increasing nature of the ID (since the upper bits encode
1021+
// the current time). Note: We could have used `finished`, but it does not encode
1022+
// enough precision.
1023+
func hasLatestSchemaChangeSucceededWithMaxJobID(
1024+
t *testing.T, tdb *sqlutils.SQLRunner, maxJobID int64,
10201025
) (succeeded bool, jobExists bool) {
10211026
result := tdb.QueryStr(t, fmt.Sprintf(
1022-
`SELECT status FROM [SHOW JOBS] WHERE job_type IN ('%s') AND finished >= $1::timestamp ORDER BY finished DESC, job_id DESC LIMIT 1`,
1027+
`SELECT status FROM [SHOW JOBS] WHERE job_type IN ('%s') AND job_id > $1 ORDER BY finished DESC, job_id DESC LIMIT 1`,
10231028
jobspb.TypeNewSchemaChange,
10241029
),
1025-
startTime)
1030+
maxJobID)
10261031
return len(result) == 0 || result[0][0] == "succeeded", len(result) > 0
10271032
}
10281033

0 commit comments

Comments
 (0)