Skip to content

Commit 6896ced

Browse files
authored
feat: per-job max_runtime to cap rsync/rclone wall-clock duration (#16)
* feat(config): add max_runtime field with validation * refactor(config): explicit error on MaxRuntimeDuration and distinct zero/negative messages * feat(engine): add StatusTimedOut with render support New status constant for jobs killed by per-invocation deadline. IsFailure returns true so Summary.HasErrors and the CLI exit-code path treat it as a failure. statusSymbol maps it to ✗ (red); itemStatsText renders "timed out" (red when color enabled). aggregateStatus already classifies any IsFailure status as StatusFailed at the aggregate level — no changes needed there. * refactor(engine): group failure labels and clarify timeout test names * feat(engine): classify DeadlineExceeded as StatusTimedOut Adds classifyExitStatus(ctx, runErr) in runner.go. Both RsyncExecutor and RcloneExecutor route their cmd.Start and cmd.Wait exit paths through it: context.DeadlineExceeded maps to StatusTimedOut; context.Canceled and other errors remain StatusFailed. Covers both the Start path (already-expired context rejects the spawn) and the Wait path (deadline fires mid-run). Five-subtest classifier unit test pins the "deadline first then parent cancel" ordering guarantee; two executor integration tests use WithDeadline(past) to exercise the full path. * docs(engine): generalize classifyExitStatus doc comment * test(engine): document --config /dev/null in rclone timeout test * feat(engine): enforce MaxRuntime per rsync/rclone invocation Adds jobContext helper that wraps the parent context with WithTimeout when max_runtime is set, or returns the parent unchanged (with a no-op cancel) when it is empty. Wires the derived context into runRsyncJob (synchronous cancel per source loop iteration) and runRcloneJob (defer cancel). CleanupArchives keeps the parent context unchanged. * docs(engine): annotate MaxRuntime error discard and CleanupArchives ctx choice * docs(example): document max_runtime in example config * docs(example): clarify max_runtime format and per-invocation semantics * test: cover rclone runner-level timeout and validator wiring * fix(engine): preserve runErr in timeout log lines
1 parent cdb28a5 commit 6896ced

13 files changed

Lines changed: 623 additions & 17 deletions

config.example.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,10 @@ mode = "copy"
7979
bwlimit = "2M"
8080
# optional = true # skip without failure if the local source is absent
8181
# (useful for detachable devices like external drives)
82+
# max_runtime = "2h" # cap per-invocation wall-clock time; Shuttle kills
83+
# the tool and reports "timed out" (counts as a
84+
# failure, exit 1) if exceeded. Omit to disable.
85+
# Format: Go duration string (e.g. "30s", "5m",
86+
# "2h", "1h30m"). Applies per invocation: rsync
87+
# jobs with N sources and rclone jobs with N
88+
# remotes each get N independent timeouts.

internal/config/config.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"os"
1212
"path/filepath"
1313
"strings"
14+
"time"
1415

1516
toml "github.com/pelletier/go-toml/v2"
1617
)
@@ -85,6 +86,12 @@ type Job struct {
8586
// sources (paths containing ":") since those are not stat'd before
8687
// invocation.
8788
Optional bool `toml:"optional"`
89+
// MaxRuntime caps the wall-clock duration of each rsync/rclone
90+
// invocation. Parsed via time.ParseDuration (e.g. "30m", "2h"). Empty
91+
// means no timeout. Zero and negative durations are rejected at
92+
// validation time. Applies per invocation: a job with N rsync sources
93+
// or N rclone remotes gets N independent timeouts.
94+
MaxRuntime string `toml:"max_runtime"`
8895

8996
// Rsync fields
9097
Sources []string `toml:"sources"`
@@ -167,6 +174,19 @@ func (c *Config) ResolvedLogRetentionDays() int {
167174
return *c.Defaults.LogRetentionDays
168175
}
169176

177+
// MaxRuntimeDuration returns the parsed max_runtime value, or 0 when
178+
// the field is empty (meaning "no timeout"). The LoadBytes/LoadFile
179+
// path validates max_runtime at parse time, so callers on that path
180+
// never see the error. Callers that construct Job literals directly
181+
// (e.g. tests) must handle the error explicitly so typos surface
182+
// immediately instead of silently becoming "no timeout".
183+
func (j Job) MaxRuntimeDuration() (time.Duration, error) {
184+
if j.MaxRuntime == "" {
185+
return 0, nil
186+
}
187+
return time.ParseDuration(j.MaxRuntime)
188+
}
189+
170190
// JobNames returns the names of all configured jobs in config order.
171191
func (c *Config) JobNames() []string {
172192
names := make([]string, len(c.Jobs))
@@ -302,7 +322,7 @@ func validateRsyncJob(job Job) error {
302322
if job.Destination == "" {
303323
return fmt.Errorf("job %q: empty destination", job.Name)
304324
}
305-
return nil
325+
return validateMaxRuntime(job)
306326
}
307327

308328
func validateRcloneJob(job Job) error {
@@ -326,5 +346,26 @@ func validateRcloneJob(job Job) error {
326346
}
327347
remoteSeen[r] = true
328348
}
349+
return validateMaxRuntime(job)
350+
}
351+
352+
// validateMaxRuntime enforces that max_runtime, when set, parses to a
353+
// strictly positive time.Duration. An empty string means "no timeout"
354+
// and passes. Zero and negative values are rejected so users don't
355+
// accidentally configure "0s" expecting it to mean "no timeout".
356+
func validateMaxRuntime(job Job) error {
357+
if job.MaxRuntime == "" {
358+
return nil
359+
}
360+
d, err := time.ParseDuration(job.MaxRuntime)
361+
if err != nil {
362+
return fmt.Errorf("job %q: invalid max_runtime %q: %w", job.Name, job.MaxRuntime, err)
363+
}
364+
if d == 0 {
365+
return fmt.Errorf("job %q: max_runtime is %q; omit the field to disable the timeout", job.Name, job.MaxRuntime)
366+
}
367+
if d < 0 {
368+
return fmt.Errorf("job %q: max_runtime must be positive (got %q)", job.Name, job.MaxRuntime)
369+
}
329370
return nil
330371
}

internal/config/config_test.go

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"path/filepath"
66
"strings"
77
"testing"
8+
"time"
89

910
"github.com/jkleinne/shuttle/internal/config"
1011
)
@@ -546,3 +547,155 @@ destination = "/tmp/backup"
546547
t.Error("Jobs[0].Optional = true, want false (zero value)")
547548
}
548549
}
550+
551+
func TestLoad_MaxRuntime_ParsedAndExposed(t *testing.T) {
552+
tomlData := `
553+
[[job]]
554+
name = "photos"
555+
engine = "rsync"
556+
sources = ["/tmp/photos"]
557+
destination = "/tmp/backup"
558+
max_runtime = "2h"
559+
`
560+
cfg, err := config.LoadBytes([]byte(tomlData))
561+
if err != nil {
562+
t.Fatalf("unexpected error: %v", err)
563+
}
564+
if cfg.Jobs[0].MaxRuntime != "2h" {
565+
t.Errorf("MaxRuntime = %q, want \"2h\"", cfg.Jobs[0].MaxRuntime)
566+
}
567+
got, err := cfg.Jobs[0].MaxRuntimeDuration()
568+
if err != nil {
569+
t.Fatalf("MaxRuntimeDuration() error = %v", err)
570+
}
571+
if got != 2*time.Hour {
572+
t.Errorf("MaxRuntimeDuration() = %v, want 2h", got)
573+
}
574+
}
575+
576+
func TestLoad_MaxRuntime_Absent_DurationIsZero(t *testing.T) {
577+
tomlData := `
578+
[[job]]
579+
name = "photos"
580+
engine = "rsync"
581+
sources = ["/tmp/photos"]
582+
destination = "/tmp/backup"
583+
`
584+
cfg, err := config.LoadBytes([]byte(tomlData))
585+
if err != nil {
586+
t.Fatalf("unexpected error: %v", err)
587+
}
588+
if cfg.Jobs[0].MaxRuntime != "" {
589+
t.Errorf("MaxRuntime = %q, want empty", cfg.Jobs[0].MaxRuntime)
590+
}
591+
got, err := cfg.Jobs[0].MaxRuntimeDuration()
592+
if err != nil {
593+
t.Fatalf("MaxRuntimeDuration() error = %v", err)
594+
}
595+
if got != 0 {
596+
t.Errorf("MaxRuntimeDuration() = %v, want 0", got)
597+
}
598+
}
599+
600+
func TestLoad_MaxRuntime_Zero_Rejected(t *testing.T) {
601+
tomlData := `
602+
[[job]]
603+
name = "photos"
604+
engine = "rsync"
605+
sources = ["/tmp/photos"]
606+
destination = "/tmp/backup"
607+
max_runtime = "0s"
608+
`
609+
_, err := config.LoadBytes([]byte(tomlData))
610+
if err == nil {
611+
t.Fatal("expected error for max_runtime = \"0s\", got nil")
612+
}
613+
msg := err.Error()
614+
if !strings.Contains(msg, "photos") || !strings.Contains(msg, "max_runtime") {
615+
t.Errorf("error %q should mention job name and field", msg)
616+
}
617+
}
618+
619+
func TestLoad_MaxRuntime_Negative_Rejected(t *testing.T) {
620+
tomlData := `
621+
[[job]]
622+
name = "photos"
623+
engine = "rsync"
624+
sources = ["/tmp/photos"]
625+
destination = "/tmp/backup"
626+
max_runtime = "-5m"
627+
`
628+
_, err := config.LoadBytes([]byte(tomlData))
629+
if err == nil {
630+
t.Fatal("expected error for negative max_runtime, got nil")
631+
}
632+
if !strings.Contains(err.Error(), "photos") {
633+
t.Errorf("error %q should mention job name", err.Error())
634+
}
635+
if !strings.Contains(err.Error(), "positive") {
636+
t.Errorf("error %q should mention \"positive\"", err.Error())
637+
}
638+
}
639+
640+
func TestLoad_MaxRuntime_Malformed_Rejected(t *testing.T) {
641+
tomlData := `
642+
[[job]]
643+
name = "photos"
644+
engine = "rsync"
645+
sources = ["/tmp/photos"]
646+
destination = "/tmp/backup"
647+
max_runtime = "banana"
648+
`
649+
_, err := config.LoadBytes([]byte(tomlData))
650+
if err == nil {
651+
t.Fatal("expected error for malformed max_runtime, got nil")
652+
}
653+
if !strings.Contains(err.Error(), "photos") {
654+
t.Errorf("error %q should mention job name", err.Error())
655+
}
656+
}
657+
658+
func TestLoad_MaxRuntime_RcloneJob_ValidatedAndExposed(t *testing.T) {
659+
// validateMaxRuntime is called from both validateRsyncJob and
660+
// validateRcloneJob. The other tests cover the rsync path; this
661+
// pins that the validator runs for rclone jobs too.
662+
tomlData := `
663+
[[job]]
664+
name = "docs-to-cloud"
665+
engine = "rclone"
666+
source = "/tmp/docs"
667+
remotes = ["my_gdrive"]
668+
mode = "copy"
669+
max_runtime = "30m"
670+
`
671+
cfg, err := config.LoadBytes([]byte(tomlData))
672+
if err != nil {
673+
t.Fatalf("unexpected error: %v", err)
674+
}
675+
got, err := cfg.Jobs[0].MaxRuntimeDuration()
676+
if err != nil {
677+
t.Fatalf("MaxRuntimeDuration() error = %v", err)
678+
}
679+
if got != 30*time.Minute {
680+
t.Errorf("MaxRuntimeDuration() = %v, want 30m", got)
681+
}
682+
}
683+
684+
func TestLoad_MaxRuntime_RcloneJob_NegativeRejected(t *testing.T) {
685+
tomlData := `
686+
[[job]]
687+
name = "docs-to-cloud"
688+
engine = "rclone"
689+
source = "/tmp/docs"
690+
remotes = ["my_gdrive"]
691+
mode = "copy"
692+
max_runtime = "-1h"
693+
`
694+
_, err := config.LoadBytes([]byte(tomlData))
695+
if err == nil {
696+
t.Fatal("expected error for negative max_runtime on rclone job, got nil")
697+
}
698+
if !strings.Contains(err.Error(), "docs-to-cloud") {
699+
t.Errorf("error %q should mention job name", err.Error())
700+
}
701+
}

internal/engine/rclone.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,13 @@ func (e *RcloneExecutor) Exec(ctx context.Context, args []string, onProgress fun
136136
}
137137

138138
if err := cmd.Start(); err != nil {
139-
e.logger.FileError(fmt.Sprintf("rclone start failed for %s: %v", displayName, err))
140-
return ItemResult{Name: displayName, Status: StatusFailed}
139+
status := classifyExitStatus(ctx, err)
140+
if status == StatusTimedOut {
141+
e.logger.FileError(fmt.Sprintf("rclone timed out for %s after per-job max_runtime: %v", displayName, err))
142+
} else {
143+
e.logger.FileError(fmt.Sprintf("rclone start failed for %s: %v", displayName, err))
144+
}
145+
return ItemResult{Name: displayName, Status: status}
141146
}
142147

143148
var pipeWg sync.WaitGroup
@@ -168,14 +173,17 @@ func (e *RcloneExecutor) Exec(ctx context.Context, args []string, onProgress fun
168173
}
169174
stats.Elapsed = elapsed
170175

171-
status := StatusOK
176+
status := classifyExitStatus(ctx, runErr)
172177
if runErr != nil {
173-
status = StatusFailed
174178
subcommand := "rclone"
175179
if len(args) > 0 {
176180
subcommand = "rclone " + args[0]
177181
}
178-
e.logger.FileError(fmt.Sprintf("%s failed for %s: %v", subcommand, displayName, runErr))
182+
if status == StatusTimedOut {
183+
e.logger.FileError(fmt.Sprintf("%s timed out for %s after per-job max_runtime: %v", subcommand, displayName, runErr))
184+
} else {
185+
e.logger.FileError(fmt.Sprintf("%s failed for %s: %v", subcommand, displayName, runErr))
186+
}
179187
}
180188

181189
return ItemResult{Name: displayName, Status: status, Stats: stats}

internal/engine/rclone_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,3 +483,29 @@ func TestCleanupArchives_KeepsRecent(t *testing.T) {
483483
t.Error("recent archive dir should still exist")
484484
}
485485
}
486+
487+
func TestRcloneExec_ExpiredContext_ReturnsTimedOut(t *testing.T) {
488+
skipIfNoRclone(t)
489+
490+
src := t.TempDir()
491+
dst := t.TempDir()
492+
if err := os.WriteFile(filepath.Join(src, "hello.txt"), []byte("world"), 0o644); err != nil {
493+
t.Fatalf("writing test file: %v", err)
494+
}
495+
496+
// A context whose deadline is already in the past will cause exec.CommandContext
497+
// to kill the process immediately, producing a DeadlineExceeded context error.
498+
ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(-1*time.Second))
499+
defer cancel()
500+
501+
executor, logPath := newRcloneTestExecutor(t)
502+
// --config /dev/null prevents rclone from loading the developer's
503+
// real rclone config during the test.
504+
job := config.Job{ExtraFlags: []string{"--config", "/dev/null"}}
505+
args := BuildRcloneArgs("copy", nil, job, src+"/", ":local:"+dst, false, logPath, "")
506+
result := executor.Exec(ctx, args, nil)
507+
508+
if result.Status != StatusTimedOut {
509+
t.Errorf("Status = %q, want %q", result.Status, StatusTimedOut)
510+
}
511+
}

internal/engine/render.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ const (
3636
symbolOptionalMissing = "○"
3737
labelFailed = "failed"
3838
labelNotFound = "not found"
39+
labelTimedOut = "timed out"
3940
labelSkipped = "skipped"
4041
labelOptionalMissing = "source missing (optional)"
4142
tallyLabelPassed = "passed"
@@ -56,7 +57,7 @@ func colorize(useColor bool, code, text string) string {
5657
// statusSymbol returns a colored status indicator character.
5758
func statusSymbol(status Status, useColor bool) string {
5859
switch status {
59-
case StatusFailed, StatusNotFound:
60+
case StatusFailed, StatusNotFound, StatusTimedOut:
6061
return colorize(useColor, ansiRed, symbolFailed)
6162
case StatusSkipped:
6263
return colorize(useColor, ansiYellow, symbolSkipped)
@@ -75,6 +76,8 @@ func itemStatsText(item ItemResult, useColor bool) string {
7576
return colorize(useColor, ansiRed, labelFailed)
7677
case StatusNotFound:
7778
return colorize(useColor, ansiRed, labelNotFound)
79+
case StatusTimedOut:
80+
return colorize(useColor, ansiRed, labelTimedOut)
7881
case StatusSkipped:
7982
return colorize(useColor, ansiYellow, labelSkipped)
8083
case StatusOptionalMissing:

0 commit comments

Comments
 (0)