Skip to content

Commit 57d3a6b

Browse files
[ONPREM-2785] Add tests + decrease request timeout on task retries (#235)
* Add better acceptance test coverage around task-agent failure modes * Add correlation to user-agent string Make it easier to differentiate retried task instances * Decrease overall runner API request timeout to 1m 5m was a bit excessive * ./do license-attributions * Attempt to fix tests on Windows I think the tasks fail too quickly before the readiness probes have a chance * Add changelog entry
1 parent 4b968cf commit 57d3a6b

File tree

7 files changed

+151
-15
lines changed

7 files changed

+151
-15
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ By following these guidelines, we can easily determine which changes should be i
1010

1111
## Edge
1212

13+
- [#235](https://github.com/circleci/runner-init/pull/235) Decrease runner API request timeout and add correlation string to requests.
1314
- [#174](https://github.com/circleci/runner-init/pull/174) Add support for a custom entrypoint override.
1415
- [#224](https://github.com/circleci/runner-init/pull/224) Record and log timings in orchestrator init function.
1516
- [#212](https://github.com/circleci/runner-init/pull/212) Fix child process cleanup on Windows using job objects. This ensures that child processes are destroyed when the parent process (task-agent) terminates.

acceptance/task_test.go

Lines changed: 122 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package acceptance
22

33
import (
44
"fmt"
5+
"net/http/httptest"
56
"os"
67
"path/filepath"
78
"runtime"
@@ -10,23 +11,26 @@ import (
1011
"time"
1112

1213
"github.com/circleci/ex/testing/runner"
14+
"github.com/circleci/ex/testing/testcontext"
1315
"gotest.tools/v3/assert"
1416
"gotest.tools/v3/assert/cmp"
17+
18+
"github.com/circleci/runner-init/internal/testing/fakerunnerapi"
1519
)
1620

1721
func TestRunTask(t *testing.T) {
1822
readinessFilePath := filepath.Join(t.TempDir(), "ready")
1923
goodConfig := fmt.Sprintf(`
20-
{
21-
"cmd": [],
22-
"enable_unsafe_retries": false,
23-
"token": "testtoken",
24-
"readiness_file_path": "%v",
25-
"task_agent_path": "%v",
26-
"runner_api_base_url": "https://runner.circleci.com",
27-
"allocation": "testallocation",
28-
"max_run_time": 60000000000
29-
}`, strings.ReplaceAll(readinessFilePath, `\`, `\\`), strings.ReplaceAll(taskAgentBinary, `\`, `\\`))
24+
{
25+
"cmd": [],
26+
"enable_unsafe_retries": false,
27+
"token": "testtoken",
28+
"readiness_file_path": "%v",
29+
"task_agent_path": "%v",
30+
"runner_api_base_url": "https://runner.circleci.com",
31+
"allocation": "testallocation",
32+
"max_run_time": 60000000000
33+
}`, strings.ReplaceAll(readinessFilePath, `\`, `\\`), strings.ReplaceAll(taskAgentBinary, `\`, `\\`))
3034

3135
t.Run("Good run-task", func(t *testing.T) {
3236
r := runner.New(
@@ -36,6 +40,7 @@ func TestRunTask(t *testing.T) {
3640
)
3741
res, err := r.Start(orchestratorTestBinaryRunTask)
3842
assert.NilError(t, err)
43+
defer func() { t.Log(res.Logs()) }()
3944

4045
t.Run("Probe for readiness", func(t *testing.T) {
4146
assert.NilError(t, res.Ready("admin", time.Second*20))
@@ -67,8 +72,8 @@ func TestRunTask(t *testing.T) {
6772

6873
//nolint:gosec
6974
err := os.WriteFile(entrypointPath, []byte(`#!/bin/bash
70-
echo "Executing custom entrypoint"
71-
exec "$@"`), 0750)
75+
echo "Executing custom entrypoint"
76+
exec "$@"`), 0750)
7277
assert.NilError(t, err)
7378

7479
r := runner.New(
@@ -79,6 +84,7 @@ exec "$@"`), 0750)
7984
)
8085
res, err := r.Start(orchestratorTestBinaryOverride)
8186
assert.NilError(t, err)
87+
defer func() { t.Log(res.Logs()) }()
8288

8389
t.Run("Probe for readiness", func(t *testing.T) {
8490
assert.NilError(t, res.Ready("admin", time.Second*20))
@@ -105,5 +111,108 @@ exec "$@"`), 0750)
105111
})
106112
})
107113

108-
// TODO: Add more test cases...
114+
t.Run("task-agent exits with an error", func(t *testing.T) {
115+
ctx := testcontext.Background()
116+
runnerAPI := fakerunnerapi.New(ctx, []fakerunnerapi.Task{
117+
{
118+
Token: "testtoken",
119+
Allocation: "testallocation",
120+
},
121+
})
122+
s := httptest.NewServer(runnerAPI)
123+
t.Cleanup(s.Close)
124+
125+
badConfig := fmt.Sprintf(`
126+
{
127+
"cmd": [],
128+
"enable_unsafe_retries": false,
129+
"token": "testtoken",
130+
"task_agent_path": "%v --bad-flag",
131+
"runner_api_base_url": "%v",
132+
"allocation": "testallocation",
133+
"max_run_time": 60000000000
134+
}`, strings.ReplaceAll(taskAgentBinary, `\`, `\\`), s.URL)
135+
136+
r := runner.New(
137+
"CIRCLECI_GOAT_SHUTDOWN_DELAY=10s",
138+
"CIRCLECI_GOAT_CONFIG="+badConfig,
139+
"CIRCLECI_GOAT_HEALTH_CHECK_ADDR=:7624",
140+
)
141+
res, err := r.Start(orchestratorTestBinaryRunTask)
142+
assert.NilError(t, err)
143+
defer func() { t.Log(res.Logs()) }()
144+
145+
t.Run("Run task", func(t *testing.T) {
146+
select {
147+
case err = <-res.Wait():
148+
assert.NilError(t, err, "handled errors should have a clean exit")
149+
case <-time.After(time.Second * 40):
150+
assert.NilError(t, res.Stop())
151+
t.Fatal(t, "timeout before process stopped")
152+
}
153+
})
154+
155+
t.Run("Task failed", func(t *testing.T) {
156+
assert.Check(t, cmp.Len(runnerAPI.TaskUnclaims(), 0))
157+
assert.Check(t, cmp.DeepEqual(runnerAPI.TaskEvents(), []fakerunnerapi.TaskEvent{
158+
{
159+
Allocation: "testallocation",
160+
TimestampMilli: time.Now().UnixMilli(),
161+
Message: []byte("error while executing task agent: " +
162+
"task agent command exited with an unexpected error: " +
163+
"exit status 80: circleci-agent: error: unknown flag --bad-flag\n: " +
164+
"Check container logs for more details"),
165+
},
166+
}, fakerunnerapi.CmpTaskEvent))
167+
})
168+
})
169+
170+
t.Run("task-agent fails to start", func(t *testing.T) {
171+
ctx := testcontext.Background()
172+
runnerAPI := fakerunnerapi.New(ctx, []fakerunnerapi.Task{
173+
{
174+
ID: "testid",
175+
Token: "testtoken",
176+
Allocation: "testallocation",
177+
},
178+
})
179+
s := httptest.NewServer(runnerAPI)
180+
t.Cleanup(s.Close)
181+
182+
badConfig := fmt.Sprintf(`
183+
{
184+
"cmd": [],
185+
"enable_unsafe_retries": false,
186+
"task_id": "testid",
187+
"token": "testtoken",
188+
"task_agent_path": "thiswillfailtostart",
189+
"runner_api_base_url": "%v",
190+
"allocation": "testallocation",
191+
"max_run_time": 60000000000
192+
}`, s.URL)
193+
194+
r := runner.New(
195+
"CIRCLECI_GOAT_SHUTDOWN_DELAY=10s",
196+
"CIRCLECI_GOAT_CONFIG="+badConfig,
197+
"CIRCLECI_GOAT_HEALTH_CHECK_ADDR=:7624",
198+
)
199+
res, err := r.Start(orchestratorTestBinaryRunTask)
200+
assert.NilError(t, err)
201+
defer func() { t.Log(res.Logs()) }()
202+
203+
t.Run("Run task", func(t *testing.T) {
204+
select {
205+
case err = <-res.Wait():
206+
assert.NilError(t, err, "handled errors should have a clean exit")
207+
case <-time.After(time.Second * 40):
208+
assert.NilError(t, res.Stop())
209+
t.Fatal(t, "timeout before process stopped")
210+
}
211+
})
212+
213+
t.Run("Task unclaimed - this is a safe error to retry", func(t *testing.T) {
214+
assert.Check(t, cmp.Len(runnerAPI.TaskUnclaims(), 1))
215+
assert.Check(t, cmp.Len(runnerAPI.TaskEvents(), 0))
216+
})
217+
})
109218
}

clients/runner/client.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010

1111
"github.com/circleci/ex/config/secret"
1212
"github.com/circleci/ex/httpclient"
13+
"github.com/circleci/ex/httpclient/dnscache"
1314
"github.com/circleci/ex/httpclient/metrics"
1415
"github.com/circleci/ex/o11y"
1516
)
@@ -29,10 +30,13 @@ type ClientConfig struct {
2930

3031
type Info struct {
3132
AgentVersion string
33+
// Correlation should be a unique-ish string to correlate API requests and logs
34+
Correlation string
3235
}
3336

3437
func (i Info) userAgent() string {
35-
return fmt.Sprintf("CircleCI-GOAT/%s (%s; %s)", i.AgentVersion, runtime.GOOS, runtime.GOARCH)
38+
return fmt.Sprintf("CircleCI-GOAT/%s (%s; %s; %s)",
39+
i.AgentVersion, runtime.GOOS, runtime.GOARCH, i.Correlation)
3640
}
3741

3842
func NewClient(c ClientConfig) *Client {
@@ -41,8 +45,11 @@ func NewClient(c ClientConfig) *Client {
4145
BaseURL: c.BaseURL,
4246
AuthToken: string(c.AuthToken),
4347
AcceptType: httpclient.JSON,
44-
Timeout: time.Minute * 5,
48+
Timeout: time.Minute * 1,
4549
UserAgent: c.Info.userAgent(),
50+
DialContext: dnscache.DialContext(dnscache.New(dnscache.Config{
51+
TTL: 30 * time.Second,
52+
}), nil),
4653
}
4754

4855
if c.Tracer != nil {

cmd/orchestrator/main.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ import (
66
"fmt"
77
"log" //nolint:depguard // a non-O11y log is allowed for a top-level fatal exit
88
"os"
9+
"strings"
910
"time"
1011

1112
"github.com/alecthomas/kong"
1213
"github.com/circleci/ex/httpserver/healthcheck"
1314
"github.com/circleci/ex/o11y"
1415
"github.com/circleci/ex/system"
1516
"github.com/circleci/ex/termination"
17+
"github.com/circleci/ex/testing/testrand"
1618

1719
"github.com/circleci/runner-init/clients/runner"
1820
"github.com/circleci/runner-init/cmd"
@@ -131,6 +133,7 @@ func runSetup(ctx context.Context, cli cli, version string, sys *system.System)
131133
AuthToken: c.Config.Token,
132134
Info: runner.Info{
133135
AgentVersion: version,
136+
Correlation: correlation(),
134137
},
135138
})
136139

@@ -145,6 +148,18 @@ func runSetup(ctx context.Context, cli cli, version string, sys *system.System)
145148
return o, nil
146149
}
147150

151+
// correlation returns a unique-ish string to correlate API requests and logs.
152+
// We prefer the Pod name, which we currently get via the hostname since we don't inject it into the task
153+
// environment yet. However, hostname is not fully reliable (it can be overridden in the Pod spec, or some
154+
// providers may not set it to the Pod name), so we fall back to a pseudo-random hex string if needed.
155+
func correlation() string {
156+
hostname, _ := os.Hostname()
157+
if strings.HasPrefix(hostname, "ccita-") {
158+
return hostname
159+
}
160+
return testrand.Hex(8)
161+
}
162+
148163
type Runner interface {
149164
Run(ctx context.Context) error
150165
}

go-project-licenses.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ Name,License
22
github.com/DataDog/datadog-go/statsd,MIT
33
github.com/alecthomas/kong,MIT
44
github.com/cenkalti/backoff/v5,MIT
5+
github.com/cespare/xxhash/v2,MIT
56
github.com/circleci/ex,MIT
67
github.com/circleci/runner-init,Apache-2.0
78
github.com/facebookgo/clock,MIT
@@ -38,6 +39,7 @@ github.com/quic-go/qpack,MIT
3839
github.com/quic-go/quic-go,MIT
3940
github.com/rollbar/rollbar-go,MIT
4041
github.com/ugorji/go/codec,MIT
42+
github.com/vmihailenco/go-tinylfu,MIT
4143
github.com/vmihailenco/msgpack/v5,BSD-2-Clause
4244
github.com/vmihailenco/tagparser/v2,BSD-2-Clause
4345
go.opentelemetry.io/auto/sdk,Apache-2.0

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,7 @@ require (
522522
github.com/uudashr/iface v1.4.1 // indirect
523523
github.com/vbatts/tar-split v0.12.1 // indirect
524524
github.com/vektah/gqlparser/v2 v2.5.27 // indirect
525+
github.com/vmihailenco/go-tinylfu v0.2.2 // indirect
525526
github.com/wagoodman/go-partybus v0.0.0-20230516145632-8ccac152c651 // indirect
526527
github.com/wagoodman/go-progress v0.0.0-20220614130704-4b1c25a33c7c // indirect
527528
github.com/whyrusleeping/cbor-gen v0.1.3-0.20240731173018-74d74643234c // indirect

go.sum

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x
379379
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
380380
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
381381
github.com/certifi/gocertifi v0.0.0-20180118203423-deb3ae2ef261/go.mod h1:GJKEexRPVJrBSOjoqN5VNOIKJ5Q3RViH6eu3puDRwx4=
382+
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
382383
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
383384
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
384385
github.com/charithe/durationcheck v0.0.10 h1:wgw73BiocdBDQPik+zcEoBG/ob8uyBHf2iyoHGPf5w4=

0 commit comments

Comments
 (0)