Skip to content

Commit 862d78a

Browse files
mcp: fix race condition in ServerSession.startKeepalive (#856)
This PR fixes a race condition where `ServerSession.keepaliveCancel` was accessed in `Close` concurrently with its initialization in `ServerSession.startKeepalive`. It also removes a now redundant test. The fix moves keepalive initialization into `ServerSession.Connect` similar to the implementation in `ClientSession.Connect`. This is safe because the MCP spec allows keepalive pings before the initialization messages. From the MCP [spec](https://modelcontextprotocol.io/specification/2025-11-25/basic/lifecycle) (emphasis added): > The client SHOULD NOT send requests **other than pings** before the server has responded to the initialize request. The PR includes a reproducer test case, which fails with the following error without this fix: ``` > go test -count=10 -race -run ^TestStreamableStatelessKeepaliveRace$ github.com/modelcontextprotocol/go-sdk/mcp 8s ================== WARNING: DATA RACE Write at 0x00c00007fbf0 by goroutine 1095: github.com/modelcontextprotocol/go-sdk/mcp.startKeepalive() /Users/benjamin/Documents/go-sdk/mcp/shared.go:590 +0x58 github.com/modelcontextprotocol/go-sdk/mcp.(*ServerSession).startKeepalive() /Users/benjamin/Documents/go-sdk/mcp/server.go:1526 +0x130 github.com/modelcontextprotocol/go-sdk/mcp.(*ServerSession).initialized() /Users/benjamin/Documents/go-sdk/mcp/server.go:1059 +0xf8 github.com/modelcontextprotocol/go-sdk/mcp.init.serverSessionMethod[go.shape.*uint8,go.shape.interface { GetMeta() map[string]interface {}; SetMeta(map[string]interface {}); github.com/modelcontextprotocol/go-sdk/mcp.isResult() }].func27() /Users/benjamin/Documents/go-sdk/mcp/shared.go:332 +0x98 github.com/modelcontextprotocol/go-sdk/mcp.newServerMethodInfo[go.shape.*github.com/modelcontextprotocol/go-sdk/mcp.InitializedParams,go.shape.interface { GetMeta() map[string]interface {}; SetMeta(map[string]interface {}); github.com/modelcontextprotocol/go-sdk/mcp.isResult() },go.shape.struct { github.com/modelcontextprotocol/go-sdk/mcp.Meta "json:\"_meta,omitempty\"" }].func2() /Users/benjamin/Documents/go-sdk/mcp/shared.go:272 +0x84 github.com/modelcontextprotocol/go-sdk/mcp.defaultReceivingMethodHandler[go.shape.*uint8]() /Users/benjamin/Documents/go-sdk/mcp/shared.go:152 +0xe8 github.com/modelcontextprotocol/go-sdk/mcp.defaultReceivingMethodHandler[*github.com/modelcontextprotocol/go-sdk/mcp.ServerSession]() /Users/benjamin/Documents/go-sdk/mcp/shared.go:146 +0x60 github.com/modelcontextprotocol/go-sdk/mcp.handleReceive[go.shape.*uint8]() /Users/benjamin/Documents/go-sdk/mcp/shared.go:169 +0x228 github.com/modelcontextprotocol/go-sdk/mcp.(*ServerSession).handle() /Users/benjamin/Documents/go-sdk/mcp/server.go:1445 +0x370 github.com/modelcontextprotocol/go-sdk/mcp.connect[go.shape.*uint8,go.shape.*uint8].func1.1() /Users/benjamin/Documents/go-sdk/mcp/transport.go:170 +0x5c github.com/modelcontextprotocol/go-sdk/internal/jsonrpc2.HandlerFunc.Handle() /Users/benjamin/Documents/go-sdk/internal/jsonrpc2/jsonrpc2.go:84 +0x48 github.com/modelcontextprotocol/go-sdk/internal/jsonrpc2.(*Connection).handleAsync.func3() /Users/benjamin/Documents/go-sdk/internal/jsonrpc2/conn.go:717 +0xd8 Previous read at 0x00c00007fbf0 by goroutine 1089: github.com/modelcontextprotocol/go-sdk/mcp.(*ServerSession).Close() /Users/benjamin/Documents/go-sdk/mcp/server.go:1502 +0x34 github.com/modelcontextprotocol/go-sdk/mcp.(*StreamableHTTPHandler).ServeHTTP.deferwrap1() /Users/benjamin/Documents/go-sdk/mcp/streamable.go:519 +0x34 runtime.deferreturn() /opt/homebrew/Cellar/go/1.25.4/libexec/src/runtime/panic.go:589 +0x5c github.com/modelcontextprotocol/go-sdk/mcp.TestStreamableStatelessKeepaliveRace.mustNotPanic.func2() /Users/benjamin/Documents/go-sdk/mcp/streamable_test.go:2156 +0x94 net/http.HandlerFunc.ServeHTTP() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/server.go:2322 +0x48 net/http.serverHandler.ServeHTTP() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/server.go:3340 +0x270 net/http.(*conn).serve() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/server.go:2109 +0x9b0 net/http.(*Server).Serve.gowrap3() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/server.go:3493 +0x48 Goroutine 1095 (running) created at: github.com/modelcontextprotocol/go-sdk/internal/jsonrpc2.(*Connection).handleAsync() /Users/benjamin/Documents/go-sdk/internal/jsonrpc2/conn.go:715 +0x334 github.com/modelcontextprotocol/go-sdk/internal/jsonrpc2.(*Connection).acceptRequest.func2.gowrap1() /Users/benjamin/Documents/go-sdk/internal/jsonrpc2/conn.go:676 +0x34 Goroutine 1089 (running) created at: net/http.(*Server).Serve() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/server.go:3493 +0x5dc net/http/httptest.(*Server).goServe.func1() /opt/homebrew/Cellar/go/1.25.4/libexec/src/net/http/httptest/server.go:311 +0x98 ================== --- FAIL: TestStreamableStatelessKeepaliveRace (0.14s) testing.go:1617: race detected during execution of test FAIL FAIL github.com/modelcontextprotocol/go-sdk/mcp 1.727s FAIL ``` --------- Co-authored-by: Maciej Kisiel <[email protected]>
1 parent 34a19df commit 862d78a

File tree

4 files changed

+40
-54
lines changed

4 files changed

+40
-54
lines changed

mcp/client.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,8 @@ func (cs *ClientSession) ID() string {
340340
// Close is idempotent and concurrency safe.
341341
func (cs *ClientSession) Close() error {
342342
// Note: keepaliveCancel access is safe without a mutex because:
343-
// 1. keepaliveCancel is only written once during startKeepalive (happens-before all Close calls)
343+
// 1. keepaliveCancel is only written once during Client.Connect (through startKeepalive),
344+
// which happens before any code that may call Close from another goroutine
344345
// 2. context.CancelFunc is safe to call multiple times and from multiple goroutines
345346
// 3. The keepalive goroutine calls Close on ping failure, but this is safe since
346347
// Close is idempotent and conn.Close() handles concurrent calls correctly

mcp/server.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,13 @@ func (s *Server) Connect(ctx context.Context, t Transport, opts *ServerSessionOp
10311031
s.opts.Logger.Error("server connect error", "error", err)
10321032
return nil, err
10331033
}
1034+
1035+
// Start keepalive before returning the session to avoid race conditions with Close.
1036+
// This is safe because the spec allows sending pings before initialization (see ServerSession.handle for details).
1037+
if s.opts.KeepAlive > 0 {
1038+
ss.startKeepalive(ss.server.opts.KeepAlive)
1039+
}
1040+
10341041
return ss, nil
10351042
}
10361043

@@ -1058,9 +1065,6 @@ func (ss *ServerSession) initialized(ctx context.Context, params *InitializedPar
10581065
ss.server.opts.Logger.Error("duplicate initialized notification")
10591066
return nil, fmt.Errorf("duplicate %q received", notificationInitialized)
10601067
}
1061-
if ss.server.opts.KeepAlive > 0 {
1062-
ss.startKeepalive(ss.server.opts.KeepAlive)
1063-
}
10641068
if h := ss.server.opts.InitializedHandler; h != nil {
10651069
h(ctx, serverRequestFor(ss, params))
10661070
}
@@ -1110,7 +1114,7 @@ type ServerSession struct {
11101114
server *Server
11111115
conn *jsonrpc2.Connection
11121116
mcpConn Connection
1113-
keepaliveCancel context.CancelFunc // TODO: theory around why keepaliveCancel need not be guarded
1117+
keepaliveCancel context.CancelFunc
11141118

11151119
mu sync.Mutex
11161120
state ServerSessionState
@@ -1504,7 +1508,8 @@ func (ss *ServerSession) setLevel(_ context.Context, params *SetLoggingLevelPara
15041508
func (ss *ServerSession) Close() error {
15051509
if ss.keepaliveCancel != nil {
15061510
// Note: keepaliveCancel access is safe without a mutex because:
1507-
// 1. keepaliveCancel is only written once during startKeepalive (happens-before all Close calls)
1511+
// 1. keepaliveCancel is only written once during Server.Connect (through startKeepalive),
1512+
// which happens before any code that may call Close from another goroutine
15081513
// 2. context.CancelFunc is safe to call multiple times and from multiple goroutines
15091514
// 3. The keepalive goroutine calls Close on ping failure, but this is safe since
15101515
// Close is idempotent and conn.Close() handles concurrent calls correctly

mcp/server_test.go

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -508,54 +508,6 @@ func TestServerAddResourceTemplate(t *testing.T) {
508508
}
509509
}
510510

511-
// TestServerSessionkeepaliveCancelOverwritten is to verify that `ServerSession.keepaliveCancel` is assigned exactly once,
512-
// ensuring that only a single goroutine is responsible for the session's keepalive ping mechanism.
513-
func TestServerSessionkeepaliveCancelOverwritten(t *testing.T) {
514-
// Set KeepAlive to a long duration to ensure the keepalive
515-
// goroutine stays alive for the duration of the test without actually sending
516-
// ping requests, since we don't have a real client connection established.
517-
server := NewServer(testImpl, &ServerOptions{KeepAlive: 5 * time.Second})
518-
ss := &ServerSession{server: server}
519-
520-
// 1. Initialize the session.
521-
_, err := ss.initialize(context.Background(), &InitializeParams{})
522-
if err != nil {
523-
t.Fatalf("ServerSession initialize failed: %v", err)
524-
}
525-
526-
// 2. Call 'initialized' for the first time. This should start the keepalive mechanism.
527-
_, err = ss.initialized(context.Background(), &InitializedParams{})
528-
if err != nil {
529-
t.Fatalf("First initialized call failed: %v", err)
530-
}
531-
if ss.keepaliveCancel == nil {
532-
t.Fatalf("expected ServerSession.keepaliveCancel to be set after the first call of initialized")
533-
}
534-
535-
// Save the cancel function and use defer to ensure resources are cleaned up.
536-
firstCancel := ss.keepaliveCancel
537-
defer firstCancel()
538-
539-
// 3. Manually set the field to nil.
540-
// Do this to facilitate the test's core assertion. The goal is to verify that
541-
// 'ss.keepaliveCancel' is not assigned a second time. By setting it to nil,
542-
// we can easily check after the next call if a new keepalive goroutine was started.
543-
ss.keepaliveCancel = nil
544-
545-
// 4. Call 'initialized' for the second time. This should return an error.
546-
_, err = ss.initialized(context.Background(), &InitializedParams{})
547-
if err == nil {
548-
t.Fatalf("Expected 'duplicate initialized received' error on second call, got nil")
549-
}
550-
551-
// 5. Re-check the field to ensure it remains nil.
552-
// Since 'initialized' correctly returned an error and did not call
553-
// 'startKeepalive', the field should remain unchanged.
554-
if ss.keepaliveCancel != nil {
555-
t.Fatal("expected ServerSession.keepaliveCancel to be nil after we manually niled it and re-initialized")
556-
}
557-
}
558-
559511
// panicks reports whether f() panics.
560512
func panics(f func()) (b bool) {
561513
defer func() {

mcp/streamable_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,34 @@ func TestStreamableServerShutdown(t *testing.T) {
363363
}
364364
}
365365

366+
// TestStreamableStatelessKeepaliveRace verifies that there is no data race between
367+
// ServerSession.startKeepalive and ServerSession.Close in stateless servers.
368+
func TestStreamableStatelessKeepaliveRace(t *testing.T) {
369+
ctx := context.Background()
370+
server := NewServer(testImpl, &ServerOptions{KeepAlive: time.Hour})
371+
AddTool(server, &Tool{Name: "greet"}, sayHi)
372+
handler := NewStreamableHTTPHandler(
373+
func(*http.Request) *Server { return server },
374+
&StreamableHTTPOptions{Stateless: true},
375+
)
376+
httpServer := httptest.NewServer(mustNotPanic(t, handler))
377+
defer httpServer.Close()
378+
379+
for range 50 {
380+
cs, err := NewClient(testImpl, nil).Connect(ctx, &StreamableClientTransport{
381+
Endpoint: httpServer.URL,
382+
}, nil)
383+
if err != nil {
384+
t.Fatalf("NewClient() failed: %v", err)
385+
}
386+
_, _ = cs.CallTool(ctx, &CallToolParams{
387+
Name: "greet",
388+
Arguments: map[string]any{"Name": "world"},
389+
})
390+
_ = cs.Close()
391+
}
392+
}
393+
366394
// TestClientReplay verifies that the client can recover from a mid-stream
367395
// network failure and receive replayed messages (if replay is configured). It
368396
// uses a proxy that is killed and restarted to simulate a recoverable network

0 commit comments

Comments
 (0)