Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ require (
sigs.k8s.io/yaml v1.6.0
)

require github.com/stretchr/testify v1.11.1

require (
cel.dev/expr v0.25.1 // indirect
github.com/Masterminds/semver/v3 v3.4.0 // indirect
Expand Down
133 changes: 132 additions & 1 deletion pkg/metrics/client_go_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

import (
"context"
"net/url"
"sync"
"sync/atomic"
"time"

"github.com/prometheus/client_golang/prometheus"
clientmetrics "k8s.io/client-go/tools/metrics"
Expand All @@ -35,10 +39,70 @@
Name: "rest_client_requests_total",
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
},
[]string{"code", "method", "host"},

Check failure on line 42 in pkg/metrics/client_go_adapter.go

View workflow job for this annotation

GitHub Actions / lint

string `host` has 7 occurrences, make it a constant (goconst)
)

requestLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_duration_seconds",
Help: "Request latency in seconds. Broken down by verb and host.",
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we support native histograms from the get-go?

Copy link
Copy Markdown
Member

@sbueringer sbueringer May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(see the native histogram config for controller_runtime_reconcile_time_seconds)

},
[]string{"verb", "host"},

Check failure on line 51 in pkg/metrics/client_go_adapter.go

View workflow job for this annotation

GitHub Actions / lint

string `verb` has 5 occurrences, make it a constant (goconst)
)

resolverLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_dns_resolution_duration_seconds",
Help: "DNS resolver latency in seconds. Broken down by host.",
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
},
[]string{"host"},
)

requestSize = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_size_bytes",
Help: "Request size in bytes. Broken down by verb and host.",
// 64 bytes to 16MB
Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
},
[]string{"verb", "host"},
)

responseSize = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_response_size_bytes",
Help: "Response size in bytes. Broken down by verb and host.",
// 64 bytes to 16MB
Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
},
[]string{"verb", "host"},
)

rateLimiterLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_rate_limiter_duration_seconds",
Help: "Client side rate limiter latency in seconds. Broken down by verb, and host.",
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
},
[]string{"verb", "host"},
)

requestRetry = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "rest_client_request_retries_total",
Help: "Number of request retries, partitioned by status code, verb, and host.",
},
[]string{"code", "verb", "host"},
)
)

// restMetricsEnabled gates the opt-in adapters below, they discard observations
// until RegisterDefaultRESTClientMetrics flips it. requestResult stays on
// unconditionally to preserve existing behavior.
var restMetricsEnabled atomic.Bool
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it help readability if we used an atomic pointer that switches the entire adapterset rather than having a check in each method? Is there a no-op from client-go we can reuse?


func init() {
registerClientMetrics()
}
Expand All @@ -50,7 +114,30 @@

// register the metrics with client-go
clientmetrics.Register(clientmetrics.RegisterOpts{
RequestResult: &resultAdapter{metric: requestResult},
RequestResult: &resultAdapter{metric: requestResult},
RequestLatency: &latencyAdapter{metric: requestLatency},
ResolverLatency: &resolverLatencyAdapter{metric: resolverLatency},
RequestSize: &sizeAdapter{metric: requestSize},
ResponseSize: &sizeAdapter{metric: responseSize},
RateLimiterLatency: &latencyAdapter{metric: rateLimiterLatency},
RequestRetry: &retryAdapter{metric: requestRetry},
})
}

var registerDefaultsOnce sync.Once

// RegisterDefaultRESTClientMetrics enables the client metrics.
func RegisterDefaultRESTClientMetrics() {
Copy link
Copy Markdown
Member

@sbueringer sbueringer May 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make this more flexible so it's possible to specify exactly which metrics to enable?

In Cluster API for example we probably cannot enable all of these metrics because at scale we're communicating with 1k+ different apiservers. So it would be nice to be able to enable only specific metrics.

(not sure if there's prior art somewhere for on API that allows to enable a set of metrics)

(cc @alvaroaleman)

registerDefaultsOnce.Do(func() {
Registry.MustRegister(
requestLatency,
resolverLatency,
requestSize,
responseSize,
rateLimiterLatency,
requestRetry,
)
restMetricsEnabled.Store(true)
})
}

Expand All @@ -69,3 +156,47 @@
func (r *resultAdapter) Increment(_ context.Context, code, method, host string) {
r.metric.WithLabelValues(code, method, host).Inc()
}

type latencyAdapter struct {
metric *prometheus.HistogramVec
}

func (l *latencyAdapter) Observe(_ context.Context, verb string, u url.URL, duration time.Duration) {
if !restMetricsEnabled.Load() {
return
}
l.metric.WithLabelValues(verb, u.Host).Observe(duration.Seconds())
}

type resolverLatencyAdapter struct {
metric *prometheus.HistogramVec
}

func (r *resolverLatencyAdapter) Observe(_ context.Context, host string, duration time.Duration) {
if !restMetricsEnabled.Load() {
return
}
r.metric.WithLabelValues(host).Observe(duration.Seconds())
}

type sizeAdapter struct {
metric *prometheus.HistogramVec
}

func (r *sizeAdapter) Observe(_ context.Context, verb string, host string, size float64) {
if !restMetricsEnabled.Load() {
return
}
r.metric.WithLabelValues(verb, host).Observe(size)
}

type retryAdapter struct {
metric *prometheus.CounterVec
}

func (r *retryAdapter) IncrementRetry(_ context.Context, code, verb, host string) {
if !restMetricsEnabled.Load() {
return
}
r.metric.WithLabelValues(code, verb, host).Inc()
}
82 changes: 82 additions & 0 deletions pkg/metrics/client_go_adapter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
Copyright 2026 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"context"
"net/url"
"testing"
"time"

"github.com/stretchr/testify/require"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer if we can avoid introducing testify just for two checks. Let's align to our existing tests please

clientmetrics "k8s.io/client-go/tools/metrics"
)

var optInMetricNames = []string{
"rest_client_request_duration_seconds",
"rest_client_dns_resolution_duration_seconds",
"rest_client_request_size_bytes",
"rest_client_response_size_bytes",
"rest_client_rate_limiter_duration_seconds",
"rest_client_request_retries_total",
}

func observeAllRESTClientMetrics() {
ctx := context.Background()

Check failure on line 39 in pkg/metrics/client_go_adapter_test.go

View workflow job for this annotation

GitHub Actions / lint

use of `context.Background` forbidden because "Use ginkgos SpecContext or go testings t.Context instead" (forbidigo)

Check failure on line 39 in pkg/metrics/client_go_adapter_test.go

View workflow job for this annotation

GitHub Actions / lint

use of `context.Background` forbidden because "Use ginkgos SpecContext or go testings t.Context instead" (forbidigo)
clientmetrics.RequestResult.Increment(ctx, "200", "GET", "example.com")
clientmetrics.RequestLatency.Observe(ctx, "GET", url.URL{Host: "example.com"}, 1*time.Second)
clientmetrics.ResolverLatency.Observe(ctx, "example.com", 1*time.Second)
clientmetrics.RequestSize.Observe(ctx, "GET", "example.com", 1024)
clientmetrics.ResponseSize.Observe(ctx, "GET", "example.com", 1024)
clientmetrics.RateLimiterLatency.Observe(ctx, "GET", url.URL{Host: "example.com"}, 1*time.Second)
clientmetrics.RequestRetry.IncrementRetry(ctx, "200", "GET", "example.com")
}

func gatheredNames(t *testing.T) map[string]struct{} {
t.Helper()
mfs, err := Registry.Gather()
require.NoError(t, err)

names := make(map[string]struct{})
for _, mf := range mfs {
names[mf.GetName()] = struct{}{}
}
return names
}

func TestRESTClientMetrics(t *testing.T) {
observeAllRESTClientMetrics()

names := gatheredNames(t)
if _, ok := names["rest_client_requests_total"]; !ok {
t.Error("metric rest_client_requests_total should be exposed by default")
}
for _, name := range optInMetricNames {
_, found := names[name]
require.False(t, found, "metric %s should not be found before calling RegisterDefaultRESTClientMetrics", name)
}

RegisterDefaultRESTClientMetrics()
observeAllRESTClientMetrics()

names = gatheredNames(t)
for _, name := range optInMetricNames {
_, found := names[name]
require.True(t, found, "metric %s not found", name)
}

}

Check failure on line 82 in pkg/metrics/client_go_adapter_test.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary trailing newline (whitespace)

Check failure on line 82 in pkg/metrics/client_go_adapter_test.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary trailing newline (whitespace)
Loading