-
Notifications
You must be signed in to change notification settings - Fork 1.3k
✨ Allow opt-in for client-go REST client metrics #3510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,10 @@ | |
|
|
||
| import ( | ||
| "context" | ||
| "net/url" | ||
| "sync" | ||
| "sync/atomic" | ||
| "time" | ||
|
|
||
| "github.com/prometheus/client_golang/prometheus" | ||
| clientmetrics "k8s.io/client-go/tools/metrics" | ||
|
|
@@ -35,10 +39,70 @@ | |
| Name: "rest_client_requests_total", | ||
| Help: "Number of HTTP requests, partitioned by status code, method, and host.", | ||
| }, | ||
| []string{"code", "method", "host"}, | ||
| ) | ||
|
|
||
| requestLatency = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "rest_client_request_duration_seconds", | ||
| Help: "Request latency in seconds. Broken down by verb and host.", | ||
| Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0}, | ||
| }, | ||
| []string{"verb", "host"}, | ||
| ) | ||
|
|
||
| resolverLatency = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "rest_client_dns_resolution_duration_seconds", | ||
| Help: "DNS resolver latency in seconds. Broken down by host.", | ||
| Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0}, | ||
| }, | ||
| []string{"host"}, | ||
| ) | ||
|
|
||
| requestSize = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "rest_client_request_size_bytes", | ||
| Help: "Request size in bytes. Broken down by verb and host.", | ||
| // 64 bytes to 16MB | ||
| Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216}, | ||
| }, | ||
| []string{"verb", "host"}, | ||
| ) | ||
|
|
||
| responseSize = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "rest_client_response_size_bytes", | ||
| Help: "Response size in bytes. Broken down by verb and host.", | ||
| // 64 bytes to 16MB | ||
| Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216}, | ||
| }, | ||
| []string{"verb", "host"}, | ||
| ) | ||
|
|
||
| rateLimiterLatency = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "rest_client_rate_limiter_duration_seconds", | ||
| Help: "Client side rate limiter latency in seconds. Broken down by verb, and host.", | ||
| Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0}, | ||
| }, | ||
| []string{"verb", "host"}, | ||
| ) | ||
|
|
||
| requestRetry = prometheus.NewCounterVec( | ||
| prometheus.CounterOpts{ | ||
| Name: "rest_client_request_retries_total", | ||
| Help: "Number of request retries, partitioned by status code, verb, and host.", | ||
| }, | ||
| []string{"code", "verb", "host"}, | ||
| ) | ||
| ) | ||
|
|
||
| // restMetricsEnabled gates the opt-in adapters below, they discard observations | ||
| // until RegisterDefaultRESTClientMetrics flips it. requestResult stays on | ||
| // unconditionally to preserve existing behavior. | ||
| var restMetricsEnabled atomic.Bool | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it help readability if we used an atomic pointer that switches the entire adapterset rather than having a check in each method? Is there a no-op from client-go we can reuse? |
||
|
|
||
| func init() { | ||
| registerClientMetrics() | ||
| } | ||
|
|
@@ -50,7 +114,30 @@ | |
|
|
||
| // register the metrics with client-go | ||
| clientmetrics.Register(clientmetrics.RegisterOpts{ | ||
| RequestResult: &resultAdapter{metric: requestResult}, | ||
| RequestResult: &resultAdapter{metric: requestResult}, | ||
| RequestLatency: &latencyAdapter{metric: requestLatency}, | ||
| ResolverLatency: &resolverLatencyAdapter{metric: resolverLatency}, | ||
| RequestSize: &sizeAdapter{metric: requestSize}, | ||
| ResponseSize: &sizeAdapter{metric: responseSize}, | ||
| RateLimiterLatency: &latencyAdapter{metric: rateLimiterLatency}, | ||
| RequestRetry: &retryAdapter{metric: requestRetry}, | ||
| }) | ||
| } | ||
|
|
||
| var registerDefaultsOnce sync.Once | ||
|
|
||
| // RegisterDefaultRESTClientMetrics enables the client metrics. | ||
| func RegisterDefaultRESTClientMetrics() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we make this more flexible so it's possible to specify exactly which metrics to enable? In Cluster API for example we probably cannot enable all of these metrics because at scale we're communicating with 1k+ different apiservers. So it would be nice to be able to enable only specific metrics. (not sure if there's prior art somewhere for on API that allows to enable a set of metrics) (cc @alvaroaleman) |
||
| registerDefaultsOnce.Do(func() { | ||
| Registry.MustRegister( | ||
| requestLatency, | ||
| resolverLatency, | ||
| requestSize, | ||
| responseSize, | ||
| rateLimiterLatency, | ||
| requestRetry, | ||
| ) | ||
| restMetricsEnabled.Store(true) | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -69,3 +156,47 @@ | |
| func (r *resultAdapter) Increment(_ context.Context, code, method, host string) { | ||
| r.metric.WithLabelValues(code, method, host).Inc() | ||
| } | ||
|
|
||
| type latencyAdapter struct { | ||
| metric *prometheus.HistogramVec | ||
| } | ||
|
|
||
| func (l *latencyAdapter) Observe(_ context.Context, verb string, u url.URL, duration time.Duration) { | ||
| if !restMetricsEnabled.Load() { | ||
| return | ||
| } | ||
| l.metric.WithLabelValues(verb, u.Host).Observe(duration.Seconds()) | ||
| } | ||
|
|
||
| type resolverLatencyAdapter struct { | ||
| metric *prometheus.HistogramVec | ||
| } | ||
|
|
||
| func (r *resolverLatencyAdapter) Observe(_ context.Context, host string, duration time.Duration) { | ||
| if !restMetricsEnabled.Load() { | ||
| return | ||
| } | ||
| r.metric.WithLabelValues(host).Observe(duration.Seconds()) | ||
| } | ||
|
|
||
| type sizeAdapter struct { | ||
| metric *prometheus.HistogramVec | ||
| } | ||
|
|
||
| func (r *sizeAdapter) Observe(_ context.Context, verb string, host string, size float64) { | ||
| if !restMetricsEnabled.Load() { | ||
| return | ||
| } | ||
| r.metric.WithLabelValues(verb, host).Observe(size) | ||
| } | ||
|
|
||
| type retryAdapter struct { | ||
| metric *prometheus.CounterVec | ||
| } | ||
|
|
||
| func (r *retryAdapter) IncrementRetry(_ context.Context, code, verb, host string) { | ||
| if !restMetricsEnabled.Load() { | ||
| return | ||
| } | ||
| r.metric.WithLabelValues(code, verb, host).Inc() | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| /* | ||
| Copyright 2026 The Kubernetes Authors. | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| */ | ||
|
|
||
| package metrics | ||
|
|
||
| import ( | ||
| "context" | ||
| "net/url" | ||
| "testing" | ||
| "time" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer if we can avoid introducing testify just for two checks. Let's align to our existing tests please |
||
| clientmetrics "k8s.io/client-go/tools/metrics" | ||
| ) | ||
|
|
||
| var optInMetricNames = []string{ | ||
| "rest_client_request_duration_seconds", | ||
| "rest_client_dns_resolution_duration_seconds", | ||
| "rest_client_request_size_bytes", | ||
| "rest_client_response_size_bytes", | ||
| "rest_client_rate_limiter_duration_seconds", | ||
| "rest_client_request_retries_total", | ||
| } | ||
|
|
||
| func observeAllRESTClientMetrics() { | ||
| ctx := context.Background() | ||
|
Check failure on line 39 in pkg/metrics/client_go_adapter_test.go
|
||
| clientmetrics.RequestResult.Increment(ctx, "200", "GET", "example.com") | ||
| clientmetrics.RequestLatency.Observe(ctx, "GET", url.URL{Host: "example.com"}, 1*time.Second) | ||
| clientmetrics.ResolverLatency.Observe(ctx, "example.com", 1*time.Second) | ||
| clientmetrics.RequestSize.Observe(ctx, "GET", "example.com", 1024) | ||
| clientmetrics.ResponseSize.Observe(ctx, "GET", "example.com", 1024) | ||
| clientmetrics.RateLimiterLatency.Observe(ctx, "GET", url.URL{Host: "example.com"}, 1*time.Second) | ||
| clientmetrics.RequestRetry.IncrementRetry(ctx, "200", "GET", "example.com") | ||
| } | ||
|
|
||
| func gatheredNames(t *testing.T) map[string]struct{} { | ||
| t.Helper() | ||
| mfs, err := Registry.Gather() | ||
| require.NoError(t, err) | ||
|
|
||
| names := make(map[string]struct{}) | ||
| for _, mf := range mfs { | ||
| names[mf.GetName()] = struct{}{} | ||
| } | ||
| return names | ||
| } | ||
|
|
||
| func TestRESTClientMetrics(t *testing.T) { | ||
| observeAllRESTClientMetrics() | ||
|
|
||
| names := gatheredNames(t) | ||
| if _, ok := names["rest_client_requests_total"]; !ok { | ||
| t.Error("metric rest_client_requests_total should be exposed by default") | ||
| } | ||
| for _, name := range optInMetricNames { | ||
| _, found := names[name] | ||
| require.False(t, found, "metric %s should not be found before calling RegisterDefaultRESTClientMetrics", name) | ||
| } | ||
|
|
||
| RegisterDefaultRESTClientMetrics() | ||
| observeAllRESTClientMetrics() | ||
|
|
||
| names = gatheredNames(t) | ||
| for _, name := range optInMetricNames { | ||
| _, found := names[name] | ||
| require.True(t, found, "metric %s not found", name) | ||
| } | ||
|
|
||
| } | ||
|
Check failure on line 82 in pkg/metrics/client_go_adapter_test.go
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we support native histograms from the get-go?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(see the native histogram config for controller_runtime_reconcile_time_seconds)