From 3d24c39259cc5f6e726cee2dc0b93f7a2b415836 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Sun, 20 Jul 2025 21:03:37 +0700 Subject: [PATCH 1/4] [dcgm-exporter] Add option hostNetwork for dcgmExporter Signed-off-by: aldi.j --- bundle/manifests/nvidia.com_clusterpolicies.yaml | 3 +++ config/crd/bases/nvidia.com_clusterpolicies.yaml | 3 +++ deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml | 3 +++ deployments/gpu-operator/values.yaml | 1 + 4 files changed, 10 insertions(+) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + description: Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: '[a-zA-Z0-9\-]+' diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + description: Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: '[a-zA-Z0-9\-]+' diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + description: Run the DCGM Exporter pod on the 
host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: '[a-zA-Z0-9\-]+' diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 749208de2..ad7cf43b3 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -322,6 +322,7 @@ dcgmExporter: value: "true" - name: DCGM_EXPORTER_COLLECTORS value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + hostNetwork: false resources: {} service: internalTrafficPolicy: Cluster From 9e867a3de8856ea46160d55394f73ef0ca211285 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Sun, 20 Jul 2025 21:27:31 +0700 Subject: [PATCH 2/4] [dcgm-exporter] Add option hostNetwork for dcgmExporter Signed-off-by: aldi.j --- deployments/gpu-operator/templates/clusterpolicy.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 236ef3e5c..31f576662 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -499,6 +499,9 @@ spec: {{- if .Values.dcgmExporter.imagePullSecrets }} imagePullSecrets: {{ toYaml .Values.dcgmExporter.imagePullSecrets | nindent 6 }} {{- end }} + {{- if .Values.dcgmExporter.hostNetwork }} + hostNetwork: {{ .Values.dcgmExporter.hostNetwork }} + {{- end }} {{- if .Values.dcgmExporter.resources }} resources: {{ toYaml .Values.dcgmExporter.resources | nindent 6 }} {{- end }} From e2d159d4e41d02004eb72dd075dd66fb51ecf90a Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Sun, 20 Jul 2025 22:04:25 +0700 Subject: [PATCH 3/4] [dcgm-exporter] Add option hostNetwork for dcgmExporter Signed-off-by: aldi.j --- controllers/object_controls.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index a84c8971e..d00e4d7ca 100644 --- 
a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1608,6 +1608,10 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if len(config.DCGMExporter.ImagePullSecrets) > 0 { addPullSecrets(&obj.Spec.Template.Spec, config.DCGMExporter.ImagePullSecrets) } + // set hostNetwork + if config.DCGMExporter.HostNetwork != nil { + obj.Spec.Template.Spec.HostNetwork = *config.DCGMExporter.HostNetwork + } // set resource limits if config.DCGMExporter.Resources != nil { // apply resource limits to all containers From 9d5c64637adb5e4cc83a90b3864245d545545303 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Sun, 20 Jul 2025 22:05:20 +0700 Subject: [PATCH 4/4] [dcgm-exporter] Add option hostNetwork for dcgmExporter Signed-off-by: aldi.j --- api/nvidia/v1/clusterpolicy_types.go | 7 +++++++ api/nvidia/v1/zz_generated.deepcopy.go | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 6ccca002c..6929a2920 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -884,6 +884,13 @@ type DCGMExporterSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` + // HostNetwork runs the DCGM Exporter pod on the host's network, sharing the node's network interfaces and IP address + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HostNetwork" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + HostNetwork *bool `json:"hostNetwork,omitempty"` + // Optional: Define resources requests and limits for each pod // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource 
Requirements" diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 1735b0699..2cd7c591f 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -349,6 +349,11 @@ func (in *DCGMExporterSpec) DeepCopyInto(out *DCGMExporterSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.HostNetwork != nil { + in, out := &in.HostNetwork, &out.HostNetwork + *out = new(bool) + **out = **in + } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(ResourceRequirements)