diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 6ccca002c..6929a2920 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -884,6 +884,13 @@ type DCGMExporterSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` + // Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HostNetwork" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + HostNetwork *bool `json:"hostNetwork,omitempty"` + // Optional: Define resources requests and limits for each pod // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 1735b0699..2cd7c591f 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -349,6 +349,11 @@ func (in *DCGMExporterSpec) DeepCopyInto(out *DCGMExporterSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.HostNetwork != nil { + in, out := &in.HostNetwork, &out.HostNetwork + *out = new(bool) + **out = **in + } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(ResourceRequirements) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + 
description: Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: '[a-zA-Z0-9\-]+' diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + description: Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: '[a-zA-Z0-9\-]+' diff --git a/controllers/object_controls.go b/controllers/object_controls.go index a84c8971e..d00e4d7ca 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1608,6 +1608,10 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if len(config.DCGMExporter.ImagePullSecrets) > 0 { addPullSecrets(&obj.Spec.Template.Spec, config.DCGMExporter.ImagePullSecrets) } + // set hostNetwork + if config.DCGMExporter.HostNetwork != nil { + obj.Spec.Template.Spec.HostNetwork = *config.DCGMExporter.HostNetwork + } // set resource limits if config.DCGMExporter.Resources != nil { // apply resource limits to all containers diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 2c1b9e55f..c4b5494f1 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -337,6 +337,9 @@ spec: - name type: object type: array + hostNetwork: + description: Run the DCGM Exporter pod on the host’s network, sharing the node’s network interfaces and IP address + type: boolean image: description: NVIDIA DCGM Exporter image name pattern: 
'[a-zA-Z0-9\-]+' diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 236ef3e5c..31f576662 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -499,6 +499,9 @@ spec: {{- if .Values.dcgmExporter.imagePullSecrets }} imagePullSecrets: {{ toYaml .Values.dcgmExporter.imagePullSecrets | nindent 6 }} {{- end }} + {{- if .Values.dcgmExporter.hostNetwork }} + hostNetwork: {{ .Values.dcgmExporter.hostNetwork }} + {{- end }} {{- if .Values.dcgmExporter.resources }} resources: {{ toYaml .Values.dcgmExporter.resources | nindent 6 }} {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 749208de2..ad7cf43b3 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -322,6 +322,7 @@ dcgmExporter: value: "true" - name: DCGM_EXPORTER_COLLECTORS value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + hostNetwork: false resources: {} service: internalTrafficPolicy: Cluster