From a9c8b4f9e3a4ce766dd5bdaf0d95e96f5821b0c9 Mon Sep 17 00:00:00 2001 From: Rajath Agasthya Date: Thu, 2 Apr 2026 13:27:09 -0500 Subject: [PATCH] Run toolkit validation in operand init containers The toolkit-validation init containers in operand DaemonSets previously polled for a toolkit-ready sentinel file on the host. It is possible for operands (in unknown situations) to find a stale toolkit-ready file from a previous cycle, passing the init gate while nvidia-smi would actually fail. Replace the shell-based sentinel file check with an nvidia-validator check. This runs nvidia-smi through the toolkit runtime wrapper and retries until it succeeds, validating both toolkit injection and driver module readiness without depending on host sentinel files. Signed-off-by: Rajath Agasthya --- assets/gpu-feature-discovery/0500_daemonset.yaml | 11 +++++++++-- assets/state-container-toolkit/0500_daemonset.yaml | 4 ++++ assets/state-dcgm-exporter/0800_daemonset.yaml | 13 ++++++++++--- assets/state-dcgm/0400_dcgm.yml | 11 +++++++++-- assets/state-device-plugin/0500_daemonset.yaml | 11 +++++++++-- assets/state-driver/0500_daemonset.yaml | 2 +- assets/state-mig-manager/0600_daemonset.yaml | 11 +++++++++-- assets/state-mps-control-daemon/0400_daemonset.yaml | 11 +++++++++-- 8 files changed, 60 insertions(+), 14 deletions(-) diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml index 765a84e7a..441cc4e03 100644 --- a/assets/gpu-feature-discovery/0500_daemonset.yaml +++ b/assets/gpu-feature-discovery/0500_daemonset.yaml @@ -29,13 +29,20 @@ spec: - name: toolkit-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit 
securityContext: privileged: true volumeMounts: - name: run-nvidia mountPath: /run/nvidia - mountPropagation: HostToContainer + mountPropagation: Bidirectional - name: config-manager-init image: "FILLED BY THE OPERATOR" command: ["config-manager"] diff --git a/assets/state-container-toolkit/0500_daemonset.yaml b/assets/state-container-toolkit/0500_daemonset.yaml index 75f802e10..790cfa933 100644 --- a/assets/state-container-toolkit/0500_daemonset.yaml +++ b/assets/state-container-toolkit/0500_daemonset.yaml @@ -77,6 +77,10 @@ spec: fieldPath: metadata.namespace imagePullPolicy: IfNotPresent name: nvidia-container-toolkit-ctr + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/toolkit-ready"] securityContext: privileged: true seLinuxOptions: diff --git a/assets/state-dcgm-exporter/0800_daemonset.yaml b/assets/state-dcgm-exporter/0800_daemonset.yaml index c826aafbb..47d6451d5 100644 --- a/assets/state-dcgm-exporter/0800_daemonset.yaml +++ b/assets/state-dcgm-exporter/0800_daemonset.yaml @@ -28,13 +28,20 @@ spec: - name: toolkit-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit securityContext: privileged: true volumeMounts: - name: run-nvidia - mountPath: "/run/nvidia" - mountPropagation: HostToContainer + mountPath: /run/nvidia + mountPropagation: Bidirectional containers: - image: "FILLED BY THE OPERATOR" name: nvidia-dcgm-exporter diff --git a/assets/state-dcgm/0400_dcgm.yml b/assets/state-dcgm/0400_dcgm.yml index 0c27fd4e6..58b9170bb 100644 --- a/assets/state-dcgm/0400_dcgm.yml +++ b/assets/state-dcgm/0400_dcgm.yml @@ -28,13 +28,20 @@ spec: - name: toolkit-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - 
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit securityContext: privileged: true volumeMounts: - name: run-nvidia mountPath: /run/nvidia - mountPropagation: HostToContainer + mountPropagation: Bidirectional containers: - image: "FILLED BY THE OPERATOR" name: nvidia-dcgm-ctr diff --git a/assets/state-device-plugin/0500_daemonset.yaml b/assets/state-device-plugin/0500_daemonset.yaml index c4d85adfb..be8076ee7 100644 --- a/assets/state-device-plugin/0500_daemonset.yaml +++ b/assets/state-device-plugin/0500_daemonset.yaml @@ -28,13 +28,20 @@ spec: - image: "FILLED BY THE OPERATOR" name: toolkit-validation command: ['sh', '-c'] - args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit securityContext: privileged: true volumeMounts: - name: run-nvidia-validations mountPath: /run/nvidia/validations - mountPropagation: HostToContainer + mountPropagation: Bidirectional - image: "FILLED BY THE OPERATOR" name: config-manager-init command: ["config-manager"] diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml index 853cf6fc9..59043811a 100644 --- a/assets/state-driver/0500_daemonset.yaml +++ b/assets/state-driver/0500_daemonset.yaml @@ -150,7 +150,7 @@ spec: lifecycle: preStop: exec: - command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready"] + command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready /run/nvidia/validations/driver-ready"] - image: "FILLED BY THE OPERATOR" imagePullPolicy: IfNotPresent name: nvidia-peermem-ctr diff 
--git a/assets/state-mig-manager/0600_daemonset.yaml b/assets/state-mig-manager/0600_daemonset.yaml index 48315b422..d5884be87 100644 --- a/assets/state-mig-manager/0600_daemonset.yaml +++ b/assets/state-mig-manager/0600_daemonset.yaml @@ -28,13 +28,20 @@ spec: - name: toolkit-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container toolkit to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit securityContext: privileged: true volumeMounts: - name: run-nvidia-validations mountPath: /run/nvidia/validations - mountPropagation: HostToContainer + mountPropagation: Bidirectional containers: - name: nvidia-mig-manager image: "FILLED BY THE OPERATOR" diff --git a/assets/state-mps-control-daemon/0400_daemonset.yaml b/assets/state-mps-control-daemon/0400_daemonset.yaml index 3be58af20..a47d1fe93 100644 --- a/assets/state-mps-control-daemon/0400_daemonset.yaml +++ b/assets/state-mps-control-daemon/0400_daemonset.yaml @@ -30,13 +30,20 @@ spec: - image: "FILLED BY THE OPERATOR" name: toolkit-validation command: ['sh', '-c'] - args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"] + args: ["nvidia-validator"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: toolkit securityContext: privileged: true volumeMounts: - name: run-nvidia mountPath: /run/nvidia - mountPropagation: HostToContainer + mountPropagation: Bidirectional - image: "FILLED BY THE OPERATOR" name: mps-control-daemon-mounts command: [mps-control-daemon, mount-shm]