From 6c9969ef7d896a7b5ed522541caae2ddf3a416ca Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Thu, 13 Mar 2025 23:05:32 -0400 Subject: [PATCH 01/12] update repo for new version Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/versions1.json | 16 ++++++++-------- openshift/versions1.json | 16 ++++++++-------- repo.toml | 6 +++--- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/gpu-operator/versions1.json b/gpu-operator/versions1.json index 50cafe979..5e2447674 100644 --- a/gpu-operator/versions1.json +++ b/gpu-operator/versions1.json @@ -1,6 +1,14 @@ [ { "preferred": "true", + "url": "../25.3.0", + "version": "25.3.0" + }, + { + "url": "../24.9.2", + "version": "24.9.2" + }, + { "url": "../24.9.1", "version": "24.9.1" }, @@ -15,13 +23,5 @@ { "url": "../24.6.1", "version": "24.6.1" - }, - { - "url": "../24.6.0", - "version": "24.6.0" - }, - { - "url": "../24.3.0", - "version": "24.3.0" } ] \ No newline at end of file diff --git a/openshift/versions1.json b/openshift/versions1.json index 89826a87b..2d6c87c32 100644 --- a/openshift/versions1.json +++ b/openshift/versions1.json @@ -1,6 +1,14 @@ [ { "preferred": "true", + "url": "../25.3.0", + "version": "25.3.0" + }, + { + "url": "../24.9.2", + "version": "24.9.2" + }, + { "url": "../24.9.1", "version": "24.9.1" }, @@ -11,13 +19,5 @@ { "url": "../24.6.2", "version": "24.6.2" - }, - { - "url": "../24.6.1", - "version": "24.6.1" - }, - { - "url": "../24.3.0", - "version": "24.3.0" } ] \ No newline at end of file diff --git a/repo.toml b/repo.toml index d574c70a5..d1ec38bb7 100644 --- a/repo.toml +++ b/repo.toml @@ -153,8 +153,8 @@ output_format = "linkcheck" docs_root = "${root}/gpu-operator" project = "gpu-operator" name = "NVIDIA GPU Operator" -version = "24.9.2" -source_substitutions = { version = "v24.9.2", recommended = "570.86.15" } +version = "25.3.0" +source_substitutions = { version = "v25.3.0", 
recommended = "570.86.15" } copyright_start = 2020 sphinx_exclude_patterns = [ "life-cycle-policy.rst", @@ -212,7 +212,7 @@ output_format = "linkcheck" docs_root = "${root}/openshift" project = "gpu-operator-openshift" name = "NVIDIA GPU Operator on Red Hat OpenShift Container Platform" -version = "24.9.2" +version = "25.3.0" copyright_start = 2020 sphinx_exclude_patterns = [ "get-entitlement.rst", From 8bb472ea1f2c34a1d35bb09652dab0bc4041c131 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Thu, 13 Mar 2025 23:08:12 -0400 Subject: [PATCH 02/12] update platform support, release notes Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/life-cycle-policy.rst | 35 ++++------ gpu-operator/platform-support.rst | 35 ++++++---- gpu-operator/release-notes.rst | 107 ++++++++++++++++++++++++++++- 3 files changed, 144 insertions(+), 33 deletions(-) diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index d5b62915f..b14a63305 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -55,13 +55,13 @@ The product life cycle and versioning are subject to change in the future. * - GPU Operator Version - Status - * - 24.9.x + * - 25.3.x - Generally Available - * - 24.6.x + * - 24.9.x - Maintenance - * - 24.3.x and lower + * - 24.6.x and lower - EOL @@ -89,44 +89,39 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. 
- ${version} * - NVIDIA GPU Driver - - | `570.86.15 `_ (recommended), - | `565.57.01 `_ - | `560.35.03 `_ - | `550.144.03 `_ (default), - | `550.127.08 `_ - | `535.230.02 `_ - | `535.216.03 `_ + - | `570.124.06 `_ (default, recommended), + | `570.86.15 `_ * - NVIDIA Driver Manager for Kubernetes - - `v0.7.0 `__ + - `v0.8.0 `__ * - NVIDIA Container Toolkit - `1.17.4 `__ * - NVIDIA Kubernetes Device Plugin - - `0.17.0 `__ + - `0.17.1 `__ * - DCGM Exporter - - `3.3.9-3.6.1 `__ + - `4.1.1-4.0.4 `__ * - Node Feature Discovery - - v0.16.6 + - `v0.17.2 `__ * - | NVIDIA GPU Feature Discovery | for Kubernetes - `0.17.0 `__ * - NVIDIA MIG Manager for Kubernetes - - `0.10.0 `__ + - `0.12.0 `__ * - DCGM - - `3.3.9-1 `__ + - `4.1.1-2 `__ * - Validator for NVIDIA GPU Operator - ${version} * - NVIDIA KubeVirt GPU Device Plugin - - `v1.2.10 `__ + - `v1.3.1 `__ * - NVIDIA vGPU Device Manager - `v0.2.8 `__ @@ -135,14 +130,14 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. - `2.20.5 `__ * - NVIDIA Kata Manager for Kubernetes - - `v0.2.2 `__ + - `v0.2.3 `__ * - | NVIDIA Confidential Computing | Manager for Kubernetes - v0.1.1 * - NVIDIA GDRCopy Driver - - `v2.4.1-1 `__ + - `v2.4.4 `__ .. _gds-open-kernel: @@ -156,4 +151,4 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. version downloaded from the `NVIDIA vGPU Software Portal `_. - The GPU Operator is supported on all active NVIDIA data center production drivers. Refer to `Supported Drivers and CUDA Toolkit Versions `_ - for more information. + for more information. \ No newline at end of file diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index f7b941576..bda0dbbd2 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -165,6 +165,16 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: | NVIDIA T400 | Turing | +-------------------------+------------------------+ + .. 
tab-item:: G-series Products + + +-------------------------+------------------------+ + | Product | Architecture | + +=========================+========================+ + | NVIDIA HGX B200 | NVIDIA Blackwell | + +-------------------------+------------------------+ + | NVIDIA HGX GB200 NVL | NVIDIA Blackwell | + +-------------------------+------------------------+ + .. _gpu-operator-arm-platforms: @@ -258,19 +268,19 @@ The GPU Operator has been validated in the following scenarios: | NKP * - Ubuntu 20.04 LTS |fn2|_ - - 1.24---1.31 + - 1.29---1.32 - - 7.0 U3c, 8.0 U2, 8.0 U3 - - 1.24---1.31 + - 1.29---1.32 - - - 2.12, 2.13 * - Ubuntu 22.04 LTS |fn2|_ - - 1.24---1.31 + - 1.29---1.32 - - 8.0 U2, 8.0 U3 - - 1.24---1.31 + - 1.29---1.32 - - 1.26 - 2.12, 2.13 @@ -288,10 +298,10 @@ The GPU Operator has been validated in the following scenarios: | Enterprise | Linux 8.8, | 8.10 - - 1.24---1.31 + - 1.29---1.32 - - - - 1.24---1.31 + - 1.29---1.32 - - - @@ -373,14 +383,14 @@ The GPU Operator has been validated in the following scenarios: | NKP * - Ubuntu 20.04 LTS - - 1.24--1.31 + - 1.29--1.32 - - 7.0 U3c, 8.0 U2, 8.0 U3 - 1.23---1.25 - 2.12, 2.13 * - Ubuntu 22.04 LTS - - 1.24--1.31 + - 1.29--1.32 - - 8.0 U2, 8.0 U3 - @@ -397,10 +407,10 @@ The GPU Operator has been validated in the following scenarios: | Enterprise | Linux 8.4, | 8.6---8.9 - - 1.24---1.31 + - 1.29---1.32 - - - - 1.24---1.31 + - 1.29---1.32 - @@ -454,6 +464,7 @@ Support for KubeVirt and OpenShift Virtualization Red Hat OpenShift Virtualization is based on KubeVirt. 
+ ================ =========== ============= ========= ============= =========== Operating System Kubernetes KubeVirt OpenShift Virtualization ---------------- ----------- ------------------------- ---------------------------- @@ -476,7 +487,7 @@ Refer to :ref:`GPU Operator with KubeVirt` or :ref:`NVIDIA GPU Operator with Ope KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the following devices: -- H100 +- H100 - GA10x: A100, A40, RTX A6000, RTX A5500, RTX A5000, A30, A16, A10, A2. @@ -521,4 +532,4 @@ Additional Supported Container Management Tools ----------------------------------------------- * Helm v3 -* Red Hat Operator Lifecycle Manager (OLM) +* Red Hat Operator Lifecycle Manager (OLM) \ No newline at end of file diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 8ee75a294..0ece83de5 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -31,9 +31,114 @@ See the :ref:`GPU Operator Component Matrix` for a list of software components a GPU Operator beta releases are documented on `GitHub `_. NVIDIA AI Enterprise builds are not posted on GitHub. - ---- +.. _v25.3.0: + +25.3.0 +====== + +.. _v25.3.0-new-features: + +New Features +------------ + +* Added support for the following software component versions: + + - NVIDIA Container Toolkit version v1.15.0 + - NVIDIA Driver Manager for Kubernetes v0.8.0 + - NVIDIA Kubernetes Device Plugin v0.17.1 + - NVIDIA DCGM Exporter v4.1.1-4.0.4 + - NVIDIA DCGM v4.1.1-2 + - Node Feature Discovery v0.17.2 + - NVIDIA MIG Manager for Kubernetes v0.12.0 + - NVIDIA KubeVirt GPU Device Plugin v1.3.1 + - NVIDIA Kata Manager for Kubernetes v0.2.3 + - NVIDIA GDRCopy Driver v2.4.4 + +* Added support for the NVIDIA Data Center GPU Driver version 570.124.06. + +* Added new parameter, `kernelModuleType`, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. 
+ The `kernelModuleType` field supports three values to determine how the the kernal model is selected. Valid values for the new field are auto (default), open, and proprietary. + + In previous versions, the `useOpenKernelModules` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. + This field is now deprecated and will be removed in a future release. + If you were using the `useOpenKernelModules` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. + + Note, `auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + 550 and 535 branch drivers do not yet support this mode. + +* Added support for Ubuntu 24.04. + +* Added support for NVIDIA HGX GB200 NVL and NVIDIA HGX B200. + +* Added support for NVIDIA Network Operator v25.1.0. + Refer to :ref:`Support for GPUDirect RDMA` and :ref:`Support for GPUDirect Storage`. + +* Added support for OpenShift 4.18. + +* Add support for Containerd 2.0. + +* Added support for new MIG profiles with HGX B200. + + * Added support for the following profiles: + + * ``1g.23gb`` + * ``1g.23gb+me`` + * ``1g.45gb`` + * ``2g.45gb`` + * ``3g.90gb`` + * ``7g.180gb`` + + * Added an ``all-balanced`` profile creates the following GPU instances: + + * ``1g.23gb`` :math:`\times` 2 + * ``2g.45gb`` :math:`\times` 1 + * ``3g.90gb`` :math:`\times` 1 + +* Added support for new MIG profiles with HGX GB200. + + * Added support for the following profiles: + + * ``1g.24gb`` + * ``1g.24gb+me`` + * ``1g.47gb`` + * ``2g.47gb`` + * ``3g.95gb`` + * ``4g.95gb`` + * ``7g.189gb`` + + * Added an ``all-balanced`` profile creates the following GPU instances: + + * ``1g.24gb`` :math:`\times` 2 + * ``2g.47gb`` :math:`\times` 1 + * ``3g.95gb`` :math:`\times` 1 + +Improvements +------------ + +* Improved security by removing unneeded permissions in the GPU Operator ClusterRole. 
+ +* Improved GPU Operator metrics to include a `operatorMetricsNamespace` field that sets the metrcis namespace to `gpu_operator`. + +.. _v25.3.0-fixed-issues: + +Fixed Issues +------------ + +* Removed default liveness probe from the GDS and GDRCopy containers of the driver-daemonset. + Long response times of the `lsmod` commands were causing timeout errors in the probe and unnecessary restarts of the container, resulting in the DaemonSet being in a bad state. + +* Fixed an issue where the GPU Operator failed to create a valid DaemonSet name on OpenShift Container Platform when using 64 kernel page size. + Refer to Github `issue #1207 `__ for more details. + +* Removed deprecated `operator.defaultRuntime`` parameter. + +.. _v25.3.0-known-limitations: + +Known Limitations +----------------- + .. _v24.9.2: 24.9.2 From 1ec84e61ee4eedbb206a0b97381df8564ad7a129 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:55:57 -0400 Subject: [PATCH 03/12] update for kernelmoduletypes change Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/getting-started.rst | 16 ++++++++++++++-- gpu-operator/gpu-driver-configuration.rst | 12 ++++++++++-- gpu-operator/gpu-operator-rdma.rst | 12 +++++------- gpu-operator/manifests/input/nvd-demo-gold.yaml | 2 +- gpu-operator/platform-support.rst | 3 ++- gpu-operator/release-notes.rst | 17 ++++++++++++----- 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index 647a84cca..1b3834a9d 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -173,6 +173,17 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. Set this value to ``false`` when using the Operator on systems with pre-installed drivers. - ``true`` + * - ``kernelModuleType`` + - Specifies the type of the NVIDIA GPU Kernel modules to use.
+ Valid values are ``auto`` (default), ``proprietary``, and ``open``. + + ``Auto`` means that the recommended kernel module type (open or proprietary) is chosen based on the GPU devices on the host and the driver branch used. + Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + 550 and 535 branch drivers do not yet support this mode. + ``Open`` means the open kernel module is used. + ``Proprietary`` means the proprietary module is used. + - ``auto`` + * - ``driver.repository`` - The images are downloaded from NGC. Specify another image repository when using custom driver images. @@ -197,8 +208,9 @@ runs slowly in your cluster. - ``60s`` - * - ``driver.useOpenKernelModules`` - - When set to ``true``, the driver containers install the NVIDIA Open GPU Kernel module driver. + * - ``driver.useOpenKernelModules`` Deprecated. + - This field is deprecated as of v25.3.0, use ``kernelModuleType`` instead. + When set to ``true``, the driver containers install the NVIDIA Open GPU Kernel module driver. - ``false`` * - ``driver.usePrecompiled`` diff --git a/gpu-operator/gpu-driver-configuration.rst b/gpu-operator/gpu-driver-configuration.rst index 8c7378db2..25cee99cd 100644 --- a/gpu-operator/gpu-driver-configuration.rst +++ b/gpu-operator/gpu-driver-configuration.rst @@ -195,6 +195,13 @@ The following table describes some of the fields in the custom resource. - Specifies the credentials to provide to the registry if the registry is secured. - None + * - ``kernelModuleType`` + - Specifies the type of the NVIDIA GPU Kernel modules to use. + Valid values are ``auto`` (default), ``proprietary``, and ``open``. + + ``Auto`` means that the recommended kernel module type is chosen based on the GPU devices on the host and the driver branch used. + - ``auto`` + * - ``labels`` + - Specifies a map of key and value pairs to add as custom labels to the driver pod.
- None @@ -217,8 +224,9 @@ The following table describes some of the fields in the custom resource. - Specifies the container registry that contains the driver container. - ``nvcr.io/nvidia`` - * - ``useOpenKernelModules`` - - Specifies to use the NVIDIA Open GPU Kernel modules. + * - ``useOpenKernelModules`` Deprecated. + - This field is depreacted as of v25.3.0. Use ``kernelModuleType`` instead. + Specifies to use the NVIDIA Open GPU Kernel modules. - ``false`` * - ``tolerations`` diff --git a/gpu-operator/gpu-operator-rdma.rst b/gpu-operator/gpu-operator-rdma.rst index 406eaec2d..cccecf02c 100644 --- a/gpu-operator/gpu-operator-rdma.rst +++ b/gpu-operator/gpu-operator-rdma.rst @@ -30,10 +30,9 @@ To support GPUDirect RDMA, userspace CUDA APIs are required. The kernel mode support is provided by one of two approaches: DMA-BUF from the Linux kernel or the legacy ``nvidia-peermem`` kernel module. NVIDIA recommends using the DMA-BUF rather than using the ``nvidia-peermem`` kernel module from the GPU Driver. -Starting with v23.9.1 of the Operator, the Operator uses GDS driver version 2.17.5 or newer. +The Operator uses GDS driver version 2.17.5 or newer. This version and higher is only supported with the NVIDIA Open GPU Kernel module driver. -The sample commands for installing the Operator include the ``--set useOpenKernelModules=true`` -command-line argument for Helm. +Newer driver versions will use an open kernel module by default, however to make sure you are using an open model, you can include the ``--set driver.kernelModuleType=open`` command-line arugment in your Operator install command. 
In conjunction with the Network Operator, the GPU Operator can be used to set up the networking related components such as network device kernel drivers and Kubernetes device plugins to enable @@ -128,7 +127,6 @@ To use DMA-BUF and network device drivers that are installed by the Network Oper -n gpu-operator --create-namespace \ nvidia/gpu-operator \ --version=${version} \ - --set driver.useOpenKernelModules=true To use DMA-BUF and network device drivers that are installed on the host: @@ -138,11 +136,11 @@ To use DMA-BUF and network device drivers that are installed on the host: -n gpu-operator --create-namespace \ nvidia/gpu-operator \ --version=${version} \ - --set driver.useOpenKernelModules=true \ + --set driver.kernelModuleType=open \ --set driver.rdma.useHostMofed=true To use the legacy ``nvidia-peermem`` kernel module instead of DMA-BUF, add ``--set driver.rdma.enabled=true`` to either of the preceding commands. -The ``driver.useOpenKernelModules=true`` argument is optional for using the legacy kernel driver. +The ``driver.kernelModuleType=open`` argument is optional for using the legacy kernel driver. Verifying the Installation of GPUDirect with RDMA ================================================= @@ -431,11 +429,11 @@ The following sample command applies to clusters that use the Network Operator t -n gpu-operator --create-namespace \ nvidia/gpu-operator \ --version=${version} \ - --set driver.useOpenKernelModules=true \ --set gds.enabled=true Add ``--set driver.rdma.enabled=true`` to the command to use the legacy ``nvidia-peermem`` kernel module. +Add ``--set driver.kernelModuleType=open`` if you are using driver version other than 570.86.15 or 570.124.06. 
Verification ============== diff --git a/gpu-operator/manifests/input/nvd-demo-gold.yaml b/gpu-operator/manifests/input/nvd-demo-gold.yaml index eba812658..383e49932 100644 --- a/gpu-operator/manifests/input/nvd-demo-gold.yaml +++ b/gpu-operator/manifests/input/nvd-demo-gold.yaml @@ -16,6 +16,7 @@ spec: image: driver imagePullPolicy: IfNotPresent imagePullSecrets: [] + kernelModuleType: auto manager: {} nodeSelector: driver.config: "gold" @@ -30,6 +31,5 @@ spec: initialDelaySeconds: 60 periodSeconds: 10 timeoutSeconds: 60 - useOpenKernelModules: false usePrecompiled: false version: 535.104.12 diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index bda0dbbd2..6481da0f1 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -521,7 +521,8 @@ Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage. Version v2.17.5 and higher of the NVIDIA GPUDirect Storage kernel driver, ``nvidia-fs``, requires the NVIDIA Open GPU Kernel module driver. - You can install the open kernel modules by specifying the ``driver.useOpenKernelModules=true`` + You can install the open kernel modules by specifying the ``driver.kernelModuleType=auto`` if you are using driver container version 570.86.15, 570.124.06 or later. + Or use ``driver.kernelModuleType=open`` if you are using a different driver version or branch. argument to the ``helm`` command. Refer to :ref:`Common Chart Customization Options` for more information. diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 0ece83de5..cb391675a 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -58,14 +58,19 @@ New Features * Added support for the NVIDIA Data Center GPU Driver version 570.124.06. -* Added new parameter, `kernelModuleType`, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. 
- The `kernelModuleType` field supports three values to determine how the the kernal model is selected. Valid values for the new field are auto (default), open, and proprietary. +* Added new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. + + Valid values include: - In previous versions, the `useOpenKernelModules` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. + * ``auto``: Default and recommended option. Use the default kernel module type (open or proprietary) based on the GPU Operator and driver containers used. + * ``open``: Use the NVIDIA Open GPU Kernel module driver. + * ``proprietary``: Use the NVIDIA Proprietary GPU Kernel module driver. + + In previous versions, the ``useOpenKernelModules`` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. This field is now deprecated and will be removed in a future release. - If you were using the `useOpenKernelModules` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. + If you were using the ``useOpenKernelModules`` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. - Note, `auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. 550 and 535 branch drivers do not yet support this mode. * Added support for Ubuntu 24.04. @@ -121,6 +126,8 @@ Improvements * Improved GPU Operator metrics to include a `operatorMetricsNamespace` field that sets the metrcis namespace to `gpu_operator`. +* Improved error handling in Driver Manager for Kubernetes by adding pod watch permissions. + .. 
_v25.3.0-fixed-issues: Fixed Issues From aaf333fd8a36f014268be250e2f1e1eedad74cea Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:16:26 -0400 Subject: [PATCH 04/12] update support matrix Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/life-cycle-policy.rst | 2 +- gpu-operator/platform-support.rst | 33 ++++++++++++++++++------------ gpu-operator/release-notes.rst | 27 +++++++++++++++--------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index b14a63305..e03ee9745 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -124,7 +124,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. - `v1.3.1 `__ * - NVIDIA vGPU Device Manager - - `v0.2.8 `__ + - `v0.3.0 `__ * - NVIDIA GDS Driver |gds|_ - `2.20.5 `__ diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index 6481da0f1..655598761 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -285,9 +285,18 @@ The GPU Operator has been validated in the following scenarios: - 1.26 - 2.12, 2.13 + * - Ubuntu 24.04 LTS + - 1.29---1.32 + - + - 8.0 U2, 8.0 U3 + - 1.29---1.32 + - + - 1.26 + - 2.12, 2.13 + * - Red Hat Core OS - - - | 4.12---4.17 + - | 4.12---4.18 - - - @@ -326,6 +335,7 @@ The GPU Operator has been validated in the following scenarios: .. _ubuntu-kernel: :sup:`2` + For Ubuntu 24.04 LTS, kernel versions 6.8 are LTS ESM kernels. For Ubuntu 22.04 LTS, kernel versions 6.8 (non-precompiled driver containers only) 6.5 and 5.15 are LTS ESM kernels. For Ubuntu 20.04 LTS, kernel versions 5.4 and 5.15 are LTS ESM kernels. The GPU Driver containers support these Linux kernels. 
@@ -398,7 +408,7 @@ The GPU Operator has been validated in the following scenarios: * - Red Hat Core OS - - - 4.12---4.17 + - 4.12---4.18 - - - @@ -423,25 +433,22 @@ See the :doc:`precompiled-drivers` page for more on using precompiled drivers. +----------------------------+------------------------+----------------+---------------------+ | Operating System | Kernel Flavor | Kernel Version | CUDA Driver Branch | +============================+========================+================+=====================+ -| Ubuntu 22.04 | Generic | 5.15 | R535, R550 | -+----------------------------+------------------------+----------------+---------------------+ -| Ubuntu 22.04 | NVIDIA | 5.15 | R535, R550 | +| Ubuntu 22.04 | Generic, NVIDIA, Azure | 5.15 | R535, R550 | +| | AWS, Oracle | | | +----------------------------+------------------------+----------------+---------------------+ -| Ubuntu 22.04 | Azure | 5.15 | R535, R550 | -+----------------------------+------------------------+----------------+---------------------+ -| Ubuntu 22.04 | AWS | 5.15 | R535, R550 | -+----------------------------+------------------------+----------------+---------------------+ -| Ubuntu 22.04 | Oracle | 5.15 | R535, R550 | +| Ubuntu 24.04 | Generic, NVIDIA, Azure | 6.8 | R550 | +| | AWS, Oracle | | | +----------------------------+------------------------+----------------+---------------------+ + Supported Container Runtimes ---------------------------- The GPU Operator has been validated in the following scenarios: +----------------------------+------------------------+----------------+ -| Operating System | Containerd 1.4 - 1.7 | CRI-O | +| Operating System | Containerd 1.4 - 2.0 | CRI-O | +============================+========================+================+ | Ubuntu 20.04 LTS | Yes | Yes | +----------------------------+------------------------+----------------+ @@ -456,7 +463,7 @@ The GPU Operator has been validated in the following scenarios: .. 
note:: - The GPU Operator has been validated with version 2 of the containerd config file. + The GPU Operator has been validated with version 2 and 3 of the containerd config file. Support for KubeVirt and OpenShift Virtualization @@ -473,7 +480,7 @@ Operating System Kubernetes KubeVirt OpenShift Virtual ================ =========== ============= ========= ============= =========== Ubuntu 20.04 LTS 1.23---1.29 0.36+ 0.59.1+ Ubuntu 22.04 LTS 1.23---1.29 0.36+ 0.59.1+ -Red Hat Core OS 4.12---4.17 4.13---4.17 +Red Hat Core OS 4.12---4.18 4.13---4.18 ================ =========== ============= ========= ============= =========== You can run GPU passthrough and NVIDIA vGPU in the same cluster as long as you use diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index cb391675a..91c43c9a6 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -53,10 +53,14 @@ New Features - Node Feature Discovery v0.17.2 - NVIDIA MIG Manager for Kubernetes v0.12.0 - NVIDIA KubeVirt GPU Device Plugin v1.3.1 + - NVIDIA vGPU Device Manager v0.3.0 - NVIDIA Kata Manager for Kubernetes v0.2.3 - NVIDIA GDRCopy Driver v2.4.4 -* Added support for the NVIDIA Data Center GPU Driver version 570.124.06. +* Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables IMEX and Kubernetes Dynamic Resource Allocation (DRA) support. + See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. + +GB200 NVL72 with IMEX/DRA and with Driver Container (embedding the IMEX package) - user sets CDI flag to true for GPU operator * Added new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. @@ -73,16 +77,24 @@ New Features Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. 
550 and 535 branch drivers do not yet support this mode. -* Added support for Ubuntu 24.04. +* Added support for Ubuntu 24.04. - runfile based driver container + - precompiled driver container - 6.8 kernel only * Added support for NVIDIA HGX GB200 NVL and NVIDIA HGX B200. +* Added support for the NVIDIA Data Center GPU Driver version 570.124.06. + +* Added support for KubeVirt and OpenShift Virtualization with vGPU v18 for A30, A100, and H100 GPUs. + * Added support for NVIDIA Network Operator v25.1.0. Refer to :ref:`Support for GPUDirect RDMA` and :ref:`Support for GPUDirect Storage`. -* Added support for OpenShift 4.18. +* Added support for OpenShift v4.18. -* Add support for Containerd 2.0. +* Added support for Containerd v2.0. + +* Added support for Kubernetes v1.32. + Note that the minimum supported Kubernetes versions has been updated to v1.29. * Added support for new MIG profiles with HGX B200. @@ -122,7 +134,7 @@ New Features Improvements ------------ -* Improved security by removing unneeded permissions in the GPU Operator ClusterRole. +* Improved security by removing unnecessary permissions in the GPU Operator ClusterRole. * Improved GPU Operator metrics to include a `operatorMetricsNamespace` field that sets the metrcis namespace to `gpu_operator`. @@ -141,11 +153,6 @@ Fixed Issues * Removed deprecated `operator.defaultRuntime`` parameter. -.. _v25.3.0-known-limitations: - -Known Limitations ------------------ - .. 
_v24.9.2: 24.9.2 From 8c96da14c3af3dc9a933409f03309019f311391e Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Thu, 20 Mar 2025 10:53:10 -0400 Subject: [PATCH 05/12] minor adjustments Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/platform-support.rst | 4 +++- gpu-operator/release-notes.rst | 15 +++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index 655598761..0b15f9019 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -494,6 +494,8 @@ Refer to :ref:`GPU Operator with KubeVirt` or :ref:`NVIDIA GPU Operator with Ope KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the following devices: +- H200NVL with vGPU v18.0 + - H100 - GA10x: A100, A40, RTX A6000, RTX A5500, RTX A5000, A30, A16, A10, A2. @@ -510,7 +512,7 @@ Support for GPUDirect RDMA Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. -- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.10.0 +- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.10.0. - Red Hat OpenShift 4.12 and higher with Network Operator 23.10.0 For information about configuring GPUDirect RDMA, refer to :doc:`gpu-operator-rdma`. diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 91c43c9a6..4698bdfd4 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -58,11 +58,11 @@ New Features - NVIDIA GDRCopy Driver v2.4.4 * Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables IMEX and Kubernetes Dynamic Resource Allocation (DRA) support. - See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. 
-GB200 NVL72 with IMEX/DRA and with Driver Container (embedding the IMEX package) - user sets CDI flag to true for GPU operator + This component is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL or NVIDIA HGX B200, and with CDI enabled on your GPU Operator. + See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. -* Added new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. +* Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. Valid values include: @@ -70,15 +70,14 @@ GB200 NVL72 with IMEX/DRA and with Driver Container (embedding the IMEX package) * ``open``: Use the NVIDIA Open GPU Kernel module driver. * ``proprietary``: Use the NVIDIA Proprietary GPU Kernel module driver. + Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + 550 and 535 branch drivers do not yet support this mode. + In previous versions, the ``useOpenKernelModules`` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. This field is now deprecated and will be removed in a future release. If you were using the ``useOpenKernelModules`` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. - Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. - 550 and 535 branch drivers do not yet support this mode. - -* Added support for Ubuntu 24.04. - runfile based driver container - - precompiled driver container - 6.8 kernel only +* Added support for Ubuntu 24.04. * Added support for NVIDIA HGX GB200 NVL and NVIDIA HGX B200. 
From 8206cd4a8a9ef4a594ab1bd6b77c972414f4c9e0 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Fri, 21 Mar 2025 14:35:15 -0400 Subject: [PATCH 06/12] updates from review Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/gpu-driver-configuration.rst | 2 +- gpu-operator/gpu-operator-rdma.rst | 6 ++++-- gpu-operator/life-cycle-policy.rst | 2 +- gpu-operator/release-notes.rst | 7 ++++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gpu-operator/gpu-driver-configuration.rst b/gpu-operator/gpu-driver-configuration.rst index 25cee99cd..4467051df 100644 --- a/gpu-operator/gpu-driver-configuration.rst +++ b/gpu-operator/gpu-driver-configuration.rst @@ -225,7 +225,7 @@ The following table describes some of the fields in the custom resource. - ``nvcr.io/nvidia`` * - ``useOpenKernelModules`` Deprecated. - - This field is depreacted as of v25.3.0. Use ``kernelModuleType`` instead. + - This field is deprecated as of v25.3.0 and will be ignored. Use ``kernelModuleType`` instead. Specifies to use the NVIDIA Open GPU Kernel modules. - ``false`` diff --git a/gpu-operator/gpu-operator-rdma.rst b/gpu-operator/gpu-operator-rdma.rst index cccecf02c..292792c91 100644 --- a/gpu-operator/gpu-operator-rdma.rst +++ b/gpu-operator/gpu-operator-rdma.rst @@ -32,7 +32,9 @@ NVIDIA recommends using the DMA-BUF rather than using the ``nvidia-peermem`` ker The Operator uses GDS driver version 2.17.5 or newer. This version and higher is only supported with the NVIDIA Open GPU Kernel module driver. -Newer driver versions will use an open kernel module by default, however to make sure you are using an open model, you can include the ``--set driver.kernelModuleType=open`` command-line arugment in your Operator install command. +In GPU Operator v25.3.0 and later, the ``driver.kernelModuleType`` default is ``auto``, for the supported driver versions. 
+This configuration allows the GPU Operator to choose the recommended driver kernel module type depending on the driver branch and the GPU devices available. +Newer driver versions will use an open kernel module by default, however to make sure you are using an open model, include ``--set driver.kernelModuleType=open`` command-line arugment in your Operator install command. In conjunction with the Network Operator, the GPU Operator can be used to set up the networking related components such as network device kernel drivers and Kubernetes device plugins to enable @@ -433,7 +435,7 @@ The following sample command applies to clusters that use the Network Operator t Add ``--set driver.rdma.enabled=true`` to the command to use the legacy ``nvidia-peermem`` kernel module. -Add ``--set driver.kernelModuleType=open`` if you are using driver version other than 570.86.15 or 570.124.06. +Add ``--set driver.kernelModuleType=open`` if you are using a driver version from a branch earlier than R570. Verification ============== diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index e03ee9745..cda55a3be 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -112,7 +112,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. 
- `0.17.0 `__ * - NVIDIA MIG Manager for Kubernetes - - `0.12.0 `__ + - `0.12.1 `__ * - DCGM - `4.1.1-2 `__ diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 4698bdfd4..86f5cce75 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -51,7 +51,7 @@ New Features - NVIDIA DCGM Exporter v4.1.1-4.0.4 - NVIDIA DCGM v4.1.1-2 - Node Feature Discovery v0.17.2 - - NVIDIA MIG Manager for Kubernetes v0.12.0 + - NVIDIA MIG Manager for Kubernetes v0.12.1 - NVIDIA KubeVirt GPU Device Plugin v1.3.1 - NVIDIA vGPU Device Manager v0.3.0 - NVIDIA Kata Manager for Kubernetes v0.2.3 @@ -70,7 +70,7 @@ New Features * ``open``: Use the NVIDIA Open GPU Kernel module driver. * ``proprietary``: Use the NVIDIA Proprietary GPU Kernel module driver. - Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + Currently, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. 550 and 535 branch drivers do not yet support this mode. In previous versions, the ``useOpenKernelModules`` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. @@ -104,6 +104,7 @@ New Features * ``1g.45gb`` * ``2g.45gb`` * ``3g.90gb`` + * ``4g.90gb`` * ``7g.180gb`` * Added an ``all-balanced`` profile creates the following GPU instances: @@ -144,7 +145,7 @@ Improvements Fixed Issues ------------ -* Removed default liveiness prode from the GDS and GDRCopy containers of the driver-daemonset. +* Removed default liveness probe from the GDS and GDRCopy containers of the driver-daemonset. Long response times of the `lsmod` commands were causing timeout errors in the probe and unnecessary restarts of the container, resulting in the DaemonSet being in a bad state. * Fixed an issue where the GPU Operator failed to create a valid DaemonSet name on OpenShift Container Platform when using 64 kernel page size. 
From 8ed7a159664ae2de588032d97a40a1c2a185e8ae Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Mon, 24 Mar 2025 15:54:40 -0400 Subject: [PATCH 07/12] updates from review Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/getting-started.rst | 2 +- gpu-operator/gpu-operator-rdma.rst | 5 ++--- gpu-operator/life-cycle-policy.rst | 4 ++-- gpu-operator/platform-support.rst | 25 ++++++++++++------------- gpu-operator/release-notes.rst | 12 ++++++++---- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index 1b3834a9d..4ed0d5a94 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -209,7 +209,7 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. - ``60s`` * - ``driver.useOpenKernelModules`` Deprecated. - - This field is deprecated as of v25.3.0, use ``kernelModelueType`` instead. + - This field is deprecated as of v25.3.0 and will be ignored. Use ``kernelModuleType`` instead. When set to ``true``, the driver containers install the NVIDIA Open GPU Kernel module driver. - ``false`` diff --git a/gpu-operator/gpu-operator-rdma.rst b/gpu-operator/gpu-operator-rdma.rst index 292792c91..01a219792 100644 --- a/gpu-operator/gpu-operator-rdma.rst +++ b/gpu-operator/gpu-operator-rdma.rst @@ -34,7 +34,7 @@ The Operator uses GDS driver version 2.17.5 or newer. This version and higher is only supported with the NVIDIA Open GPU Kernel module driver. In GPU Operator v25.3.0 and later, the ``driver.kernelModuleType`` default is ``auto``, for the supported driver versions. This configuration allows the GPU Operator to choose the recommended driver kernel module type depending on the driver branch and the GPU devices available. 
-Newer driver versions will use an open kernel module by default, however to make sure you are using an open model, include ``--set driver.kernelModuleType=open`` command-line arugment in your Operator install command. +Newer driver versions will use the open kernel module by default, however to make sure you are using the open kernel model, include ``--set driver.kernelModuleType=open`` command-line argument in your helm Operator install command. In conjunction with the Network Operator, the GPU Operator can be used to set up the networking related components such as network device kernel drivers and Kubernetes device plugins to enable @@ -138,11 +138,10 @@ To use DMA-BUF and network device drivers that are installed on the host: -n gpu-operator --create-namespace \ nvidia/gpu-operator \ --version=${version} \ - --set driver.kernelModuleType=open \ --set driver.rdma.useHostMofed=true To use the legacy ``nvidia-peermem`` kernel module instead of DMA-BUF, add ``--set driver.rdma.enabled=true`` to either of the preceding commands. -The ``driver.kernelModuleType=open`` argument is optional for using the legacy kernel driver. +Add ``--set driver.kernelModuleType=open`` if you are using a driver version from a branch earlier than R570. Verifying the Installation of GPUDirect with RDMA ================================================= diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index cda55a3be..ccd4967b1 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -96,7 +96,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. - `v0.8.0 `__ * - NVIDIA Container Toolkit - - `1.17.4 `__ + - `1.17.5 `__ * - NVIDIA Kubernetes Device Plugin - `0.17.1 `__ @@ -109,7 +109,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. 
* - | NVIDIA GPU Feature Discovery | for Kubernetes - - `0.17.0 `__ + - `0.17.1 `__ * - NVIDIA MIG Manager for Kubernetes - `0.12.1 `__ diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index 0b15f9019..9cdd7d5b7 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -165,7 +165,7 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: | NVIDIA T400 | Turing | +-------------------------+------------------------+ - .. tab-item:: G-series Products + .. tab-item:: B-series Products +-------------------------+------------------------+ | Product | Architecture | @@ -288,11 +288,11 @@ The GPU Operator has been validated in the following scenarios: * - Ubuntu 24.04 LTS - 1.29---1.32 - - - 8.0 U2, 8.0 U3 - - 1.29---1.32 - - - 1.26 - - 2.12, 2.13 + - + - + - + - * - Red Hat Core OS - @@ -335,7 +335,6 @@ The GPU Operator has been validated in the following scenarios: .. _ubuntu-kernel: :sup:`2` - For Ubuntu 24.04 LTS, kernel versions 6.8 are LTS ESM kernels. For Ubuntu 22.04 LTS, kernel versions 6.8 (non-precompiled driver containers only) 6.5 and 5.15 are LTS ESM kernels. For Ubuntu 20.04 LTS, kernel versions 5.4 and 5.15 are LTS ESM kernels. The GPU Driver containers support these Linux kernels. @@ -436,7 +435,7 @@ See the :doc:`precompiled-drivers` page for more on using precompiled drivers. 
| Ubuntu 22.04 | Generic, NVIDIA, Azure | 5.15 | R535, R550 | | | AWS, Oracle | | | +----------------------------+------------------------+----------------+---------------------+ -| Ubuntu 24.04 | Generic, NVIDIA, Azure | 6.8 | R550 | +| Ubuntu 24.04 | Generic, NVIDIA, Azure | 6.8 | R550, R570 | | | AWS, Oracle | | | +----------------------------+------------------------+----------------+---------------------+ @@ -448,12 +447,14 @@ Supported Container Runtimes The GPU Operator has been validated in the following scenarios: +----------------------------+------------------------+----------------+ -| Operating System | Containerd 1.4 - 2.0 | CRI-O | +| Operating System | Containerd 1.6 - 2.0 | CRI-O | +============================+========================+================+ | Ubuntu 20.04 LTS | Yes | Yes | +----------------------------+------------------------+----------------+ | Ubuntu 22.04 LTS | Yes | Yes | +----------------------------+------------------------+----------------+ +| Ubuntu 24.04 LTS | Yes | Yes | ++----------------------------+------------------------+----------------+ | CentOS 7 | Yes | No | +----------------------------+------------------------+----------------+ | Red Hat Core OS (RHCOS) | No | Yes | @@ -461,10 +462,6 @@ The GPU Operator has been validated in the following scenarios: | Red Hat Enterprise Linux 8 | Yes | Yes | +----------------------------+------------------------+----------------+ -.. note:: - - The GPU Operator has been validated with version 2 and 3 of the containerd config file. 
- Support for KubeVirt and OpenShift Virtualization ------------------------------------------------- @@ -478,6 +475,7 @@ Operating System Kubernetes KubeVirt OpenShift Virtual \ \ | GPU vGPU | GPU vGPU | Passthrough | Passthrough ================ =========== ============= ========= ============= =========== +Ubuntu 24.04 LTS 1.23---1.29 0.36+ 0.59.1+ Ubuntu 20.04 LTS 1.23---1.29 0.36+ 0.59.1+ Ubuntu 22.04 LTS 1.23---1.29 0.36+ 0.59.1+ Red Hat Core OS 4.12---4.18 4.13---4.18 @@ -494,7 +492,7 @@ Refer to :ref:`GPU Operator with KubeVirt` or :ref:`NVIDIA GPU Operator with Ope KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the following devices: -- H200NVL with vGPU v18.0 +- H200NVL - H100 @@ -512,6 +510,7 @@ Support for GPUDirect RDMA Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. +- Ubuntu 22.04 LTS Network Operator 25.1.0. - Ubuntu 20.04 and 22.04 LTS with Network Operator 24.10.0. - Red Hat OpenShift 4.12 and higher with Network Operator 23.10.0 diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 86f5cce75..2d31985f8 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -45,21 +45,23 @@ New Features * Added support for the following software component versions: - - NVIDIA Container Toolkit version v1.15.0 + - NVIDIA Container Toolkit version v1.17.5 - NVIDIA Driver Manager for Kubernetes v0.8.0 - NVIDIA Kubernetes Device Plugin v0.17.1 - NVIDIA DCGM Exporter v4.1.1-4.0.4 - NVIDIA DCGM v4.1.1-2 - Node Feature Discovery v0.17.2 + - NVIDIA GPU Feature Discovery for Kubernetes v0.17.1 - NVIDIA MIG Manager for Kubernetes v0.12.1 - NVIDIA KubeVirt GPU Device Plugin v1.3.1 - NVIDIA vGPU Device Manager v0.3.0 - NVIDIA Kata Manager for Kubernetes v0.2.3 - NVIDIA GDRCopy Driver v2.4.4 -* Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables IMEX and Kubernetes Dynamic Resource Allocation (DRA) support. 
+* Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEXsupport. - This component is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL or NVIDIA HGX B200, and with CDI enabled on your GPU Operator. + This component is an additional component that can be installed alongside the the GPU Operator. + It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL or NVIDIA HGX B200, and with CDI enabled on your GPU Operator. See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. * Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. @@ -77,7 +79,7 @@ New Features This field is now deprecated and will be removed in a future release. If you were using the ``useOpenKernelModules`` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. -* Added support for Ubuntu 24.04. +* Added support for Ubuntu 24.04 LTS. * Added support for NVIDIA HGX GB200 NVL and NVIDIA HGX B200. @@ -95,6 +97,8 @@ New Features * Added support for Kubernetes v1.32. Note that the minimum supported Kubernetes versions has been updated to v1.29. +* Added support for Network Operator v25.1.0. + * Added support for new MIG profiles with HGX B200. 
* Added support for the following profiles: From c6e5b7d8a84b93713921acc854303427c85e2da2 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:00:48 -0400 Subject: [PATCH 08/12] few mroe typos Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/release-notes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 2d31985f8..b345d3a33 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -64,11 +64,11 @@ New Features It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL or NVIDIA HGX B200, and with CDI enabled on your GPU Operator. See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. -* Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel models to use. +* Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel modules to use. Valid values include: - * ``auto``: Default and recommended option. Use the default kernel module type (open or proprietary) based on the GPU Operator and driver containers used. + * ``auto``: Default and recommended option. ``auto`` means that the recommended kernel module type (open or proprietary) is chosen based on the GPU devices on the host and the driver branch used. * ``open``: Use the NVIDIA Open GPU Kernel module driver. * ``proprietary``: Use the NVIDIA Proprietary GPU Kernel module driver. @@ -140,7 +140,7 @@ Improvements * Improved security by removing unnecessary permissions in the GPU Operator ClusterRole. 
-* Improved GPU Operator metrics to include a `operatorMetricsNamespace` field that sets the metrcis namespace to `gpu_operator`.
+* Improved GPU Operator metrics to include a `operatorMetricsNamespace` field that sets the metrics namespace to `gpu_operator`.
 
 * Improved error handling in Driver Manager for Kubernetes by adding pod watch permissions.
 

From b2f7e03d53bccaea773b30315b6ed7d77ab9612e Mon Sep 17 00:00:00 2001
From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com>
Date: Tue, 25 Mar 2025 23:19:47 -0400
Subject: [PATCH 09/12] add ubuntu 24.04 and remove wrong GPU

Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com>
---
 gpu-operator/platform-support.rst | 7 +++++++
 gpu-operator/release-notes.rst    | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst
index 9cdd7d5b7..a9d185001 100644
--- a/gpu-operator/platform-support.rst
+++ b/gpu-operator/platform-support.rst
@@ -405,6 +405,13 @@ The GPU Operator has been validated in the following scenarios:
       -
       - 2.12, 2.13
 
+    * - Ubuntu 24.04 LTS
+      - 1.29---1.32
+      -
+      -
+      -
+      -
+
     * - Red Hat Core OS
       -
       - 4.12---4.18
diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst
index b345d3a33..305b879b7 100644
--- a/gpu-operator/release-notes.rst
+++ b/gpu-operator/release-notes.rst
@@ -61,7 +61,7 @@ New Features
 * Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEXsupport.
 
   This component is an additional component that can be installed alongside the the GPU Operator.
-  It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL or NVIDIA HGX B200, and with CDI enabled on your GPU Operator.
+  It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL, and with CDI enabled on your GPU Operator.
See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. * Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel modules to use. From dc9f1b79f677560f14172932027f2ead3a4131b8 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 26 Mar 2025 12:13:36 -0400 Subject: [PATCH 10/12] minor typos Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/gpu-operator-rdma.rst | 2 +- gpu-operator/release-notes.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu-operator/gpu-operator-rdma.rst b/gpu-operator/gpu-operator-rdma.rst index 01a219792..f7a00a2aa 100644 --- a/gpu-operator/gpu-operator-rdma.rst +++ b/gpu-operator/gpu-operator-rdma.rst @@ -34,7 +34,7 @@ The Operator uses GDS driver version 2.17.5 or newer. This version and higher is only supported with the NVIDIA Open GPU Kernel module driver. In GPU Operator v25.3.0 and later, the ``driver.kernelModuleType`` default is ``auto``, for the supported driver versions. This configuration allows the GPU Operator to choose the recommended driver kernel module type depending on the driver branch and the GPU devices available. -Newer driver versions will use the open kernel module by default, however to make sure you are using the open kernel model, include ``--set driver.kernelModuleType=open`` command-line argument in your helm Operator install command. +Newer driver versions will use the open kernel module by default, however to make sure you are using the open kernel module, include ``--set driver.kernelModuleType=open`` command-line argument in your helm Operator install command. 
In conjunction with the Network Operator, the GPU Operator can be used to set up the networking related components such as network device kernel drivers and Kubernetes device plugins to enable diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 305b879b7..058ebe0e1 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -69,8 +69,8 @@ New Features Valid values include: * ``auto``: Default and recommended option. ``auto`` means that the recommended kernel module type (open or proprietary) is chosen based on the GPU devices on the host and the driver branch used. - * ``open``: Use the NVIDIA Open GPU Kernel module driver. - * ``proprietary``: Use the NVIDIA Proprietary GPU Kernel module driver. + * ``open``: Use the NVIDIA Open GPU kernel module driver. + * ``proprietary``: Use the NVIDIA Proprietary GPU kernel module driver. Currently, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. 550 and 535 branch drivers do not yet support this mode. From 643d53a618f11754135fdac6df4e09f9e34d2ac2 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 26 Mar 2025 13:49:10 -0400 Subject: [PATCH 11/12] add coming soon for dra Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/release-notes.rst | 3 +-- repo.toml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 058ebe0e1..6d9788be7 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -58,11 +58,10 @@ New Features - NVIDIA Kata Manager for Kubernetes v0.2.3 - NVIDIA GDRCopy Driver v2.4.4 -* Added support for the NVIDIA GPU DRA Driver v25.3.0 component which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEXsupport. 
+* Added support for the NVIDIA GPU DRA Driver v25.3.0 component (coming soon) which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEXsupport. This component is an additional component that can be installed alongside the the GPU Operator. It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL, and with CDI enabled on your GPU Operator. - See the `IMEX DRA Driver Support `__ documentation for more details on installing this component and running workloads. * Added a new parameter, ``kernelModuleType``, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel modules to use. diff --git a/repo.toml b/repo.toml index d1ec38bb7..ac0604494 100644 --- a/repo.toml +++ b/repo.toml @@ -154,7 +154,7 @@ docs_root = "${root}/gpu-operator" project = "gpu-operator" name = "NVIDIA GPU Operator" version = "25.3.0" -source_substitutions = { version = "v25.3.0", recommended = "570.86.15" } +source_substitutions = { version = "v25.3.0", recommended = "570.124.06" } copyright_start = 2020 sphinx_exclude_patterns = [ "life-cycle-policy.rst", From 5ed9830c250c77780cb081958619ac8395f1322e Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:37:55 -0400 Subject: [PATCH 12/12] small typos Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/release-notes.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 6d9788be7..6cdd40f5f 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -58,7 +58,7 @@ New Features - NVIDIA Kata Manager for Kubernetes v0.2.3 - NVIDIA GDRCopy Driver v2.4.4 -* Added support for the NVIDIA GPU DRA Driver v25.3.0 component (coming soon) which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation 
(DRA) and IMEXsupport. +* Added support for the NVIDIA GPU DRA Driver v25.3.0 component (coming soon) which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEX support. This component is an additional component that can be installed alongside the the GPU Operator. It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL, and with CDI enabled on your GPU Operator. @@ -74,9 +74,9 @@ New Features Currently, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. 550 and 535 branch drivers do not yet support this mode. - In previous versions, the ``useOpenKernelModules`` field specified the driver containers to install the NVIDIA Open GPU Kernel module driver. + In previous versions, the ``useOpenKernelModules`` field specified the driver containers to install the NVIDIA Open GPU kernel module driver. This field is now deprecated and will be removed in a future release. - If you were using the ``useOpenKernelModules`` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. + If you were using the ``useOpenKernelModules`` field, it's recommended that you update your configuration to use the ``kernelModuleType`` field instead. * Added support for Ubuntu 24.04 LTS. @@ -148,7 +148,7 @@ Improvements Fixed Issues ------------ -* Removed default liveness probe from the GDS and GDRCopy containers of the driver-daemonset. +* Removed default liveness probe from the ``nvidia-fs-ctr`` and ``nvidia-gdrcopy-ctr`` containers of the GPU driver daemonset. Long response times of the `lsmod` commands were causing timeout errors in the probe and unnecessary restarts of the container, resulting in the DaemonSet being in a bad state. * Fixed an issue where the GPU Operator failed to create a valid DaemonSet name on OpenShift Container Platform when using 64 kernel page size.