From 87b25e0830c85bf067435f105709977c21f8e4bf Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:32:13 +0100 Subject: [PATCH] nixos: minimize GPU image This reduces the size of the GPU-enabled image to 1.5G, from over 3G previously. It does so by de-duplicating NVIDIA drivers and disabling unnecessary NVIDIA tools like `nvidia-settings`. --- packages/nixos/gpu.nix | 89 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/packages/nixos/gpu.nix b/packages/nixos/gpu.nix index f54051d3de..7bcf738519 100644 --- a/packages/nixos/gpu.nix +++ b/packages/nixos/gpu.nix @@ -10,6 +10,67 @@ let cfg = config.contrast.gpu; + + nvidiaPackage = + ( + (config.boot.kernelPackages.nvidiaPackages.mkDriver { + # TODO(msanft): Investigate why the latest version breaks GPU containers. + version = "550.90.07"; + sha256_64bit = "sha256-Uaz1edWpiE9XOh0/Ui5/r6XnhB4iqc7AtLvq4xsLlzM="; + sha256_aarch64 = "sha256-uJa3auRlMHr8WyacQL2MyyeebqfT7K6VU0qR7LGXFXI="; + openSha256 = "sha256-VLmh7eH0xhEu/AK+Osb9vtqAFni+lx84P/bo4ZgCqj8="; + settingsSha256 = "sha256-sX9dHEp9zH9t3RWp727lLCeJLo8QRAGhVb8iN6eX49g="; + persistencedSha256 = "sha256-qe8e1Nxla7F0U88AbnOZm6cHxo57pnLCqtjdvOvq9jk="; + }).override + { + disable32Bit = true; + } + ).overrideAttrs + (oldAttrs: { + # We strip the driver package from its dependencies on desktop software like Wayland and X11. + # For server use-cases, we shouldn't need these. The Mesa (and thus Perl) and libGL dependencies are dropped + # too, as GPU workloads will likely be AI-related and not graphical. The libdrm dependency is dropped as well, + # as we're probably not going to be watching Netflix on the servers. 
+ # Source: https://github.com/NixOS/nixpkgs/blob/eac1633a086e8e109e00ce58c0b47721da1dbdfd/pkgs/os-specific/linux/nvidia-x11/generic.nix#L100C3-L114C6 + libPath = lib.makeLibraryPath ( + with pkgs; + [ + zlib + stdenv.cc.cc + openssl + dbus # for nvidia-powerd + ] + ); + + # Hack to pass the "right" (i.e. the overridden) version of the nvidia driver to the persistenced. + # Looking at the package definition, it _should_ already do so, but it doesn't. + # So for now, override all occurrences of `nvidia_x11` in the persistenced package "manually". + # We can't do an `override` on persistenced itself unfortunately, as its call site doesn't allow this: + # https://github.com/NixOS/nixpkgs/blob/4d2418ebbfb107485b44aaa1b2909409322d9061/pkgs/os-specific/linux/nvidia-x11/generic.nix#L260 + # TODO(msanft): Clarify with upstream why that is the case. + passthru = oldAttrs.passthru // { + persistenced = oldAttrs.passthru.persistenced.overrideAttrs (oldAttrs: { + inherit (nvidiaPackage) version makeFlags; + src = oldAttrs.src // { + rev = nvidiaPackage.version; + }; + + postFixup = '' + # Save a copy of persistenced for mounting in containers + mkdir $out/origBin + cp $out/{bin,origBin}/nvidia-persistenced + patchelf --set-interpreter /lib64/ld-linux-x86-64.so.2 $out/origBin/nvidia-persistenced + + patchelf --set-rpath "$(patchelf --print-rpath $out/bin/nvidia-persistenced):${nvidiaPackage}/lib" \ + $out/bin/nvidia-persistenced + ''; + + meta = oldAttrs.meta // { + inherit (nvidiaPackage.meta) platforms; + }; + }); + }; + }); in { @@ -20,26 +81,22 @@ in config = lib.mkIf cfg.enable { hardware.nvidia = { open = true; - package = lib.mkDefault ( - config.boot.kernelPackages.nvidiaPackages.mkDriver { - # TODO: Investigate why the latest version breaks - # GPU containers. 
- version = "550.90.07"; - sha256_64bit = "sha256-Uaz1edWpiE9XOh0/Ui5/r6XnhB4iqc7AtLvq4xsLlzM="; - sha256_aarch64 = "sha256-uJa3auRlMHr8WyacQL2MyyeebqfT7K6VU0qR7LGXFXI="; - openSha256 = "sha256-VLmh7eH0xhEu/AK+Osb9vtqAFni+lx84P/bo4ZgCqj8="; - settingsSha256 = "sha256-sX9dHEp9zH9t3RWp727lLCeJLo8QRAGhVb8iN6eX49g="; - persistencedSha256 = "sha256-qe8e1Nxla7F0U88AbnOZm6cHxo57pnLCqtjdvOvq9jk="; - } - ); + package = nvidiaPackage; nvidiaPersistenced = true; + # Disable NVIDIA's GUI settings tool. + nvidiaSettings = false; + # We don't need video acceleration on a server. Disabling this + # saves quite some disk space. + videoAcceleration = false; }; - hardware.graphics = { - enable = true; - enable32Bit = true; - }; + hardware.nvidia-container-toolkit.enable = true; + # Make NVIDIA the "default" graphics driver to replace Mesa, + # which saves us another Perl dependency. + hardware.graphics.package = nvidiaPackage; + hardware.graphics.package32 = nvidiaPackage; + image.repart.partitions."10-root".contents."/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source = lib.getExe pkgs.nvidia-ctk-oci-hook;