From 2e53df7b44371b91ed28bb90b1a78d9f854ba69c Mon Sep 17 00:00:00 2001 From: Philip Taron Date: Fri, 22 Dec 2023 12:33:09 -0800 Subject: [PATCH] flake.nix: rewrite 1. Split into separate files per output. 2. Added overlays, so that this flake can be integrated into others. The names in the overlay are `llama-cpp`, `llama-cpp-opencl`, `llama-cpp-cuda`, and `llama-cpp-rocm` so that they fit into the broader set of Nix packages from [nixpkgs](https://github.com/nixos/nixpkgs). 3. Use [callPackage](https://summer.nixos.org/blog/callpackage-a-tool-for-the-lazy/) rather than `with pkgs;` so that there's dependency injection rather than dependency lookup. 4. Add a description and meta information for each package. The description includes a bit about what's trying to accelerate each one. --- apps.nix | 14 ++++ devshells.nix | 32 +++++++++ flake.lock | 40 +----------- flake.nix | 178 +++++++++++++------------------------------------- llama-cpp.nix | 118 +++++++++++++++++++++++++++++++++ overlays.nix | 20 ++++++ 6 files changed, 231 insertions(+), 171 deletions(-) create mode 100644 apps.nix create mode 100644 devshells.nix create mode 100644 llama-cpp.nix create mode 100644 overlays.nix diff --git a/apps.nix b/apps.nix new file mode 100644 index 00000000000000..29774220a6c783 --- /dev/null +++ b/apps.nix @@ -0,0 +1,14 @@ +names: pkgs: + +let + default = builtins.elemAt names 0; + mkApp = name: { + ${name} = { + type = "app"; + program = "${pkgs.llama-cpp}/bin/${name}"; + }; + }; + result = builtins.foldl' (acc: name: (mkApp name) // acc) {} names; +in + +result // { default = result.${default}; } diff --git a/devshells.nix b/devshells.nix new file mode 100644 index 00000000000000..1a2bbe545e1bcb --- /dev/null +++ b/devshells.nix @@ -0,0 +1,32 @@ +pkgs: + +let + llama-python = pkgs.python3.withPackages (ps: [ + ps.numpy + ps.sentencepiece + ]); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = pkgs.python3.withPackages (ps: [ + ps.numpy + ps.sentencepiece + ps.torchWithoutCuda + ps.transformers + ]); +in + +{ + default = pkgs.mkShell { + name = "default"; + description = "contains numpy and sentencepiece"; + inputsFrom = [ pkgs.llama-cpp ]; + buildInputs = [ llama-python ]; + }; + + extra = pkgs.mkShell { + name = "extra"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + inputsFrom = [ pkgs.llama-cpp ]; + buildInputs = [ llama-python-extra ]; + }; +} diff --git a/flake.lock b/flake.lock index 0455f65617a2dc..fdcd6d411b3240 100644 --- a/flake.lock +++ b/flake.lock @@ -1,30 +1,12 @@ { "nodes": { - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1694529238, - "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, "nixpkgs": { "locked": { - "lastModified": 1698318101, - "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=", + "lastModified": 1703013332, + "narHash": "sha256-+tFNwMvlXLbJZXiMHqYq77z/RfmpfpiI3yjL6o/Zo9M=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c", + "rev": "54aac082a4d9bb5bbc5c4e899603abfb76a3f6d6", "type": "github" }, "original": { @@ -36,24 +18,8 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs" } - }, - "systems": { - "locked": { - 
"lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } } }, "root": "root", diff --git a/flake.nix b/flake.nix index 4cf28d5c11c0fd..08d82dba03afd9 100644 --- a/flake.nix +++ b/flake.nix @@ -1,139 +1,49 @@ { inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; }; - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: - let - name = "llama.cpp"; - src = ./.; - meta.mainProgram = "llama"; - inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; - buildInputs = with pkgs; [ openmpi ]; - osSpecific = with pkgs; buildInputs ++ ( - if isAarch64 && isDarwin then - with pkgs.darwin.apple_sdk_11_0.frameworks; [ - Accelerate - MetalKit - ] - else if isAarch32 && isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else if isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else - with pkgs; [ openblas ] - ); - pkgs = import nixpkgs { inherit system; }; - nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ]; - cudatoolkit_joined = with pkgs; symlinkJoin { - # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit - # see https://github.com/NixOS/nixpkgs/issues/224291 - # copied from jaxlib - name = "${cudaPackages.cudatoolkit.name}-merged"; - paths = [ - cudaPackages.cudatoolkit.lib - cudaPackages.cudatoolkit.out - ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [ - # for some reason some of the required libs are in the targets/x86_64-linux - # directory; not sure why but this works around it - "${cudaPackages.cudatoolkit}/targets/${system}" - ]; - }; - llama-python = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); - # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime - llama-python-extra = - pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]); - postPatch = '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python' - ''; - postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server - mkdir -p $out/include - cp ${src}/llama.h $out/include/ - ''; - cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - in - { - packages.default = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = osSpecific; - cmakeFlags = cmakeFlags - ++ (if isAarch64 && isDarwin then [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" - ] else [ - "-DLLAMA_BLAS=ON" - "-DLLAMA_BLAS_VENDOR=OpenBLAS" - ]); - }; - packages.opencl = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ clblast ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CLBLAST=ON" - ]; - }; - packages.cuda = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ]; - 
cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_CUBLAS=ON" - ]; - }; - packages.rocm = pkgs.stdenv.mkDerivation { - inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ]; - cmakeFlags = cmakeFlags ++ [ - "-DLLAMA_HIPBLAS=1" - "-DCMAKE_C_COMPILER=hipcc" - "-DCMAKE_CXX_COMPILER=hipcc" - # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM - # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt - # and select the line that matches the current nixpkgs version of rocBLAS. - "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" - ]; - }; - apps.llama-server = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama-server"; - }; - apps.llama-embedding = { - type = "app"; - program = "${self.packages.${system}.default}/bin/embedding"; - }; - apps.llama = { - type = "app"; - program = "${self.packages.${system}.default}/bin/llama"; - }; - apps.quantize = { - type = "app"; - program = "${self.packages.${system}.default}/bin/quantize"; - }; - apps.train-text-from-scratch = { - type = "app"; - program = "${self.packages.${system}.default}/bin/train-text-from-scratch"; - }; - apps.default = self.apps.${system}.llama; - devShells.default = pkgs.mkShell { - buildInputs = [ llama-python ]; - packages = nativeBuildInputs ++ osSpecific; - }; - devShells.extra = pkgs.mkShell { - buildInputs = [ llama-python-extra ]; - packages = nativeBuildInputs ++ osSpecific; - }; - }); + + outputs = { self, nixpkgs }: + + let + inherit (nixpkgs.lib) genAttrs; + overlays = import ./overlays.nix; + importNixpkgs = system: import nixpkgs { + inherit system; + overlays = [ overlays ]; + }; + systems = [ "aarch64-darwin" "aarch64-linux" "x86_64-darwin" "x86_64-linux" ]; + withSystemPackages = f: genAttrs systems (system: f (importNixpkgs system)); + in + + { + # These define the various ways to build the llama.cpp project. + # Integrate them into your flake.nix configuration by adding this + # overlay to nixpkgs.overlays. + overlays = { + default = overlays; + }; + + # These use the definitions from ./overlays.nix and expose them as installables. + packages = withSystemPackages (pkgs: { + default = pkgs.llama-cpp; + opencl = pkgs.llama-cpp-opencl; + cuda = pkgs.llama-cpp-cuda; + rocm = pkgs.llama-cpp-rocm; + }); + + # These use the definition of llama-cpp from ./overlays.nix and expose various + # binaries as apps so that they're able to be run with `nix run`. + apps = withSystemPackages (import ./apps.nix [ + "llama" + "llama-embedding" + "llama-server" + "quantize" + "train-text-from-scratch" + ]); + + # These expose a build environment for either a "default" or an "extra" set of + # dependencies. + devShells = withSystemPackages (import ./devshells.nix); + }; } diff --git a/llama-cpp.nix b/llama-cpp.nix new file mode 100644 index 00000000000000..1191d33df122bf --- /dev/null +++ b/llama-cpp.nix @@ -0,0 +1,118 @@ +{ lib +, stdenv +, cmake +, ninja +, pkg-config +, symlinkJoin +, python3 +, openmpi +, openblas +, cudaPackages +, rocmPackages +, clblast +, Accelerate ? null +, MetalKit ? null +, CoreVideo ? null +, CoreGraphics ? null +, useOpenCL ? false +, useCuda ? false +, useRocm ? false +}: + +let + inherit (lib) optional optionals versionOlder; + isDefault = !useOpenCL && !useCuda && !useRocm; + + # Give a little description difference between the flavors. 
+  descriptionSuffix = if useOpenCL then
+    " (OpenCL accelerated)"
+  else if useCuda then
+    " (CUDA accelerated)"
+  else if useRocm then
+    " (ROCm accelerated)"
+  else if (MetalKit != null) then
+    " (MetalKit accelerated)"
+  else "";
+
+  llama-python = python3.withPackages (ps: [ ps.numpy ps.sentencepiece ]);
+
+  # See ./overlays.nix for where these dependencies are passed in.
+  defaultBuildInputs = builtins.filter (p: p != null) [
+    Accelerate
+    MetalKit
+    CoreVideo
+    CoreGraphics
+  ];
+
+  cudaBuildInput = symlinkJoin (with cudaPackages; {
+    # HACK(Green-Sky): Nix currently has issues with CMake findcudatoolkit
+    # see https://github.com/NixOS/nixpkgs/issues/224291
+    # copied from jaxlib
+    name = "${cudatoolkit.name}-merged";
+    paths = [
+      cudatoolkit.lib
+      cudatoolkit.out
+    ] ++ optionals (versionOlder cudatoolkit.version "11") [
+      # For some reason, some of the required libs are in the targets/x86_64-linux
+      # directory; not sure why, but this works around it.
+      "${cudatoolkit}/targets/${stdenv.system}"
+    ];
+  });
+
+  rocmBuildInputs = with rocmPackages; [ clr hipblas rocblas ];
+in
+
+stdenv.mkDerivation {
+  name = "llama.cpp";
+  src = ./.;
+  meta = {
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    mainProgram = "llama";
+  };
+
+  postPatch = ''
+    substituteInPlace ./ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
+  '';
+
+  nativeBuildInputs = [ cmake ninja pkg-config ];
+
+  buildInputs = [ openmpi ]
+    ++ optional useOpenCL clblast
+    ++ optional useCuda cudaBuildInput
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals isDefault defaultBuildInputs;
+
+  cmakeFlags = [
+    "-DLLAMA_NATIVE=OFF"
+    "-DLLAMA_BUILD_SERVER=ON"
+    "-DBUILD_SHARED_LIBS=ON"
+    "-DCMAKE_SKIP_BUILD_RPATH=ON"
+  ]
+  ++ optional useOpenCL "-DLLAMA_CLBLAST=ON"
+  ++ optional useCuda "-DLLAMA_CUBLAS=ON"
+  ++ optionals useRocm [
+    "-DLLAMA_HIPBLAS=1"
+    "-DCMAKE_C_COMPILER=hipcc"
+    "-DCMAKE_CXX_COMPILER=hipcc"
+    # Build all targets supported by rocBLAS. When updating, search for TARGET_LIST_ROCM
+    # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
+    # and select the line that matches the current nixpkgs version of rocBLAS.
+ "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + ] + ++ optionals isDefault (if (MetalKit != null) then [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + "-DLLAMA_METAL=ON" + ] else [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=OpenBLAS" + ]); + + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + mkdir -p $out/include + cp $src/llama.h $out/include/ + ''; +} diff --git a/overlays.nix b/overlays.nix new file mode 100644 index 00000000000000..0292576ab4db0e --- /dev/null +++ b/overlays.nix @@ -0,0 +1,20 @@ +final: prev: + +let + inherit (prev.stdenv) isAarch32 isAarch64 isDarwin; + + darwinSpecific = if isAarch64 then { + inherit (prev.darwin.apple_sdk_11_0.frameworks) Accelerate MetalKit; + } else { + inherit (prev.darwin.apple_sdk.frameworks) Accelerate CoreGraphics CoreVideo; + }; + + osSpecific = if isDarwin then darwinSpecific else {}; +in + +{ + llama-cpp = prev.callPackage ./llama-cpp.nix osSpecific; + llama-cpp-opencl = prev.callPackage ./llama-cpp.nix (osSpecific // { useOpenCL = true; }); + llama-cpp-cuda = prev.callPackage ./llama-cpp.nix (osSpecific // { useCuda = true; }); + llama-cpp-rocm = prev.callPackage ./llama-cpp.nix (osSpecific // { useRocm = true; }); +}
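
As a usage sketch (not part of this patch): another flake can consume the overlay added in ./overlays.nix roughly as below. The input name `llama-cpp` and the URL `github:ggerganov/llama.cpp` are assumptions for illustration; point the input wherever this flake actually lives.

    {
      inputs = {
        nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
        # Assumed URL; adjust to wherever this llama.cpp flake is hosted.
        llama-cpp.url = "github:ggerganov/llama.cpp";
      };

      outputs = { self, nixpkgs, llama-cpp }:
        let
          system = "x86_64-linux";
          pkgs = import nixpkgs {
            inherit system;
            # Brings llama-cpp, llama-cpp-opencl, llama-cpp-cuda, and
            # llama-cpp-rocm into pkgs, as defined in overlays.nix.
            overlays = [ llama-cpp.overlays.default ];
          };
        in
        {
          # Pick whichever flavor matches the hardware.
          packages.${system}.default = pkgs.llama-cpp-cuda;
        };
    }

Because llama-cpp.nix is instantiated with `callPackage`, individual flags can also be overridden downstream, e.g. `pkgs.llama-cpp.override { useOpenCL = true; }`. Within this repository itself, the new outputs should be reachable directly, e.g. `nix run .#llama-server` or `nix develop .#extra`.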