Skip to content
This repository has been archived by the owner on Feb 15, 2025. It is now read-only.

Commit

Permalink
chore: Remove model weights from container images (#786)
Browse files Browse the repository at this point in the history
* chore: add PVC to deployments to store model weights
* chore: add onCreate action to download model files
* chore: use Zarf Injection to populate PVC with model files
* chore: add zarf vars for pvc config
  • Loading branch information
YrrepNoj authored Aug 1, 2024
1 parent 08f1d10 commit 33e4efb
Show file tree
Hide file tree
Showing 39 changed files with 436 additions and 57 deletions.
9 changes: 9 additions & 0 deletions .github/actions/python/action.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: setup-python
description: "Setup Python and library dependencies"

inputs:
additionalOptionalDep:
description: "Additional optional dependencies to install"

runs:
using: composite
steps:
Expand All @@ -10,5 +14,10 @@ runs:
python-version-file: 'pyproject.toml'

- name: Install Python Deps
shell: bash
run: python -m pip install ".[dev]"

- name: Install additionalDeps
if: ${{ inputs.additionalOptionalDep != '' }}
shell: bash
run: python -m pip install ".[${{ inputs.additionalOptionalDep }}]"
7 changes: 2 additions & 5 deletions .github/workflows/e2e-vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,9 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- name: Setup Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c #v5.0.0
uses: ./.github/actions/python
with:
python-version-file: 'pyproject.toml'

- name: Install Python Deps
run: python -m pip install "."
additionalOptionalDep: dev-vllm

- name: Setup UDS Environment
uses: defenseunicorns/uds-common/.github/actions/setup@05f42bb3117b66ebef8c72ae050b34bce19385f5
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/e2e-whisper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ jobs:

- name: Setup Python
uses: ./.github/actions/python
with:
additionalOptionalDep: dev-whisper

- name: Setup UDS Cluster
uses: ./.github/actions/uds-cluster
Expand Down
14 changes: 0 additions & 14 deletions packages/llama-cpp-python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,6 @@ ARG SDK_DEST=src/leapfrogai_sdk/build
USER root
WORKDIR /leapfrogai

# download model
RUN python -m pip install -U huggingface_hub[cli,hf_transfer]
ARG REPO_ID=TheBloke/SynthIA-7B-v2.0-GGUF
ARG FILENAME=synthia-7b-v2.0.Q4_K_M.gguf
ARG REVISION=3f65d882253d1f15a113dabf473a7c02a004d2b5

# NOTE: This is checking for a pre-downloaded model file in the local build dir before downloading the model from HuggingFace
# TODO: Add checksum validation to verify the model in the local build-dir is the model we expect
COPY packages/llama-cpp-python/scripts/model_download.py scripts/model_download.py
RUN REPO_ID=${REPO_ID} FILENAME=${FILENAME} REVISION=${REVISION} python3.11 scripts/model_download.py
RUN mv .model/*.gguf .model/model.gguf


# create virtual environment for light-weight portability and minimal libraries
RUN python3.11 -m venv .venv
ENV PATH="/leapfrogai/.venv/bin:$PATH"
Expand All @@ -42,7 +29,6 @@ ENV PATH="/leapfrogai/.venv/bin:$PATH"
WORKDIR /leapfrogai

COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/
COPY --from=builder /leapfrogai/.model/ /leapfrogai/.model/

COPY packages/llama-cpp-python/main.py .
COPY packages/llama-cpp-python/config.yaml .
Expand Down
1 change: 1 addition & 0 deletions packages/llama-cpp-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ To build and deploy just the llama-cpp-python Zarf package (from the root of the
> Deploy a [UDS cluster](/README.md#uds) if one isn't deployed already
```shell
pip install 'huggingface_hub[cli,hf_transfer]' # NOTE(review): the download script now fetches the model with urllib from the standard library, so this install is likely no longer required — verify before removing
make build-llama-cpp-python LOCAL_VERSION=dev
uds zarf package deploy packages/llama-cpp-python/zarf-package-llama-cpp-python-*-dev.tar.zst --confirm
```
Expand Down
33 changes: 33 additions & 0 deletions packages/llama-cpp-python/chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,38 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
app: lfai-llama
{{- include "chart.selectorLabels" . | nindent 8 }}
spec:
# It's necessary to include the ###ZARF_DATA_INJECTION_MARKER### somewhere in the podspec, otherwise data injections will not occur.
initContainers:
  # Blocks pod startup until Zarf data injection has populated the model
  # volume. Zarf writes the timestamped ###ZARF_DATA_INJECTION_MARKER###
  # file last, after all model files are in place, so its presence means the
  # injection is complete.
  - name: data-loader
    image: cgr.dev/chainguard/bash:latest
    securityContext:
      runAsUser: 65532
      runAsGroup: 65532
      # NOTE(review): `fsGroup` removed here — it is a pod-level
      # PodSecurityContext field and is not valid on a container-level
      # securityContext; set it under the pod's securityContext instead.
    command:
      [
        "sh",
        "-c",
        'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"',
      ]
    resources:
      requests:
        memory: "64Mi"
        cpu: "200m"
      limits:
        memory: "128Mi"
        cpu: "500m"
    volumeMounts:
      - name: leapfrogai-pv-storage
        mountPath: /data
volumes:
  # Backing storage for the injected model weights; claimed from the PVC
  # defined in chart/templates/pvc.yaml.
  - name: leapfrogai-pv-storage
    persistentVolumeClaim:
      claimName: lfai-llama-pv-claim
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
Expand All @@ -39,6 +69,9 @@ spec:
protocol: TCP
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: leapfrogai-pv-storage
mountPath: "/data"
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
14 changes: 14 additions & 0 deletions packages/llama-cpp-python/chart/templates/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# PersistentVolumeClaim that stores the llama-cpp-python model weights.
# The volume is populated at deploy time via Zarf data injection (see the
# data-loader initContainer in deployment.yaml), keeping the weights out of
# the container image.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: lfai-llama-pv-claim
  namespace: leapfrogai
spec:
  # Only set storageClassName when explicitly configured; otherwise the
  # cluster's default storage class is used.
  {{- if .Values.persistence.storageClass }}
  storageClassName: {{ .Values.persistence.storageClass }}
  {{- end }}
  accessModes:
    - {{ .Values.persistence.accessModes | quote }}
  resources:
    requests:
      storage: {{ .Values.persistence.size | quote }}
5 changes: 5 additions & 0 deletions packages/llama-cpp-python/chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,8 @@ nodeSelector: {}
tolerations: []

affinity: {}

persistence:
size: 15Gi
accessModes: ReadWriteOnce
storageClass: "local-path"
2 changes: 1 addition & 1 deletion packages/llama-cpp-python/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
model:
source: ".model/model.gguf"
source: "/data/.model/model.gguf"
max_context_length: 16384
stop_tokens:
- "<|im_end|>"
Expand Down
5 changes: 5 additions & 0 deletions packages/llama-cpp-python/llama-cpp-python-values.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# Deploy-time Helm values for the llama-cpp-python chart; the ###ZARF_*###
# tokens are replaced by Zarf before the chart is rendered.
image:
  tag: "###ZARF_CONST_IMAGE_VERSION###"

persistence:
  # Quoted so the substituted values are always parsed as YAML strings —
  # an unquoted empty substitution would otherwise become null, and
  # number-like sizes could be mistyped.
  size: "###ZARF_VAR_PVC_SIZE###"
  accessModes: "###ZARF_VAR_PVC_ACCESS_MODE###"
  storageClass: "###ZARF_VAR_PVC_STORAGE_CLASS###"
3 changes: 3 additions & 0 deletions packages/llama-cpp-python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
class Model:
backend_config = BackendConfig()

if not os.path.exists(backend_config.model.source):
raise ValueError(f"Model path ({backend_config.model.source}) does not exist")

llm = Llama(
model_path=backend_config.model.source,
n_ctx=backend_config.max_context_length,
Expand Down
41 changes: 29 additions & 12 deletions packages/llama-cpp-python/scripts/model_download.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,34 @@
import os
import hashlib
import urllib.request

from huggingface_hub import hf_hub_download
REPO_ID = os.environ.get("REPO_ID", "")
FILENAME = os.environ.get("FILENAME", "")
REVISION = os.environ.get("REVISION", "main")
CHECKSUM = os.environ.get("SHA256_CHECKSUM", "")
OUTPUT_FILE = os.environ.get("OUTPUT_FILE", ".model/model.gguf")

REPO_ID = os.environ.get("REPO_ID", "TheBloke/SynthIA-7B-v2.0-GGUF")
FILENAME = os.environ.get("FILENAME", "synthia-7b-v2.0.Q4_K_M.gguf")
REVISION = os.environ.get("REVISION", "3f65d882253d1f15a113dabf473a7c02a004d2b5")

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
def download_model():
# Check if the model is already downloaded.
if os.path.exists(OUTPUT_FILE) and CHECKSUM != "":
if hashlib.sha256(open(OUTPUT_FILE, "rb").read()).hexdigest() == CHECKSUM:
print("Model already downloaded.")
return

hf_hub_download(
repo_id=REPO_ID,
filename=FILENAME,
local_dir=".model",
local_dir_use_symlinks=False,
revision=REVISION,
)
# Validate that require environment variables are provided
if REPO_ID == "" or FILENAME == "":
print("Please provide REPO_ID and FILENAME environment variables.")
return

# Download the model!
print("Downloading model... This may take a while.")
if not os.path.exists(".model"):
os.mkdir(".model")
urllib.request.urlretrieve(
f"https://huggingface.co/{REPO_ID}/resolve/{REVISION}/{FILENAME}", OUTPUT_FILE
)


if __name__ == "__main__":
download_model()
32 changes: 32 additions & 0 deletions packages/llama-cpp-python/zarf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@ constants:
- name: IMAGE_VERSION
value: "###ZARF_PKG_TMPL_IMAGE_VERSION###"

variables:
- name: PVC_SIZE
description: Size of the PVC used for model storage.
default: "15Gi"
pattern: "^[0-9]+[a-zA-Z]+$"
- name: PVC_ACCESS_MODE
description: Access mode of the PVC used for model storage.
default: "ReadWriteOnce"
pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$"
- name: PVC_STORAGE_CLASS
description: Storage class of the PVC used for model storage.
default: "local-path"

components:
- name: llama-cpp-python-model
required: true
Expand All @@ -26,3 +39,22 @@ components:
- "llama-cpp-python-values.yaml"
images:
- ghcr.io/defenseunicorns/leapfrogai/llama-cpp-python:###ZARF_PKG_TMPL_IMAGE_VERSION###
- cgr.dev/chainguard/bash:latest
dataInjections:
- source: .model/
target:
namespace: leapfrogai
selector: app=lfai-llama
container: data-loader
path: /data/.model
compress: true
actions:
onCreate:
before:
# NOTE: This assumes python is installed and in $PATH; the download script uses only the Python standard library (urllib), so huggingface_hub is no longer required
- cmd: python scripts/model_download.py
env:
- REPO_ID=TheBloke/SynthIA-7B-v2.0-GGUF
- FILENAME=synthia-7b-v2.0.Q4_K_M.gguf
- REVISION=3f65d882253d1f15a113dabf473a7c02a004d2b5
- SHA256_CHECKSUM=5d6369d456446c40a9fd149525747d8dc494196686861c43b00f9230a166ba82
10 changes: 0 additions & 10 deletions packages/text-embeddings/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,13 @@ ENV PATH="/leapfrogai/.venv/bin:$PATH"
# copy and install all python dependencies
# NOTE: We are copying the leapfrog whl to this filename because installing 'optional extras' from
# a wheel requires the absolute path to the wheel file (instead of a wildcard whl)

COPY --from=sdk /leapfrogai/${SDK_DEST} ${SDK_DEST}
COPY packages/text-embeddings packages/text-embeddings

RUN rm -f packages/text-embeddings/build/*.whl
RUN python -m pip wheel packages/text-embeddings -w packages/text-embeddings/build --find-links=${SDK_DEST}
RUN pip install packages/text-embeddings/build/lfai_text_embeddings*.whl --no-index --find-links=packages/text-embeddings/build/


# download model
RUN python -m pip install -U huggingface_hub[cli,hf_transfer]
ARG REPO_ID="hkunlp/instructor-xl"
ARG REVISION="ce48b213095e647a6c3536364b9fa00daf57f436"
COPY packages/text-embeddings/scripts/model_download.py scripts/model_download.py
RUN REPO_ID=${REPO_ID} REVISION=${REVISION} python scripts/model_download.py

# hardened and slim python image
FROM ghcr.io/defenseunicorns/leapfrogai/python:3.11

Expand All @@ -38,7 +29,6 @@ ENV PATH="/leapfrogai/.venv/bin:$PATH"
WORKDIR /leapfrogai

COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/
COPY --from=builder /leapfrogai/.model/ /leapfrogai/.model/

COPY packages/text-embeddings/main.py .

Expand Down
1 change: 1 addition & 0 deletions packages/text-embeddings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ To build and deploy just the text-embeddings Zarf package (from the root of the
> Deploy a [UDS cluster](/README.md#uds) if one isn't deployed already
```shell
pip install 'huggingface_hub[cli,hf_transfer]' # Used to download the model weights from huggingface
make build-text-embeddings LOCAL_VERSION=dev
uds zarf package deploy packages/text-embeddings/zarf-package-text-embeddings-*-dev.tar.zst --confirm
```
Expand Down
36 changes: 36 additions & 0 deletions packages/text-embeddings/chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,43 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
app: lfai-text-embeddings
{{- include "chart.selectorLabels" . | nindent 8 }}
spec:
{{- if gt (index .Values.resources.limits "nvidia.com/gpu") 0.0 }}
runtimeClassName: nvidia
{{- else if .Values.gpu.runtimeClassName }}
runtimeClassName: {{ .Values.gpu.runtimeClassName }}
{{- end }}
# It's necessary to include the ###ZARF_DATA_INJECTION_MARKER### somewhere in the podspec, otherwise data injections will not occur.
initContainers:
  # Blocks pod startup until Zarf data injection has populated the model
  # volume. Zarf writes the timestamped ###ZARF_DATA_INJECTION_MARKER###
  # file last, after all model files are in place, so its presence means the
  # injection is complete.
  - name: data-loader
    image: cgr.dev/chainguard/bash:latest
    securityContext:
      runAsUser: 65532
      runAsGroup: 65532
      # NOTE(review): `fsGroup` removed here — it is a pod-level
      # PodSecurityContext field and is not valid on a container-level
      # securityContext; set it under the pod's securityContext instead.
    command:
      [
        "sh",
        "-c",
        'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"',
      ]
    resources:
      requests:
        memory: "64Mi"
        cpu: "200m"
      limits:
        memory: "128Mi"
        cpu: "500m"
    volumeMounts:
      - name: leapfrogai-pv-storage
        mountPath: /data
volumes:
  # Backing storage for the injected model weights; claimed from the PVC
  # defined in chart/templates/pvc.yaml.
  - name: leapfrogai-pv-storage
    persistentVolumeClaim:
      claimName: lfai-text-embeddings-pv-claim
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
Expand All @@ -44,6 +74,12 @@ spec:
protocol: TCP
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: leapfrogai-pv-storage
mountPath: "/data"
env:
- name: LFAI_MODEL_PATH
value: '/data/.model'
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
14 changes: 14 additions & 0 deletions packages/text-embeddings/chart/templates/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# PersistentVolumeClaim that stores the text-embeddings model weights.
# The volume is populated at deploy time via Zarf data injection (see the
# data-loader initContainer in deployment.yaml), keeping the weights out of
# the container image.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: lfai-text-embeddings-pv-claim
  namespace: leapfrogai
spec:
  # Only set storageClassName when explicitly configured; otherwise the
  # cluster's default storage class is used.
  {{- if .Values.persistence.storageClass }}
  storageClassName: {{ .Values.persistence.storageClass }}
  {{- end }}
  accessModes:
    - {{ .Values.persistence.accessModes | quote }}
  resources:
    requests:
      storage: {{ .Values.persistence.size | quote }}
5 changes: 5 additions & 0 deletions packages/text-embeddings/chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ nodeSelector: {}
tolerations: []

affinity: {}

persistence:
size: 15Gi
accessModes: ReadWriteOnce
storageClass: "local-path"
5 changes: 5 additions & 0 deletions packages/text-embeddings/embedding-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ gpu:
resources:
limits:
nvidia.com/gpu: ###ZARF_VAR_GPU_LIMIT###

persistence:
  # Quoted so the Zarf-substituted values are always parsed as YAML strings —
  # an unquoted empty substitution would otherwise become null, and
  # number-like sizes could be mistyped.
  size: "###ZARF_VAR_PVC_SIZE###"
  accessModes: "###ZARF_VAR_PVC_ACCESS_MODE###"
  storageClass: "###ZARF_VAR_PVC_STORAGE_CLASS###"
Loading

0 comments on commit 33e4efb

Please sign in to comment.