python: implement selectable GPU backend
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
cebtenzzre committed May 6, 2024
1 parent b331c7b commit 81140d4
Showing 3 changed files with 52 additions and 20 deletions.
5 changes: 4 additions & 1 deletion gpt4all-backend/llamamodel.cpp
@@ -602,9 +602,12 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     auto devices = availableGPUDevices(memoryRequired);
 
     auto dev_it = devices.begin();
+#ifndef GGML_USE_CUDA
     if (name == "amd" || name == "nvidia" || name == "intel") {
         dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
-    } else if (name != "gpu") {
+    } else
+#endif
+    if (name != "gpu") {
         dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
     }
 
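The #ifndef guard compiles the vendor-keyword branch out of the CUDA build, so selection by vendor ("amd", "nvidia", "intel") is only honored by the Kompute backend; under CUDA, any name other than "gpu" is matched literally against a device name. A rough Python-side illustration of the effect, relying on the device-to-backend mapping added to gpt4all.py later in this commit (the model filename is only an example):

from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", device="nvidia")
print(model.backend)  # expected: "kompute" -- vendor keywords never select the CUDA backend
print(model.device)   # name of the NVIDIA GPU that was chosen
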
10 changes: 6 additions & 4 deletions gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -201,9 +201,11 @@ class LLModel:
         Maximum size of context window
     ngl : int
         Number of GPU layers to use (Vulkan)
+    backend : str
+        Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
     """
 
-    def __init__(self, model_path: str, n_ctx: int, ngl: int):
+    def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
         self.model_path = model_path.encode()
         self.n_ctx = n_ctx
         self.ngl = ngl
@@ -213,7 +215,7 @@ def __init__(self, model_path: str, n_ctx: int, ngl: int):
 
         # Construct a model implementation
         err = ctypes.c_char_p()
-        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
+        model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
         if model is None:
             s = err.value
             raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
@@ -232,7 +234,7 @@ def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
     @property
-    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
         if self.model is None:
             self._raise_closed()
         return llmodel.llmodel_model_backend_name(self.model).decode()
@@ -259,7 +261,7 @@ def list_gpus(mem_required: int = 0) -> list[str]:
         devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
-        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
+        return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
 
     def init_gpu(self, device: str):
         if self.model is None:
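
Taken together, the low-level wrapper now takes an explicit backend string at construction time and reports GPU devices as backend-qualified names. A minimal sketch of the new surface, assuming a placeholder model path and example device names:

from gpt4all._pyllmodel import LLModel

# The backend string is passed straight to llmodel_model_create2(); "auto" keeps the old behavior.
model = LLModel("/path/to/model.gguf", n_ctx=2048, ngl=100, backend="cuda")

# Device names are now prefixed with the backend that owns them.
print(LLModel.list_gpus())  # e.g. ["cuda:NVIDIA GeForce RTX 3080", "kompute:NVIDIA GeForce RTX 3080"]
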
57 changes: 42 additions & 15 deletions gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -5,6 +5,7 @@
 
 import hashlib
 import os
+import platform
 import re
 import sys
 import time
@@ -44,7 +45,7 @@ class Embed4All:
 
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
+    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
         """
         Constructor
 
@@ -172,7 +173,7 @@ def __init__(
         model_type: str | None = None,
         allow_download: bool = True,
         n_threads: int | None = None,
-        device: str | None = "cpu",
+        device: str | None = None,
         n_ctx: int = 2048,
         ngl: int = 100,
         verbose: bool = False,
@@ -190,30 +191,56 @@
             n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
             device: The processing unit on which the GPT4All model will run. It can be set to:
                 - "cpu": Model will run on the central processing unit.
-                - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
-                - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                - A specific device name from the list returned by `GPT4All.list_gpus()`.
-                Default is "cpu".
+                - "kompute": Use the best GPU provided by the Kompute backend.
+                - "cuda": Use the best GPU provided by the CUDA backend.
+                - "amd", "nvidia": Use the best available GPU from the specified vendor.
+                - A specific device name from the list returned by `GPT4All.list_gpus()` (not available on macOS).
+                Default is "metal" on ARM64 macOS, "cpu" otherwise.
+                "gpu" is a deprecated alias for "kompute".
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
             n_ctx: Maximum size of context window
             ngl: Number of GPU layers to use (Vulkan)
             verbose: If True, print debug messages.
         """
 
         self.model_type = model_type
-        self._history: list[MessageType] | None = None
-        self._current_prompt_template: str = "{0}"
 
+        device_init = None
+        if sys.platform == 'darwin':
+            if device is None:
+                backend = 'auto'  # 'auto' is effectively 'metal' due to currently non-functional fallback
+            elif device == 'cpu':
+                backend = 'cpu'
+            else:
+                if platform.machine() != 'arm64' or device != 'gpu':
+                    raise ValueError(f'Unknown device for this platform: {device}')
+                backend = 'metal'
+        else:
+            backend = 'kompute'
+            if device is None or device == 'cpu':
+                pass  # use kompute with no device
+            elif device in ('cuda', 'kompute'):
+                backend = device
+                device_init = 'gpu'
+            elif device.startswith('cuda:'):
+                backend = 'cuda'
+                device_init = device.removeprefix('cuda:')
+            else:
+                device_init = device.removeprefix('kompute:')
+
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
-        self.model = LLModel(self.config["path"], n_ctx, ngl)
-        if device is not None and device != "cpu":
-            self.model.init_gpu(device)
+        self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
+        if device_init is not None:
+            self.model.init_gpu(device_init)
         self.model.load_model()
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
 
+        self._history: list[MessageType] | None = None
+        self._current_prompt_template: str = "{0}"
+
     def __enter__(self) -> Self:
         return self
 
@@ -227,13 +254,13 @@ def close(self) -> None:
         self.model.close()
 
     @property
-    def backend(self) -> Literal["cpu", "kompute", "metal"]:
-        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
+    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
+        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
         return self.model.backend
 
     @property
     def device(self) -> str | None:
-        """The name of the GPU device currently in use, or None for backends other than Kompute."""
+        """The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
         return self.model.device
 
     @property
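
At the high level, the device argument now selects both a backend and, optionally, a device to initialize on it. A hedged usage sketch of the new API (the model filename is only an example; actual output depends on the installed build and hardware):

from gpt4all import GPT4All

# Force the CUDA backend and let it pick the best GPU.
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", device="cuda")
print(model.backend)  # expected: "cuda"
print(model.device)   # name of the GPU that was initialized

# Other accepted forms, per the mapping in __init__ above:
#   device=None           -> Kompute backend, CPU only ("auto", effectively Metal, on macOS)
#   device="kompute"      -> Kompute backend, best available GPU
#   device="nvidia"/"amd" -> Kompute backend, best GPU from that vendor
#   device="cuda:<name>"  -> CUDA backend, where <name> comes from GPT4All.list_gpus()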
