python: implement selectable GPU backend
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
cebtenzzre committed May 6, 2024
1 parent b331c7b commit 81140d4
Showing 3 changed files with 52 additions and 20 deletions.
5 changes: 4 additions & 1 deletion gpt4all-backend/llamamodel.cpp
@@ -602,9 +602,12 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     auto devices = availableGPUDevices(memoryRequired);
 
     auto dev_it = devices.begin();
+#ifndef GGML_USE_CUDA
     if (name == "amd" || name == "nvidia" || name == "intel") {
         dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
-    } else if (name != "gpu") {
+    } else
+#endif
+    if (name != "gpu") {
         dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
     }
 
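The #ifndef guard compiles the vendor-keyword branch out of the CUDA build, so selection by vendor ("amd", "nvidia", "intel") is only honored by the Kompute backend; under CUDA, any name other than "gpu" is matched literally against a device name. A rough Python-side illustration of the effect, relying on the device-to-backend mapping added to gpt4all.py later in this commit (the model filename is only an example):

from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", device="nvidia")
print(model.backend)  # expected: "kompute" -- vendor keywords never select the CUDA backend
print(model.device)   # name of the NVIDIA GPU that was chosen
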
10 changes: 6 additions & 4 deletions gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -201,9 +201,11 @@ class LLModel:
         Maximum size of context window
     ngl : int
         Number of GPU layers to use (Vulkan)
+    backend : str
+        Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
     """
 
-    def __init__(self, model_path: str, n_ctx: int, ngl: int):
+    def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
         self.model_path = model_path.encode()
         self.n_ctx = n_ctx
         self.ngl = ngl
@@ -213,7 +215,7 @@ def __init__(self, model_path: str, n_ctx: int, ngl: int):
 
         # Construct a model implementation
         err = ctypes.c_char_p()
-        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
+        model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
         if model is None:
             s = err.value
             raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
@@ -232,7 +234,7 @@ def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
     @property
-    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
         if self.model is None:
             self._raise_closed()
         return llmodel.llmodel_model_backend_name(self.model).decode()
@@ -259,7 +261,7 @@ def list_gpus(mem_required: int = 0) -> list[str]:
         devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
-        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
+        return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
 
     def init_gpu(self, device: str):
         if self.model is None:
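
Taken together, the low-level wrapper now takes an explicit backend string at construction time and reports GPU devices as backend-qualified names. A minimal sketch of the new surface, assuming a placeholder model path and example device names:

from gpt4all._pyllmodel import LLModel

# The backend string is passed straight to llmodel_model_create2(); "auto" keeps the old behavior.
model = LLModel("/path/to/model.gguf", n_ctx=2048, ngl=100, backend="cuda")

# Device names are now prefixed with the backend that owns them.
print(LLModel.list_gpus())  # e.g. ["cuda:NVIDIA GeForce RTX 3080", "kompute:NVIDIA GeForce RTX 3080"]
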
57 changes: 42 additions & 15 deletions gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -5,6 +5,7 @@
 
 import hashlib
 import os
+import platform
 import re
 import sys
 import time
@@ -44,7 +45,7 @@ class Embed4All:
 
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
+    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
         """
         Constructor
 
@@ -172,7 +173,7 @@ def __init__(
         model_type: str | None = None,
         allow_download: bool = True,
         n_threads: int | None = None,
-        device: str | None = "cpu",
+        device: str | None = None,
         n_ctx: int = 2048,
         ngl: int = 100,
         verbose: bool = False,
@@ -190,30 +191,56 @@
             n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
             device: The processing unit on which the GPT4All model will run. It can be set to:
                 - "cpu": Model will run on the central processing unit.
-                - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
-                - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                - A specific device name from the list returned by `GPT4All.list_gpus()`.
-                Default is "cpu".
+                - "kompute": Use the best GPU provided by the Kompute backend.
+                - "cuda": Use the best GPU provided by the CUDA backend.
+                - "amd", "nvidia": Use the best available GPU from the specified vendor.
+                - A specific device name from the list returned by `GPT4All.list_gpus()` (not available on macOS).
+                Default is "metal" on ARM64 macOS, "cpu" otherwise.
+                "gpu" is a deprecated alias for "kompute".
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
             n_ctx: Maximum size of context window
             ngl: Number of GPU layers to use (Vulkan)
             verbose: If True, print debug messages.
         """
 
         self.model_type = model_type
-        self._history: list[MessageType] | None = None
-        self._current_prompt_template: str = "{0}"
 
+        device_init = None
+        if sys.platform == 'darwin':
+            if device is None:
+                backend = 'auto'  # 'auto' is effectively 'metal' due to currently non-functional fallback
+            elif device == 'cpu':
+                backend = 'cpu'
+            else:
+                if platform.machine() != 'arm64' or device != 'gpu':
+                    raise ValueError(f'Unknown device for this platform: {device}')
+                backend = 'metal'
+        else:
+            backend = 'kompute'
+            if device is None or device == 'cpu':
+                pass  # use kompute with no device
+            elif device in ('cuda', 'kompute'):
+                backend = device
+                device_init = 'gpu'
+            elif device.startswith('cuda:'):
+                backend = 'cuda'
+                device_init = device.removeprefix('cuda:')
+            else:
+                device_init = device.removeprefix('kompute:')
+
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
-        self.model = LLModel(self.config["path"], n_ctx, ngl)
-        if device is not None and device != "cpu":
-            self.model.init_gpu(device)
+        self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
+        if device_init is not None:
+            self.model.init_gpu(device_init)
         self.model.load_model()
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
 
+        self._history: list[MessageType] | None = None
+        self._current_prompt_template: str = "{0}"
+
     def __enter__(self) -> Self:
         return self
 
@@ -227,13 +254,13 @@ def close(self) -> None:
         self.model.close()
 
     @property
-    def backend(self) -> Literal["cpu", "kompute", "metal"]:
-        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
+    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
+        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
         return self.model.backend
 
     @property
     def device(self) -> str | None:
-        """The name of the GPU device currently in use, or None for backends other than Kompute."""
+        """The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
         return self.model.device
 
     @property
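
At the high level, the device argument now selects both a backend and, optionally, a device to initialize on it. A hedged usage sketch of the new API (the model filename is only an example; actual output depends on the installed build and hardware):

from gpt4all import GPT4All

# Force the CUDA backend and let it pick the best GPU.
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", device="cuda")
print(model.backend)  # expected: "cuda"
print(model.device)   # name of the GPU that was initialized

# Other accepted forms, per the mapping in __init__ above:
#   device=None           -> Kompute backend, CPU only ("auto", effectively Metal, on macOS)
#   device="kompute"      -> Kompute backend, best available GPU
#   device="nvidia"/"amd" -> Kompute backend, best GPU from that vendor
#   device="cuda:<name>"  -> CUDA backend, where <name> comes from GPT4All.list_gpus()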
