diff --git a/Makefile b/Makefile index 906551786b68f..b21c6caf795f5 100644 --- a/Makefile +++ b/Makefile @@ -178,7 +178,7 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 - LLAMA_CUDA_DMMV_X ?= 64 + LLAMA_CUDA_DMMV_X ?= 256 LLAMA_CUDA_DMMV_Y ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) @@ -207,7 +207,7 @@ endif ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ - -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) \ -DGGML_CUDA_FORCE_DMMV diff --git a/koboldcpp.py b/koboldcpp.py index bff90e1e3d947..5fb2471b98b13 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -9,6 +9,7 @@ import os import argparse import json, sys, http.server, time, asyncio, socket, threading +import re from concurrent.futures import ThreadPoolExecutor stop_token_max = 10 @@ -764,21 +765,27 @@ def getfilename(var, text): quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50) quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5) - # hides gpu options when CLBlast is not chosen def changerunmode(a,b,c): index = runopts_var.get() if index == "Use CLBlast" or index == "Use CuBLAS": gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") - gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") - quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + if index == "Use CLBlast": + gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + elif index == "Use CuBLAS": + CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") else: gpu_selector_label.grid_forget() gpu_selector_box.grid_forget() + CUDA_gpu_selector_box.grid_forget() quick_gpu_selector_label.grid_forget() quick_gpu_selector_box.grid_forget() + CUDA_quick_gpu_selector_box.grid_forget() if index == "Use CuBLAS": lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") @@ -829,6 +836,7 @@ def changerunmode(a,b,c): gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50) gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly") lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5) # presets selector @@ -958,12 +966,18 @@ def switch_old_gui(): args.stream = stream.get()==1 args.smartcontext = smartcontext.get()==1 args.unbantokens = unbantokens.get()==1 + gpu_choice_str = gpu_choice_var.get() + if gpu_choice_str.isdigit(): + gpuchoiceidx = int(gpu_choice_var.get())-1 - gpuchoiceidx = int(gpu_choice_var.get())-1 if runopts_var.get() == runopts[1]: args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] if runopts_var.get() == runopts[2]: - args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] + if gpu_choice_str.lower() == "all": + args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] + else: + args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] + if gpulayers_var.get(): args.gpulayers = int(gpulayers_var.get()) if runopts_var.get()==runopts[3]: @@ -1329,7 +1343,7 @@ def main(args): compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer. Enter a number after to select a different main GPU.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2']) + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2']) parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0) args = parser.parse_args() main(args)