From 369213eda98224d56d9f194b1cb298cd76b89416 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Thu, 30 May 2024 09:45:50 +0300 Subject: [PATCH 1/4] llama : offload to RPC in addition to other backends --- ggml-backend.c | 4 ++- llama.cpp | 87 +++++++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 9e35ce98d7ace..35802f79b8637 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -321,7 +321,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); } else if (ggml_backend_buffer_is_host(dst->buffer)) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { + } + bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)) == 0; + if (!same_backend || !ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif diff --git a/llama.cpp b/llama.cpp index e90da793c3814..75986ef1d9731 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2371,13 +2371,34 @@ struct llama_context { struct llama_control_vector cvec; }; +static size_t llama_get_device_count(const llama_model & model) { + size_t count = 1; +#if defined(GGML_USE_CUDA) + count = ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_SYCL) + count = ggml_backend_sycl_get_device_count(); +#elif defined(GGML_USE_VULKAN) + count = ggml_backend_vk_get_device_count(); +#endif +#if defined(GGML_USE_RPC) + count += model.rpc_servers.size(); +#endif + return count; + GGML_UNUSED(model); +} + static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_RPC - std::string endpoint = model.rpc_servers[gpu]; - buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_RPC) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (gpu >= dev_count - rpc_count) { + const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str(); + return ggml_backend_rpc_buffer_type(endpoint); + } +#endif +#if defined(GGML_USE_METAL) buft = ggml_backend_metal_buffer_type(); #elif defined(GGML_USE_CUDA) buft = ggml_backend_cuda_buffer_type(gpu); @@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo GGML_UNUSED(tensor_split); } -static size_t llama_get_device_count(const llama_model & model) { -#if defined(GGML_USE_RPC) - return model.rpc_servers.size(); -#elif defined(GGML_USE_CUDA) - return ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - return ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - return ggml_backend_vk_get_device_count(); -#else - return 1; -#endif - GGML_UNUSED(model); -} - static size_t llama_get_device_memory(const llama_model & model, int device) { #if defined(GGML_USE_RPC) - size_t total; - size_t free; - std::string endpoint = model.rpc_servers[device]; - ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); - return free; -#elif defined(GGML_USE_CUDA) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (device >= dev_count - rpc_count) { + size_t total; + size_t free; + const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str(); + ggml_backend_rpc_get_device_memory(endpoint, &free, &total); + return free; + } +#endif +#if defined(GGML_USE_CUDA) size_t total; size_t free; ggml_backend_cuda_get_device_memory(device, &free, &total); @@ -16162,7 +16173,7 @@ struct llama_model * llama_load_model_from_file( return true; }; } - if (params.rpc_servers != nullptr) { + if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') { // split the servers set them into model->rpc_servers std::string servers(params.rpc_servers); size_t pos = 0; @@ -16325,17 +16336,7 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends -#if defined(GGML_USE_RPC) - for (auto & server : model->rpc_servers) { - ggml_backend_t backend = ggml_backend_rpc_init(server.c_str()); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_METAL) if (model->n_gpu_layers > 0) { ctx->backend_metal = ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { @@ -16427,6 +16428,18 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#endif +#if defined(GGML_USE_RPC) + for (int i = 0; i < (int)model->rpc_servers.size(); i++) { + const char * endpoint = model->rpc_servers[i].c_str(); + ggml_backend_t backend = ggml_backend_rpc_init(endpoint); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { From 805cd78c5ab9ee085bc6ff2e45406fd7039ec9fa Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 31 May 2024 17:05:14 +0200 Subject: [PATCH 2/4] - fix copy_tensor being called on the src buffer instead of the dst buffer - always initialize views in the view_src buffer - add RPC backend to Makefile build - add endpoint to all RPC object names --- Makefile | 12 ++++++++++++ ggml-alloc.c | 6 +++--- ggml-backend.c | 14 ++++++-------- ggml-backend.h | 2 +- ggml-rpc.cpp | 4 ++-- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index c643fe0cc10bf..3878ec917ee48 100644 --- a/Makefile +++ b/Makefile @@ -416,6 +416,12 @@ ifdef LLAMA_BLIS MK_LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS +ifdef LLAMA_RPC + MK_CPPFLAGS += -DGGML_USE_RPC + OBJS += ggml-rpc.o +endif # LLAMA_RPC + + ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -646,6 +652,12 @@ sgemm.o: sgemm.cpp sgemm.h ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ endif +ifdef LLAMA_RPC +ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # LLAMA_RPC + + GF_CC := $(CC) include scripts/get-flags.mk diff --git a/ggml-alloc.c b/ggml-alloc.c index 0146946ebd764..73a3c15756ba1 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * // this tensor was allocated without ggml-backend return; } - ggml_backend_view_init(galloc->buffers[buffer_id], tensor); + ggml_backend_view_init(tensor); } } else { if (tensor->data == NULL) { @@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx, if (t->view_src == NULL) { ggml_tallocr_alloc(&tallocr, t); } else if (t->buffer == NULL) { - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } else { if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } } diff --git a/ggml-backend.c b/ggml-backend.c index 35802f79b8637..05737ed696954 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; if (dst_buf->iface.cpy_tensor) { - return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + return dst_buf->iface.cpy_tensor(dst_buf, src, dst); } return false; } @@ -321,9 +321,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); } else if (ggml_backend_buffer_is_host(dst->buffer)) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - } - bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)) == 0; - if (!same_backend || !ggml_backend_buffer_copy_tensor(src, dst)) { + } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif @@ -1889,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, // utils -void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +void ggml_backend_view_init(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); - tensor->buffer = buffer; + tensor->buffer = tensor->view_src->buffer; tensor->data = (char *)tensor->view_src->data + tensor->view_offs; - ggml_backend_buffer_init_tensor(buffer, tensor); + ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { @@ -1956,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); - ggml_backend_view_init(dst->view_src->buffer, dst); + ggml_backend_view_init(dst); } else { ggml_backend_tensor_copy(src, dst); diff --git a/ggml-backend.h b/ggml-backend.h index 744b6a77457d7..c582b06850ed1 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -225,7 +225,7 @@ extern "C" { // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); - GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); #ifdef __cplusplus diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 49a20df4bd85e..679ce4f280c5f 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer if (remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, - new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"}, + new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, remote_size); return buffer; } else { @@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { /* .endpoint = */ endpoint, - /* .name = */ "RPC", + /* .name = */ "RPC[" + std::string(endpoint) + "]", }; ggml_backend_t backend = new ggml_backend { From 464c75c00e804899f25319ea54a9dd5051f885fb Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 31 May 2024 17:22:05 +0200 Subject: [PATCH 3/4] add rpc-server to Makefile --- Makefile | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 3878ec917ee48..4347cbf563486 100644 --- a/Makefile +++ b/Makefile @@ -67,6 +67,10 @@ ifeq ($(UNAME_S),Darwin) endif endif +ifdef LLAMA_RPC + BUILD_TARGETS += rpc-server +endif + default: $(BUILD_TARGETS) test: $(TEST_TARGETS) @@ -417,11 +421,10 @@ ifdef LLAMA_BLIS endif # LLAMA_BLIS ifdef LLAMA_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJS += ggml-rpc.o + MK_CPPFLAGS += -DGGML_USE_RPC + OBJS += ggml-rpc.o endif # LLAMA_RPC - ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -647,6 +650,10 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h endif endif # LLAMA_METAL +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o +COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o + ifndef LLAMA_NO_LLAMAFILE sgemm.o: sgemm.cpp sgemm.h ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -655,8 +662,13 @@ endif ifdef LLAMA_RPC ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # LLAMA_RPC +rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +endif # LLAMA_RPC GF_CC := $(CC) include scripts/get-flags.mk @@ -737,14 +749,9 @@ unicode.o: unicode.cpp unicode.h unicode-data.o: unicode-data.cpp unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o - llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o - common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ From 243a3e4bb2ffb04248104fb375e61c55e5e42028 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 3 Jun 2024 14:59:49 +0300 Subject: [PATCH 4/4] Update llama.cpp Co-authored-by: slaren --- llama.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 75986ef1d9731..3079220d49867 100644 --- a/llama.cpp +++ b/llama.cpp @@ -16430,15 +16430,16 @@ struct llama_context * llama_new_context_with_model( } #endif #if defined(GGML_USE_RPC) - for (int i = 0; i < (int)model->rpc_servers.size(); i++) { - const char * endpoint = model->rpc_servers[i].c_str(); - ggml_backend_t backend = ggml_backend_rpc_init(endpoint); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint); - llama_free(ctx); - return nullptr; + if (model->n_gpu_layers > 0) { + for (const auto & endpoint : model->rpc_servers) { + ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str()); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str()); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); } - ctx->backends.push_back(backend); } #endif ctx->backend_cpu = ggml_backend_cpu_init();