llama : offload to RPC in addition to other backends #7640

Merged · 4 commits · Jun 3, 2024

Changes from 3 commits

29 changes: 24 additions & 5 deletions Makefile
@@ -67,6 +67,10 @@ ifeq ($(UNAME_S),Darwin)
endif
endif

ifdef LLAMA_RPC
BUILD_TARGETS += rpc-server
endif

default: $(BUILD_TARGETS)

test: $(TEST_TARGETS)
@@ -416,6 +420,11 @@ ifdef LLAMA_BLIS
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
endif # LLAMA_RPC

ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
@@ -641,11 +650,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL

OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif

ifdef LLAMA_RPC
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@

rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@

rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # LLAMA_RPC

GF_CC := $(CC)
include scripts/get-flags.mk

@@ -725,14 +749,9 @@ unicode.o: unicode.cpp unicode.h
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@

OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@

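With these Makefile additions, a build such as `make LLAMA_RPC=1` compiles ggml-rpc.o with -DGGML_USE_RPC and adds the standalone rpc-server binary to the build targets. For orientation only, the sketch below shows roughly what such a server does; it is not the real examples/rpc/rpc-server.cpp and assumes ggml-rpc.h exposes a blocking start_rpc_server() entry point, with the endpoint and memory figures as placeholders:

```cpp
// Illustrative sketch only, not the actual examples/rpc/rpc-server.cpp.
// Assumes ggml-rpc.h declares a blocking start_rpc_server() entry point;
// the endpoint string and advertised memory sizes are placeholders.
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main() {
    // serve tensors from a plain CPU backend on this machine
    ggml_backend_t backend = ggml_backend_cpu_init();

    // memory advertised to clients (placeholder: 8 GiB free / 8 GiB total)
    const size_t free_mem  = 8ull * 1024 * 1024 * 1024;
    const size_t total_mem = 8ull * 1024 * 1024 * 1024;

    // block and handle ggml-rpc requests on this endpoint (hypothetical address)
    start_rpc_server(backend, "0.0.0.0:50052", free_mem, total_mem);

    ggml_backend_free(backend);
    return 0;
}
```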
6 changes: 3 additions & 3 deletions ggml-alloc.c
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
// this tensor was allocated without ggml-backend
return;
}
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
ggml_backend_view_init(tensor);
}
} else {
if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
ggml_backend_view_init(buffer, t);
ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, t);
ggml_backend_view_init(t);
}
}
}
10 changes: 5 additions & 5 deletions ggml-backend.c
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (dst_buf->iface.cpy_tensor) {
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
}
return false;
}
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

// utils

void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
void ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);

tensor->buffer = buffer;
tensor->buffer = tensor->view_src->buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
ggml_backend_buffer_init_tensor(buffer, tensor);
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
Comment on lines +1890 to +1898
@airMeng (Collaborator) commented on Jun 12, 2024:

This breaks the latest SYCL support. Could I ask whether all the other backends assume the buffer is already bound to the right tensor? Of course I know it is an issue of SYCL itself; we are maintaining SYCL in our spare time and are still begging for official support from the company :) I will look into it and try to fix it soon.

@slaren @rgerganov for awareness

A collaborator replied:

It's the same issue that was fixed for the Vulkan backend in #7806; there are more details there about why this happens and why the change was necessary. The best way to fix this for the SYCL backend would be to remove the extras entirely, in the same way they were removed from the CUDA backend.

The PR author replied:

> we are maintaining SYCL in our spare time and are still begging for official support from the company

Having a dedicated machine that runs the CI with the SYCL backend would be very helpful.

}

void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst);
ggml_backend_view_init(dst);
}
else {
ggml_backend_tensor_copy(src, dst);
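The ggml_backend_view_init() change above drops the explicit buffer argument: a view tensor now always takes the buffer of its view_src. A small sketch of the resulting invariant (the helper name is made up for illustration):

```cpp
#include <cassert>

#include "ggml-backend.h"

// Illustrative helper (not part of the PR): initialize a view tensor with the
// new single-argument API and check the invariant it establishes -- the view
// inherits its parent's buffer instead of receiving one from the caller.
static void init_view_and_check(struct ggml_tensor * view) {
    assert(view->view_src != NULL && view->view_src->buffer != NULL);

    // old API: ggml_backend_view_init(view->view_src->buffer, view);
    ggml_backend_view_init(view);

    assert(view->buffer == view->view_src->buffer);
    assert((char *) view->data == (char *) view->view_src->data + view->view_offs);
}
```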
2 changes: 1 addition & 1 deletion ggml-backend.h
@@ -225,7 +225,7 @@ extern "C" {

// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);


#ifdef __cplusplus
4 changes: 2 additions & 2 deletions ggml-rpc.cpp
@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
if (remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
remote_size);
return buffer;
} else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
/* .endpoint = */ endpoint,
/* .name = */ "RPC",
/* .name = */ "RPC[" + std::string(endpoint) + "]",
};

ggml_backend_t backend = new ggml_backend {
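The two ggml-rpc.cpp changes are purely cosmetic: buffer and backend names now embed the server endpoint, so multiple RPC devices can be told apart in logs. A trivial sketch of the resulting string (the endpoint value is hypothetical):

```cpp
#include <iostream>
#include <string>

int main() {
    // placeholder endpoint; any host:port passed to the RPC backend would appear here
    const std::string endpoint = "192.168.88.10:50052";

    // before this PR both the buffer name and the backend name were just "RPC"
    const std::string name = "RPC[" + endpoint + "]";

    std::cout << name << std::endl; // prints: RPC[192.168.88.10:50052]
    return 0;
}
```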
87 changes: 50 additions & 37 deletions llama.cpp
@@ -2371,13 +2371,34 @@ struct llama_context {
struct llama_control_vector cvec;
};

static size_t llama_get_device_count(const llama_model & model) {
size_t count = 1;
#if defined(GGML_USE_CUDA)
count = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
count = ggml_backend_vk_get_device_count();
#endif
#if defined(GGML_USE_RPC)
count += model.rpc_servers.size();
#endif
return count;
GGML_UNUSED(model);
}
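For orientation, local GPU devices come first in this count and the RPC servers are appended at the end; the helpers below recover the RPC index with device - dev_count + rpc_count. A standalone sketch of that arithmetic with made-up endpoints:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical helper, for illustration only: map a global device index to a
// name, mirroring the "local devices first, RPC servers last" ordering.
static std::string device_name(int device, int local_count,
                               const std::vector<std::string> & rpc_servers) {
    const int rpc_count = (int) rpc_servers.size();
    const int dev_count = local_count + rpc_count;
    if (device >= dev_count - rpc_count) {
        return "RPC[" + rpc_servers[device - dev_count + rpc_count] + "]";
    }
    return "local GPU " + std::to_string(device);
}

int main() {
    // e.g. 2 local CUDA devices plus two rpc-server endpoints (placeholders)
    const std::vector<std::string> rpc = { "host1:50052", "host2:50052" };
    assert(device_name(0, 2, rpc) == "local GPU 0");
    assert(device_name(1, 2, rpc) == "local GPU 1");
    assert(device_name(2, 2, rpc) == "RPC[host1:50052]");
    assert(device_name(3, 2, rpc) == "RPC[host2:50052]");
    return 0;
}
```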

static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;

#ifdef GGML_USE_RPC
std::string endpoint = model.rpc_servers[gpu];
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
#elif defined(GGML_USE_METAL)
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
if (gpu >= dev_count - rpc_count) {
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
return ggml_backend_rpc_buffer_type(endpoint);
}
#endif
#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split);
}

static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}

static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
size_t total;
size_t free;
std::string endpoint = model.rpc_servers[device];
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
return free;
#elif defined(GGML_USE_CUDA)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
if (device >= dev_count - rpc_count) {
size_t total;
size_t free;
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
}
#endif
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16162,7 +16173,7 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr) {
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers and set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
@@ -16325,17 +16336,7 @@ struct llama_context * llama_new_context_with_model(

if (!hparams.vocab_only) {
// initialize backends
#if defined(GGML_USE_RPC)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@@ -16427,6 +16428,18 @@ }
}
ctx->backends.push_back(backend);
}
#endif
#if defined(GGML_USE_RPC)
for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
const char * endpoint = model->rpc_servers[i].c_str();
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
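Taken together, the llama.cpp changes initialize the RPC backends in addition to (and after) the local GPU backends, so layers can be offloaded to local devices and to remote rpc-server instances in the same run. A minimal usage sketch, assuming a build with GGML_USE_RPC, hypothetical endpoints, and a placeholder model path:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;  // offload as many layers as possible
    // comma-separated rpc-server endpoints, as parsed in llama_load_model_from_file()
    // (addresses are hypothetical)
    mparams.rpc_servers = "192.168.88.10:50052,192.168.88.11:50052";

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize and decode as usual; layers run on local GPUs and RPC servers ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```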