From 22da05536ff4ad963080773bef1fb839fdab95d3 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Sun, 26 Nov 2023 02:30:02 -0600 Subject: [PATCH 01/27] metal : fix yarn (#4220) get the correct n_orig_ctx in metal --- ggml-metal.m | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index a9fdd39035aa31..d52a1c3c48210b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1433,7 +1433,8 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); From 922754a8d60080e956891f6cee1fb03aa48d57c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Nov 2023 20:33:07 +0200 Subject: [PATCH 02/27] lookahead : add example for lookahead decoding (#4207) * lookahead : init * lookahead : generate and store n-grams * lookahead : use loop instead recursion to generate n-grams * lookahead : initial working implementation * lookahead : filter repeating n-grams * lookahead : use deterministic init * lookahead : add to Makefile * lookahead : fix a bug in the seq_id of the lookahead tokens * lookahead : add comments --------- Co-authored-by: slaren --- .gitignore | 1 + Makefile | 5 +- examples/CMakeLists.txt | 1 + examples/lookahead/CMakeLists.txt | 5 + examples/lookahead/lookahead.cpp | 487 ++++++++++++++++++++++++++++++ 5 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 examples/lookahead/CMakeLists.txt create mode 100644 examples/lookahead/lookahead.cpp diff --git a/.gitignore b/.gitignore index 41259a12f50cbc..3806e05ddcc126 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 
@@ models-mnt /libllama.so /llama-bench /llava-cli +/lookahead /main /metal /perplexity diff --git a/Makefile b/Makefile index a6d2c2ec0f380e..95d85236f8f241 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -657,6 +657,9 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + ifdef LLAMA_METAL metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 71bcb6893e20d0..6744944fd8b992 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -32,6 +32,7 @@ else() add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(speculative) + add_subdirectory(lookahead) add_subdirectory(train-text-from-scratch) if (LLAMA_METAL) add_subdirectory(metal) diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt new file mode 100644 index 00000000000000..8827e3f11ecd66 --- /dev/null +++ b/examples/lookahead/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET lookahead) +add_executable(${TARGET} lookahead.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 
+target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp new file mode 100644 index 00000000000000..4c49a85ebcde7c --- /dev/null +++ b/examples/lookahead/lookahead.cpp @@ -0,0 +1,487 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include + +struct ngram_data { + bool active = false; + + llama_seq_id seq_id = -1; + + std::vector i_batch; + + std::vector tokens; +}; + +// n-gram container +struct ngram_container { + ngram_container(int n_vocab, int N, int G) { + cnt.resize(n_vocab); + head.resize(n_vocab); + tokens.resize(n_vocab * G * (N - 1)); + } + + int n_total = 0; + + std::vector cnt; + std::vector head; + + // [n_vocab][G][N - 1] + // for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1 + std::vector tokens; +}; + +int main(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + const int W = 15; // lookahead window + const int N = 5; // n-gram size + const int G = 15; // max verification n-grams + + const bool dump_kv_cache = params.dump_kv_cache; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("lookahead", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // init llama.cpp + llama_backend_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the target model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + + // Tokenize the prompt + const bool add_bos = llama_should_add_bos_token(model); + LOG("add_bos tgt: %d\n", add_bos); + + std::vector inp; + std::vector all; + + inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + all = inp; + + const int max_context_size = llama_n_ctx(ctx); + const int max_tokens_list_size = max_context_size - 4; + + if ((int) inp.size() > max_tokens_list_size) { + fprintf(stderr, "%s: error: prompt 
too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + return 1; + } + + fprintf(stderr, "\n\n"); + + for (auto id : inp) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + } + + fflush(stderr); + + const int n_input = inp.size(); + + const auto t_enc_start = ggml_time_us(); + + // eval the prompt + llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); + llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); + + for (int s = 1; s < W + G + 1; ++s) { + llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + } + + const auto t_enc_end = ggml_time_us(); + + int n_predict = 0; + int n_accept = 0; + + int n_past = inp.size(); + + llama_token id = 0; + + // used to determine end of generation + bool has_eos = false; + + // for each decoded batch, we have at most W + G + 1 distinct sequences: + // seq_id == 0 : the current input token + // seq_id [1, W] : tokens from the past N - 1 Jacobi iterations + // seq_id [W + 1, W + G] : verification n-grams + llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); + + // target model sampling context + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + + // verification n-grams + std::vector ngrams_cur(G); + + // tokens for the past N - 1 Jacobi iterations + std::vector tokens_j_prev(W); + std::vector> tokens_j(N - 1); + for (int j = 0; j < N - 1; j++) { + tokens_j[j].resize(W); + + for (int i = 0; i < W; i++) { + // there are different ways to init these tokens + if (0) { + // initialize randomly from the prompt tokens + tokens_j[j][i] = all[1 + rand() % (all.size() - 1)]; + } else { + // initialize with a sequence of increasing numbers + tokens_j[j][i] = 100 + i; + } + } + } + + std::vector seq_id_look; + + // the input token belongs both to all sequences + std::vector seq_id_all(W + G + 1); + for (int i = 0; i < W + G + 1; i++) { + seq_id_all[i] = i; + } + + // here we keep adding new n-grams as we go + 
ngram_container ngrams_observed(llama_n_vocab(model), N, G); + + // debug + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); + + const auto t_dec_start = ggml_time_us(); + + // sample first token + { + id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0); + + llama_sampling_accept(ctx_sampling, ctx, id, true); + + { + const std::string token_str = llama_token_to_piece(ctx, id); + + printf("%s", token_str.c_str()); + fflush(stdout); + } + } + + while (true) { + // debug + if (dump_kv_cache) { + llama_kv_cache_view_update(ctx, &kvc_view); + dump_kv_cache_view_seqs(kvc_view, 40); + } + + // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ + // + // Example for W = 5, N = 4, G = 2: + // (I = input, L = lookahead, V = verification) + // + // Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + // T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0 + // Info: I L L L L L L L L L L L L L L V V V V V V + // Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past) + // Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + // --------------------------------------------------------------------- + // Seq: 0 + // 1 1 1 + // 2 2 2 2 + // 3 3 3 3 3 + // 4 4 4 4 4 4 + // 5 5 5 5 5 5 5 + // 6 6 6 6 + // 7 7 7 7 + // --------------------------------------------------------------------- + // | | | | | | | | | | | + // V V V V V | | | | | | + // j_tokens | | | | | | + // V V V V V V + // id + { + llama_batch_clear(batch); + + // current token - first token of the first level + llama_batch_add(batch, id, n_past, seq_id_all, true); + + // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation + { + const int g_cur = ngrams_observed.cnt[id]; + + ngrams_cur.resize(g_cur); + for (int g = 0; g < g_cur; g++) { + ngrams_cur[g].active = true; + ngrams_cur[g].tokens.resize(N); + ngrams_cur[g].i_batch.resize(N); + ngrams_cur[g].seq_id = W + 1 + g; + ngrams_cur[g].i_batch[0] = 0; + ngrams_cur[g].tokens 
[0] = id; + } + + for (int j = 0; j < N - 1; j++) { + for (int g = 0; g < g_cur; g++) { + const int idx = id*(N - 1)*G + g*(N - 1); + + const llama_token t = ngrams_observed.tokens[idx + j]; + + ngrams_cur[g].tokens [j + 1] = t; + ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; + + llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); + } + } + } + + // fill the remaining W - 1 tokens for the first level + for (int i = 1; i < W; i++) { + seq_id_look.resize(W - i); + for (int j = 0; j < W - i; j++) { + seq_id_look[j] = i + j + 1; + } + + llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); + } + + // fill the rest of the levels + for (int j = 1; j < N - 1; j++) { + for (int i = 0; i < W; i++) { + llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); + } + } + } + + if (llama_decode(ctx, batch) != 0) { + fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); + return 1; + } + + int seq_id_best = 0; + + for (int v = 0; v < N; ++v) { + int i_batch = 0; + + // if no active ngrams are left, it means the sampled token does not pass the verification + if (v > 0) { + for (int g = 0; g < (int) ngrams_cur.size(); g++) { + if (ngrams_cur[g].active) { + i_batch = ngrams_cur[g].i_batch[v]; + seq_id_best = ngrams_cur[g].seq_id; + + ++n_accept; + break; + } + } + + // no more matches -> create a new batch + if (i_batch == 0) { + break; + } + } + + // sample the next token + id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch); + + llama_sampling_accept(ctx_sampling, ctx, id, true); + + // print + { + const std::string token_str = llama_token_to_piece(ctx, id); + + if (v == 0) { + printf("%s", token_str.c_str()); + } else { + // print light cyan + printf("\033[0;96m%s\033[0m", token_str.c_str()); + } + fflush(stdout); + + if (id == llama_token_eos(model)) { + has_eos = true; + } + + all.push_back(id); + } + + ++n_predict; + ++n_past; + + if (n_predict > params.n_predict || has_eos) { + 
break; + } + + // verify across active n-grams + for (int g = 0; g < (int) ngrams_cur.size(); g++) { + if (ngrams_cur[g].active) { + if (v == N - 1) { + ngrams_cur[g].active = false; + } else { + if (id != ngrams_cur[g].tokens[v + 1]) { + ngrams_cur[g].active = false; + } + } + } + } + + // print known n-grams starting with token id (debug) + if (0 && v == 0) { + if (ngrams_observed.cnt[id] > 0) { + printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + } + + for (int i = 0; i < ngrams_observed.cnt[id]; i++) { + printf(" - ngram %2d: ", i); + + const int idx = id*(N - 1)*G + i*(N - 1); + + for (int j = 0; j < N - 1; j++) { + const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); + + printf("%s", token_str.c_str()); + } + + printf("\n"); + } + } + + // update lookahead tokens + { + for (int i = 0; i < W; i++) { + tokens_j_prev[i] = tokens_j[0][i]; + } + + for (int j = 0; j < N - 2; j++) { + tokens_j[j] = tokens_j[j + 1]; + } + + if (v == 0) { + // sample from the last level + for (int i = 0; i < W; i++) { + tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + } + } else { + for (int i = 0; i < W; i++) { + // there are different ways to init these tokens + if (0) { + // random init + tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)]; + } else { + // init from the previous level + tokens_j[N - 2][i] = tokens_j[0][i]; + } + } + } + } + + // update observed ngrams + if (v == 0) { + // the first token of the n-gram is determined by the index in the container so it is not stored + std::vector ngram(N - 1); + + // n-gram generation + // ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518 + for (int f = 0; f < W; ++f) { + const int ft = tokens_j_prev[f]; // first token of the n-gram + + for (int j = 0; j < N - 1; ++j) { + ngram[j] = tokens_j[j][f]; + } + + // filter-out repeating 
n-grams + { + bool is_unique = true; + + for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) { + const int idx = ft*(N - 1)*G + k*(N - 1); + + bool is_match = true; + for (int j = 0; j < N - 1; ++j) { + if (ngrams_observed.tokens[idx + j] != ngram[j]) { + is_match = false; + break; + } + } + + if (is_match) { + is_unique = false; + break; + } + } + + if (!is_unique) { + continue; + } + } + + const int head = ngrams_observed.head[ft]; + const int idx = ft*(N - 1)*G + head*(N - 1); + + for (int i = 0; i < N - 1; i++) { + ngrams_observed.tokens[idx + i] = ngram[i]; + } + + ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1); + ngrams_observed.head[ft] = (head + 1) % G; + + ngrams_observed.n_total++; + } + } + } + + if (n_predict > params.n_predict || has_eos) { + break; + } + + // KV cache management + // if no verification token matched, we simply remove all cells from this batch -> no fragmentation + llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + + if (seq_id_best != 0) { + // if a verification token matched, we keep the best sequence and remove the rest + // this leads to some KV cache fragmentation + llama_kv_cache_seq_keep(ctx, seq_id_best); + llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + + for (int s = 1; s < W + G + 1; ++s) { + llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + } + } + } + + auto t_dec_end = ggml_time_us(); + + LOG_TEE("\n\n"); + + LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + + LOG_TEE("\n"); + LOG_TEE("W = %2d\n", W); + LOG_TEE("N = %2d\n", N); + LOG_TEE("G = %2d\n", G); + LOG_TEE("\n"); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_accept = %d\n", n_accept); + + llama_print_timings(ctx); + + 
llama_kv_cache_view_free(&kvc_view); + llama_sampling_free(ctx_sampling); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} From 9656026b53236ed7328458269c4c798dd50ac8d1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Nov 2023 20:42:51 +0200 Subject: [PATCH 03/27] readme : update hot topics --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2f83a71fdee6f3..2892132c426a35 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167 From 3e73d31d9cc0232882ce61c64742aff3ecfec416 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Nov 2023 21:51:46 +0200 Subject: [PATCH 04/27] lookahead : support `-n -1` infinite generation --- examples/lookahead/lookahead.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 4c49a85ebcde7c..e55a15a1bf054b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -311,7 +311,7 @@ int main(int argc, char ** argv) { ++n_predict; ++n_past; - if (n_predict > params.n_predict || has_eos) { + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { break; } @@ -433,7 +433,7 @@ int main(int argc, char ** argv) { } } - if (n_predict > params.n_predict || has_eos) { + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { break; } From f3b269813f6147c5b5cda082e6b45cf04a932e0d Mon Sep 17 00:00:00 2001 From: Jared Van Bortel 
Date: Sun, 26 Nov 2023 22:58:43 -0500 Subject: [PATCH 05/27] ggml : fix -Warray-bounds warning with gcc (#4231) --- ggml.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index f92292b39c635e..0c7264a36216d4 100644 --- a/ggml.c +++ b/ggml.c @@ -15689,13 +15689,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = 1; } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; default: { - printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op)); + fprintf(stderr, "%s: op not implemented: ", __func__); + if (node->op < GGML_OP_COUNT) { + fprintf(stderr, "%s\n", ggml_op_name(node->op)); + } else { + fprintf(stderr, "%d\n", node->op); + } GGML_ASSERT(false); } break; } From bb03290c17540768a16000a2b01ee4f22440aba1 Mon Sep 17 00:00:00 2001 From: Bailey Chittle <39804642+bachittle@users.noreply.github.com> Date: Mon, 27 Nov 2023 09:56:52 -0500 Subject: [PATCH 06/27] examples : iOS example with swift ui (#4159) * copy to llama.cpp as subdir * attempt enabling metal, fails * ggml metal compiles! * Update README.md * initial conversion to new format, utf8 errors? * bug fixes, but now has an invalid memory access :( * added O3, now has insufficient memory access * begin sync with master * update to match latest code, new errors * fixed it! 
* fix for loop conditionals, increase result size * fix current workflow errors * attempt a llama.swiftui workflow * Update .github/workflows/build.yml Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 11 + examples/llama.swiftui/.gitignore | 1 + examples/llama.swiftui/README.md | 7 + .../llama.cpp.swift/LibLlama.swift | 176 +++++++ .../llama.cpp.swift/bridging-header.h | 5 + .../llama.swiftui.xcodeproj/project.pbxproj | 481 ++++++++++++++++++ .../contents.xcworkspacedata | 7 + .../xcshareddata/IDEWorkspaceChecks.plist | 8 + .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 13 + .../Assets.xcassets/Contents.json | 6 + .../llama.swiftui/Models/LlamaState.swift | 45 ++ .../Preview Assets.xcassets/Contents.json | 6 + .../llama.swiftui/Resources/models/.gitignore | 0 .../llama.swiftui/UI/ContentView.swift | 42 ++ .../llama.swiftui/llama_swiftuiApp.swift | 10 + 16 files changed, 829 insertions(+) create mode 100644 examples/llama.swiftui/.gitignore create mode 100644 examples/llama.swiftui/README.md create mode 100644 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift create mode 100644 examples/llama.swiftui/llama.cpp.swift/bridging-header.h create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata create mode 100644 examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json create mode 100644 examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift create mode 100644 
examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json create mode 100644 examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore create mode 100644 examples/llama.swiftui/llama.swiftui/UI/ContentView.swift create mode 100644 examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc295d52d2d5d2..22be233e6d11e6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -498,6 +498,17 @@ jobs: path: | cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + ios-xcode-build: + runs-on: macos-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Build Xcode project + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + + # freeBSD-latest: # runs-on: macos-12 # steps: diff --git a/examples/llama.swiftui/.gitignore b/examples/llama.swiftui/.gitignore new file mode 100644 index 00000000000000..9bce6af399ba96 --- /dev/null +++ b/examples/llama.swiftui/.gitignore @@ -0,0 +1 @@ +xcuserdata diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md new file mode 100644 index 00000000000000..fa68e6ed8e34db --- /dev/null +++ b/examples/llama.swiftui/README.md @@ -0,0 +1,7 @@ +# llama.swiftui + +Local inference of llama.cpp on an iPhone. +So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well. 
+ +https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545 + diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift new file mode 100644 index 00000000000000..aaef09611bc909 --- /dev/null +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -0,0 +1,176 @@ +import Foundation + +// import llama + +enum LlamaError: Error { + case couldNotInitializeContext +} + +actor LlamaContext { + private var model: OpaquePointer + private var context: OpaquePointer + private var batch: llama_batch + private var tokens_list: [llama_token] + + var n_len: Int32 = 512 + var n_cur: Int32 = 0 + var n_decode: Int32 = 0 + + init(model: OpaquePointer, context: OpaquePointer) { + self.model = model + self.context = context + self.tokens_list = [] + self.batch = llama_batch_init(512, 0, 1) + } + + deinit { + llama_free(context) + llama_free_model(model) + llama_backend_free() + } + + static func createContext(path: String) throws -> LlamaContext { + llama_backend_init(false) + let model_params = llama_model_default_params() + + let model = llama_load_model_from_file(path, model_params) + guard let model else { + print("Could not load model at \(path)") + throw LlamaError.couldNotInitializeContext + } + var ctx_params = llama_context_default_params() + ctx_params.seed = 1234 + ctx_params.n_ctx = 2048 + ctx_params.n_threads = 8 + ctx_params.n_threads_batch = 8 + + let context = llama_new_context_with_model(model, ctx_params) + guard let context else { + print("Could not load context!") + throw LlamaError.couldNotInitializeContext + } + + return LlamaContext(model: model, context: context) + } + + func get_n_tokens() -> Int32 { + return batch.n_tokens; + } + + func completion_init(text: String) { + print("attempting to complete \"\(text)\"") + + tokens_list = tokenize(text: text, add_bos: true) + + let n_ctx = llama_n_ctx(context) + let n_kv_req = tokens_list.count + (Int(n_len) - 
tokens_list.count) + + print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)") + + if n_kv_req > n_ctx { + print("error: n_kv_req > n_ctx, the required KV cache size is not big enough") + } + + for id in tokens_list { + print(token_to_piece(token: id)) + } + + // batch = llama_batch_init(512, 0) // done in init() + batch.n_tokens = Int32(tokens_list.count) + + for i1 in 0.. String { + var new_token_id: llama_token = 0 + + let n_vocab = llama_n_vocab(model) + let logits = llama_get_logits_ith(context, batch.n_tokens - 1) + + var candidates = Array() + candidates.reserveCapacity(Int(n_vocab)) + + for token_id in 0.. [llama_token] { + let n_tokens = text.count + (add_bos ? 1 : 0) + let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) + let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false) + + var swiftTokens: [llama_token] = [] + for i in 0.. String { + let result = UnsafeMutablePointer.allocate(capacity: 8) + result.initialize(repeating: Int8(0), count: 8) + + let _ = llama_token_to_piece(model, token, result, 8) + + let resultStr = String(cString: result) + + result.deallocate() + + return resultStr + } +} diff --git a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h new file mode 100644 index 00000000000000..6cd72c97919eaf --- /dev/null +++ b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h @@ -0,0 +1,5 @@ +// +// Use this file to import your target's public headers that you would like to expose to Swift. +// + +#import "llama.h" diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj new file mode 100644 index 00000000000000..bc1fd15cebb317 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -0,0 +1,481 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; }; + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; }; + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; + 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; }; + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in 
Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; + 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; + 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; + 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; + 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; + 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; + 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; + 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = ""; }; + 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; + 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; + 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; + 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; + 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; + 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; + 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; + 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = 
llama_swiftuiApp.swift; sourceTree = ""; }; + 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = ""; }; + 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; + 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 8A1C83702AC328BD0096AF73 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { + isa = PBXGroup; + children = ( + 
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, + 542376092B0D9C40008E6A1C /* ggml-backend.h */, + 542376062B0D9BEA008E6A1C /* ggml-quants.h */, + 542376072B0D9BFB008E6A1C /* ggml-quants.c */, + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, + 549479C62AC9E0F200E0F78B /* ggml-metal.h */, + 549479C52AC9E0F200E0F78B /* ggml-metal.m */, + 542EA09B2AC8723900A8AEE9 /* ggml.c */, + 542EA09C2AC8723900A8AEE9 /* ggml.h */, + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, + 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, + 542EA0A12AC8729100A8AEE9 /* llama.cpp */, + 542EA0A22AC8729100A8AEE9 /* llama.h */, + ); + name = llama.cpp; + sourceTree = ""; + }; + 8A1C836A2AC328BD0096AF73 = { + isa = PBXGroup; + children = ( + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, + 8A907F312AC7134E006146EA /* llama.cpp.swift */, + 8A3F84232AC4C891005E2EE8 /* models */, + 8A1C83752AC328BD0096AF73 /* llama.swiftui */, + 8A1C83742AC328BD0096AF73 /* Products */, + 8A39BE082AC7601000BFEB40 /* Frameworks */, + ); + sourceTree = ""; + }; + 8A1C83742AC328BD0096AF73 /* Products */ = { + isa = PBXGroup; + children = ( + 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, + ); + name = Products; + sourceTree = ""; + }; + 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXGroup; + children = ( + 8A3F84102AC4BD85005E2EE8 /* Resources */, + 8A9F7C4B2AC332DC008AE1EA /* Models */, + 8A9F7C4A2AC332BF008AE1EA /* UI */, + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, + 8A1C837C2AC328BE0096AF73 /* Preview Content */, + ); + path = llama.swiftui; + sourceTree = ""; + }; + 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { + isa = PBXGroup; + children = ( + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, + ); + path = "Preview Content"; + sourceTree = ""; + }; + 8A39BE082AC7601000BFEB40 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 549479CA2AC9E16000E0F78B /* Metal.framework */, + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, + ); 
+ name = Frameworks; + sourceTree = ""; + }; + 8A3F84102AC4BD85005E2EE8 /* Resources */ = { + isa = PBXGroup; + children = ( + 8A3F84112AC4BD8C005E2EE8 /* models */, + ); + path = Resources; + sourceTree = ""; + }; + 8A3F84112AC4BD8C005E2EE8 /* models */ = { + isa = PBXGroup; + children = ( + 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */, + ); + path = models; + sourceTree = ""; + }; + 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { + isa = PBXGroup; + children = ( + 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, + 8A907F322AC7134E006146EA /* LibLlama.swift */, + ); + path = llama.cpp.swift; + sourceTree = ""; + }; + 8A9F7C4A2AC332BF008AE1EA /* UI */ = { + isa = PBXGroup; + children = ( + 8A1C83782AC328BD0096AF73 /* ContentView.swift */, + ); + path = UI; + sourceTree = ""; + }; + 8A9F7C4B2AC332DC008AE1EA /* Models */ = { + isa = PBXGroup; + children = ( + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, + ); + path = Models; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXNativeTarget; + buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; + buildPhases = ( + 8A1C836F2AC328BD0096AF73 /* Sources */, + 8A1C83702AC328BD0096AF73 /* Frameworks */, + 8A1C83712AC328BD0096AF73 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = llama.swiftui; + packageProductDependencies = ( + ); + productName = llama.swiftui; + productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 8A1C836B2AC328BD0096AF73 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + 8A1C83722AC328BD0096AF73 = { + 
CreatedOnToolsVersion = 15.0; + LastSwiftMigration = 1500; + }; + }; + }; + buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 8A1C836A2AC328BD0096AF73; + packageReferences = ( + ); + productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8A1C83722AC328BD0096AF73 /* llama.swiftui */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 8A1C83712AC328BD0096AF73 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, + 8A3F84242AC4C891005E2EE8 /* models in Resources */, + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 8A1C836F2AC328BD0096AF73 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, + 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, + 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration 
section */ + 8A1C837F2AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + 
LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 8A1C83802AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + 
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + 8A1C83822AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 8A1C83832AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + 
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8A1C837F2AC328BE0096AF73 /* Debug */, + 8A1C83802AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8A1C83822AC328BE0096AF73 /* Debug */, + 8A1C83832AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; 
+/* End XCConfigurationList section */ + }; + rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; +} diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 00000000000000..919434a6254f0e --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 00000000000000..3d4c1e55259fee --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 00000000000000..eb878970081645 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 00000000000000..13613e3ee1a934 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,13 @@ +{ + "images" : [ + { + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git 
a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json new file mode 100644 index 00000000000000..73c00596a7fca3 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift new file mode 100644 index 00000000000000..babc60cdcc9dcc --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -0,0 +1,45 @@ +import Foundation + +@MainActor +class LlamaState: ObservableObject { + @Published var messageLog = "" + + private var llamaContext: LlamaContext? + private var modelUrl: URL? { + Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") + // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models") + } + init() { + do { + try loadModel() + } catch { + messageLog += "Error!\n" + } + } + + private func loadModel() throws { + messageLog += "Loading model...\n" + if let modelUrl { + llamaContext = try LlamaContext.createContext(path: modelUrl.path()) + messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" + } else { + messageLog += "Could not locate model\n" + } + } + + func complete(text: String) async { + guard let llamaContext else { + return + } + messageLog += "Attempting to complete text...\n" + await llamaContext.completion_init(text: text) + messageLog += "\(text)" + + while await llamaContext.n_cur <= llamaContext.n_len { + let result = await llamaContext.completion_loop() + messageLog += "\(result)" + } + await llamaContext.clear() + messageLog += "\n\ndone\n" + } +} diff --git a/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json b/examples/llama.swiftui/llama.swiftui/Preview Content/Preview 
Assets.xcassets/Contents.json new file mode 100644 index 00000000000000..73c00596a7fca3 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore b/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift new file mode 100644 index 00000000000000..0bd16a806d10fa --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -0,0 +1,42 @@ +import SwiftUI + +struct ContentView: View { + @StateObject var llamaState = LlamaState() + + @State private var multiLineText = "" + + var body: some View { + VStack { + ScrollView(.vertical) { + Text(llamaState.messageLog) + } + + TextEditor(text: $multiLineText) + .frame(height: 200) + .padding() + .border(Color.gray, width: 0.5) + Button(action: { + sendText() + }) { + Text("Send") + .padding() + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + } + } + .padding() + } + + func sendText() { + Task { + await llamaState.complete(text: multiLineText) + multiLineText = "" + } + } +} +/* +#Preview { + ContentView() +} +*/ diff --git a/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift b/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift new file mode 100644 index 00000000000000..cccda8a979f5e4 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift @@ -0,0 +1,10 @@ +import SwiftUI + +@main +struct llama_swiftuiApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} From 0dab8cd7cca7e1bc3550dcb4797b9062cdbb1ebd Mon Sep 17 00:00:00 2001 From: Kasumi <90275229+kasumi-1@users.noreply.github.com> Date: Tue, 28 Nov 2023 01:39:42 +0800 
Subject: [PATCH 07/27] readme : add Amica to UI list (#4230) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2892132c426a35..d0d6c96663449f 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ as the main playground for developing new features for the [ggml](https://github - [nat/openplayground](https://github.com/nat/openplayground) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) - [withcatai/catai](https://github.com/withcatai/catai) +- [semperai/amica](https://github.com/semperai/amica) --- From b38a16dfcff88d547f78f52d1bea31b84a05aff7 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Mon, 27 Nov 2023 15:25:42 -0400 Subject: [PATCH 08/27] cmake : fix issue with version info not getting baked into LlamaConfig.cmake (#3970) * Split CPP generation from build-info query * Remove blank lines * Add BUILD_SHARED_LIBS option --- CMakeLists.txt | 4 ++++ common/CMakeLists.txt | 2 +- scripts/build-info.cmake | 22 ---------------------- scripts/gen-build-info-cpp.cmake | 24 ++++++++++++++++++++++++ 4 files changed, 29 insertions(+), 23 deletions(-) create mode 100644 scripts/gen-build-info-cpp.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f32df5fe52335e..3e0009415a13f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ else() endif() # general +option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_LTO "llama: enable link time optimization" OFF) @@ -100,6 +101,9 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) +# Required for relocatable CMake package +include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) + # # Compile flags # diff --git 
a/common/CMakeLists.txt b/common/CMakeLists.txt index 4f930bdc590592..71891edc3cc1f0 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -26,7 +26,7 @@ add_custom_command( COMMENT "Generating build details from Git" COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake" + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} VERBATIM diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake index 73853dfa47f41e..ea3dc55c83439a 100644 --- a/scripts/build-info.cmake +++ b/scripts/build-info.cmake @@ -1,5 +1,3 @@ -set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") -set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") set(BUILD_NUMBER 0) set(BUILD_COMMIT "unknown") set(BUILD_COMPILER "unknown") @@ -58,23 +56,3 @@ else() ) set(BUILD_TARGET ${OUT}) endif() - -# Only write the build info if it changed -if(EXISTS ${OUTPUT_FILE}) - file(READ ${OUTPUT_FILE} CONTENTS) - string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_COMMIT ${CMAKE_MATCH_1}) - string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_COMPILER ${CMAKE_MATCH_1}) - string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_TARGET ${CMAKE_MATCH_1}) - if ( - NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR - NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR - NOT OLD_TARGET STREQUAL BUILD_TARGET - ) - configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) - endif() -else() - configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) -endif() diff --git a/scripts/gen-build-info-cpp.cmake 
b/scripts/gen-build-info-cpp.cmake new file mode 100644 index 00000000000000..d8933892011b36 --- /dev/null +++ b/scripts/gen-build-info-cpp.cmake @@ -0,0 +1,24 @@ +include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) + +set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") +set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") + +# Only write the build info if it changed +if(EXISTS ${OUTPUT_FILE}) + file(READ ${OUTPUT_FILE} CONTENTS) + string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) + set(OLD_COMMIT ${CMAKE_MATCH_1}) + string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) + set(OLD_COMPILER ${CMAKE_MATCH_1}) + string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) + set(OLD_TARGET ${CMAKE_MATCH_1}) + if ( + NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR + NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR + NOT OLD_TARGET STREQUAL BUILD_TARGET + ) + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) + endif() +else() + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) +endif() From 8406b0924bf323f37d536dee8b8165c1f3d9d11d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Nov 2023 10:32:03 +0200 Subject: [PATCH 09/27] ggml : re-enable BLAS for CPU when src0 != F32 + remove redundant full offload checks in llama.cpp (#4240) * ggml : use blas even if src0 is not F32 * llama : use n_threads_batch only when n_tokens >= 32 ggml-ci * llama : revert n_threads_batch logic ggml-ci --- ggml.c | 2 +- llama.cpp | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index 0c7264a36216d4..c522a101f15526 100644 --- a/ggml.c +++ b/ggml.c @@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas( // TODO: find the optimal values for these if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && - src0->type == GGML_TYPE_F32 && + //src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { diff --git 
a/llama.cpp b/llama.cpp index f2b5967d791e92..cb544228b9f021 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5550,18 +5550,8 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } - // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = - model.arch == LLM_ARCH_LLAMA || - model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || - model.arch == LLM_ARCH_MPT || - model.arch == LLM_ARCH_STARCODER || - model.arch == LLM_ARCH_STABLELM; - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; - if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + if (ggml_cpu_has_cublas() && fully_offloaded) { n_threads = 1; } From 64e64aa2557d97490b2fe1262b313e2f4a1607e3 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 28 Nov 2023 04:51:11 -0500 Subject: [PATCH 10/27] ggml : restore abort() in GGML_ASSERT (#4242) --- ggml.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml.h b/ggml.h index f2fce0f22d357a..4d6d4edfd933c5 100644 --- a/ggml.h +++ b/ggml.h @@ -244,11 +244,10 @@ #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - fflush(stderr); \ fflush(stdout); \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ ggml_print_backtrace(); \ - exit(1); \ + abort(); \ } \ } while (0) From 4fea3420ee3918d125d74c94d962a6ea82875351 Mon Sep 17 00:00:00 2001 From: Peter Sugihara Date: Tue, 28 Nov 2023 23:16:34 -0800 Subject: [PATCH 11/27] readme : add FreeChat (#4248) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d0d6c96663449f..44cc940930e898 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ as the main playground for developing new features for the [ggml](https://github - 
[oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) - [withcatai/catai](https://github.com/withcatai/catai) - [semperai/amica](https://github.com/semperai/amica) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) --- From 1f5cd83275fabb43f2ae92c30033b384a3eb37b4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Nov 2023 11:00:17 +0200 Subject: [PATCH 12/27] examples : add readme files --- examples/lookahead/README.md | 7 +++++++ examples/speculative/README.md | 8 ++++++++ 2 files changed, 15 insertions(+) create mode 100644 examples/lookahead/README.md create mode 100644 examples/speculative/README.md diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md new file mode 100644 index 00000000000000..252a6689ef5285 --- /dev/null +++ b/examples/lookahead/README.md @@ -0,0 +1,7 @@ +# llama.cpp/examples/lookahead + +Demonstration of lookahead decoding technique: + +https://lmsys.org/blog/2023-11-21-lookahead-decoding/ + +More info: https://github.com/ggerganov/llama.cpp/pull/4207 diff --git a/examples/speculative/README.md b/examples/speculative/README.md new file mode 100644 index 00000000000000..d88fd37901443f --- /dev/null +++ b/examples/speculative/README.md @@ -0,0 +1,8 @@ +# llama.cpp/examples/speculative + +Demonstration of speculative decoding and tree-based speculative decoding techniques + +More info: + +- https://github.com/ggerganov/llama.cpp/pull/2926 +- https://github.com/ggerganov/llama.cpp/pull/3624 From e2bd725f4b39bc5c6234858d158e01248f5ab5bd Mon Sep 17 00:00:00 2001 From: rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> Date: Thu, 30 Nov 2023 20:50:40 +0000 Subject: [PATCH 13/27] py : fix oai proxy (#3972) * fix oai proxy fix generation not stoped while bot stop talking in chat mode fix possible `slot_id` not exist response for cors (and pre flight) * oai proxy: workaround for some client (such as Chatbox) * use stop as separator to replace hardcoded `\n` --- 
examples/server/api_like_OAI.py | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py index 313e1a9652d14a..830c056d4acfc6 100755 --- a/examples/server/api_like_OAI.py +++ b/examples/server/api_like_OAI.py @@ -11,10 +11,10 @@ slot_id = -1 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") -parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n') -parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ") -parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ") -parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ") +parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. 
The assistant follows the given rules no matter what.') +parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ") +parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ") +parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ") parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080') parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") @@ -34,19 +34,19 @@ def is_present(json, key): #convert chat to prompt def convert_chat(messages): - prompt = "" + args.chat_prompt.replace("\\n", "\n") - system_n = args.system_name.replace("\\n", "\n") - user_n = args.user_name.replace("\\n", "\n") - ai_n = args.ai_name.replace("\\n", "\n") - stop = args.stop.replace("\\n", "\n") + system_n = args.system_name + user_n = args.user_name + ai_n = args.ai_name + stop = args.stop + prompt = "" + args.chat_prompt + stop for line in messages: if (line["role"] == "system"): - prompt += f"{system_n}{line['content']}" + prompt += f"{system_n}{line['content']}{stop}" if (line["role"] == "user"): - prompt += f"{user_n}{line['content']}" + prompt += f"{user_n}{line['content']}{stop}" if (line["role"] == "assistant"): prompt += f"{ai_n}{line['content']}{stop}" prompt += ai_n.rstrip() @@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False): } ] } - slot_id = data["slot_id"] + slot_id = data.get("slot_id") if (chat): if (start): resData["choices"][0]["delta"] = { @@ -150,11 +150,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False): return 
resData -@app.route('/chat/completions', methods=['POST']) -@app.route('/v1/chat/completions', methods=['POST']) +@app.route('/chat/completions', methods=['POST', 'OPTIONS']) +@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS']) def chat_completions(): if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): return Response(status=403) + if request.method == 'OPTIONS': + return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) body = request.get_json() stream = False tokenize = False @@ -177,20 +179,22 @@ def generate(): data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) time_now = int(time.time()) resData = make_resData_stream({}, chat=True, time_now=time_now, start=True) - yield 'data: {}\n'.format(json.dumps(resData)) + yield 'data: {}\n\n'.format(json.dumps(resData)) for line in data.iter_lines(): if line: decoded_line = line.decode('utf-8') resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now) - yield 'data: {}\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream') + yield 'data: {}\n\n'.format(json.dumps(resData)) + return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) -@app.route('/completions', methods=['POST']) -@app.route('/v1/completions', methods=['POST']) +@app.route('/completions', methods=['POST', 'OPTIONS']) +@app.route('/v1/completions', methods=['POST', 'OPTIONS']) def completion(): if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): return Response(status=403) + if request.method == 'OPTIONS': + return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) body = request.get_json() stream = False tokenize = False @@ -216,8 +220,8 @@ def generate(): if line: 
decoded_line = line.decode('utf-8') resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) - yield 'data: {}\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream') + yield 'data: {}\n\n'.format(json.dumps(resData)) + return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) if __name__ == '__main__': app.run(args.host, port=args.port) From 954e22858c5cea1dc03e9172d3879402af2b5990 Mon Sep 17 00:00:00 2001 From: tarcey Date: Thu, 30 Nov 2023 22:40:23 +0100 Subject: [PATCH 14/27] llama : fix typical sampling (#4261) Typical sampling was broken because after copying new_candidates into canditates, the "sorted" bool is left at "true", but the new data is no longer sorted according to probability. Patch to set "sorted" to false. Test: Generating with temp=0.0001 (approx. argmax) should generate the same sequence at typical>=1.0 and typical=0.9999 (approx. disabled, but enters the typical sampling codepath). 
--- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index cb544228b9f021..4af4506157842b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7027,6 +7027,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c // Replace the data in candidates with the new_candidates data std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); candidates->size = new_candidates.size(); + candidates->sorted = false; if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; From f4d973cecb7368c985720ba9100ae6abba14806d Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 Nov 2023 22:42:23 +0100 Subject: [PATCH 15/27] convert.py : fix llama/llama2 conversion due to vocab_size=-1 (#4258) --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 3ad836ce0ec1df..6e95d6cb37e795 100755 --- a/convert.py +++ b/convert.py @@ -267,7 +267,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_ctx = 2048 return Params( - n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), + n_vocab = model["tok_embeddings.weight"].shape[0], n_embd = config["dim"], n_layer = config["n_layers"], n_ctx = n_ctx, From b18c66ca6eee4fe0465cff5042daf05005dc9ab2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 30 Nov 2023 22:43:08 +0100 Subject: [PATCH 16/27] llama : fix alignment of general.name in print meta (#4254) * llama: fix alignment of general.name in print meta This commit fixes the alignment of the general.name field in the llm_load_print_meta function. 
Currently the output looks like this: ```console llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model params = 13.02 B llm_load_print_meta: model size = 6.86 GiB (4.53 BPW) llm_load_print_meta: general.name = LLaMA v2 ``` And with this commit it looks like this: ```console llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model params = 13.02 B llm_load_print_meta: model size = 6.86 GiB (4.53 BPW) llm_load_print_meta: general.name = LLaMA v2 ``` Signed-off-by: Daniel Bevenius * llama: fix alignment of special tokens Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- llama.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4af4506157842b..26754ef7246260 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2645,15 +2645,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) 
{ LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } static void llm_load_tensors( From 74daabae6927b99e7333d6126dee35193c418457 Mon Sep 17 00:00:00 2001 From: Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:43:32 +0100 Subject: [PATCH 17/27] readme : fix typo (#4253) llama.cpp uses GitHub Actions, not Gitlab Actions. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 44cc940930e898..b89ba73aa359d1 100644 --- a/README.md +++ b/README.md @@ -896,7 +896,7 @@ Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. 
(platforms: `linux/amd64`, `linux/arm64`) -The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). +The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). #### Usage From f7f9e06212d44530b3200033286049dbdf84b3d3 Mon Sep 17 00:00:00 2001 From: Li Tan Date: Thu, 30 Nov 2023 13:44:11 -0800 Subject: [PATCH 18/27] cmake : fix the metal file foder path (#4217) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e0009415a13f1..6f35a25d5221d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,7 +165,7 @@ if (LLAMA_METAL) #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") # copy ggml-metal.metal to bin directory - configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} From bde629bb53b85886ee0fe83524c1efe2689bc618 Mon Sep 17 00:00:00 2001 From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Date: Fri, 1 Dec 2023 06:45:17 +0900 Subject: [PATCH 19/27] batched.swift : update README.md (#4214) docs: update how to run --- examples/batched.swift/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 464c9079c46608..4c2721fe85b005 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. $ `make` -$ `./swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]` From 3bd2c7ce1b752973cf937482a0333e85d1681e2b Mon Sep 17 00:00:00 2001 From: Juraj Bednar Date: Thu, 30 Nov 2023 22:46:01 +0100 Subject: [PATCH 20/27] docker : add finetune option (#4211) --- .devops/tools.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.devops/tools.sh b/.devops/tools.sh index 9d999315f3887c..3a7d274e466191 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./main "$@" +elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then + ./finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -34,6 +36,8 @@ else echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --finetune (-f): Run finetune command to create a lora finetune of the model" + echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" From 524907aa768a26cbf83d8e2eb30547e2ee1d1b1a Mon Sep 17 00:00:00 2001 From: vodkaslime <646329483@qq.com> Date: Fri, 1 Dec 2023 05:49:21 +0800 Subject: [PATCH 21/27] readme : fix (#4135) * fix: readme * chore: resolve comments * chore: resolve comments --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b89ba73aa359d1..dac971ae5dfe56 100644 --- a/README.md +++ b/README.md @@ -324,7 +324,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 ### BLAS Build -Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: +Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. 
There are currently several different BLAS implementations available for build and use: - #### Accelerate Framework: From 8efa0f6ebed53c9453e6721da86fb294e5015909 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Thu, 30 Nov 2023 13:56:19 -0800 Subject: [PATCH 22/27] main : pass LOG_TEE callback to llama.cpp log (#4033) * main : Call llama_log_set to use LOG_TEE * tabs to spaces --- examples/main/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be1..c5cdfbf21b9547 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -100,6 +100,12 @@ static void sigint_handler(int signo) { } #endif +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + int main(int argc, char ** argv) { gpt_params params; g_params = &params; @@ -113,6 +119,7 @@ int main(int argc, char ** argv) { log_set_target(log_filename_generator("main", "log")); LOG_TEE("Log start\n"); log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); #endif // LOG_DISABLE_LOGS // TODO: Dump params ? 
From 33c9892af58b7b161f2a532935dcccff8c8048c6 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:11:14 +0100 Subject: [PATCH 23/27] llava : ShareGPT4V compatibility (vision encoder only loading) (#4172) * ShareGPT4 compatibility (vision encoder only loading) Load only a CLIP vision encoder (as supplied by ShareGPT finetunes) Corrects the argument parsing for --img_mean and --img_std (which were previously not parsed but attempted to access) Defines defaults for img_mean and img_std which are equal to the llava 1.5 CLIP encoder, so you do not have to provide them * Update convert-image-encoder-to-gguf.py --- .../llava/convert-image-encoder-to-gguf.py | 52 +++++++++++++------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 2f5eef19919558..729aaef8f0fd21 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -5,7 +5,7 @@ import torch import numpy as np from gguf import * -from transformers import CLIPModel, CLIPProcessor +from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel TEXT = "clip.text" VISION = "clip.vision" @@ -78,11 +78,19 @@ def bytes_to_unicode(): help="Save a text-only model. It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip_model_is_vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") ap.add_argument("--llava-projector", help="Path to llava.projector file. 
If specified, save an image encoder for LLaVA models.") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +# with proper args = ap.parse_args() @@ -96,15 +104,22 @@ def bytes_to_unicode(): # output in the same directory as the model if output_dir is None dir_model = args.model_dir - -with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] +if args.clip_model_is_vision: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] with open(dir_model + "/config.json", "r", encoding="utf-8") as f: config = json.load(f) - v_hparams = config["vision_config"] - t_hparams = config["text_config"] + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] # possible data types # ftype == 0 -> float32 @@ -117,9 +132,12 @@ def bytes_to_unicode(): if args.use_f32: ftype = 0 - -model = CLIPModel.from_pretrained(dir_model) -processor = CLIPProcessor.from_pretrained(dir_model) +if args.clip_model_is_vision: + model = 
CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) fname_middle = None has_text_encoder = True @@ -128,13 +146,13 @@ def bytes_to_unicode(): if args.text_only: fname_middle = "text-" has_vision_encoder = False -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False elif args.llava_projector is not None: fname_middle = "mmproj-" has_text_encoder = False has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False else: fname_middle = "" @@ -182,8 +200,12 @@ def bytes_to_unicode(): block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) - image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean - image_std = processor.image_processor.image_std if args.image_std is None else args.image_std + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std fout.add_array("clip.vision.image_mean", image_mean) fout.add_array("clip.vision.image_std", image_std) From 15f5d96037e597523b721aa39c874d69de2acf85 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 30 Nov 2023 17:23:08 -0500 Subject: [PATCH 24/27] build : fix build info generation and cleanup Makefile (#3920) * cmake : fix joining of REAL_GIT_DIR * fix includes with help from include-what-you-use * make : remove unneeded deps and add test-rope target * fix C includes in 
C++ source files * Revert "fix includes with help from include-what-you-use" This reverts commit 635e9fadfd516d4604a0fecf4a854bfb25ad17ae. --- .gitignore | 25 +++++++++++++------------ Makefile | 23 +++++++++++++---------- common/CMakeLists.txt | 7 ++++++- ggml-opencl.cpp | 12 +++++------- llama.cpp | 1 - 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 3806e05ddcc126..58c4839940b498 100644 --- a/.gitignore +++ b/.gitignore @@ -88,15 +88,16 @@ poetry.lock poetry.toml # Test binaries -tests/test-grammar-parser -tests/test-llama-grammar -tests/test-double-float -tests/test-grad0 -tests/test-opt -tests/test-quantize-fns -tests/test-quantize-perf -tests/test-sampling -tests/test-tokenizer-0-llama -tests/test-tokenizer-0-falcon -tests/test-tokenizer-1-llama -tests/test-tokenizer-1-bpe +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0-llama +/tests/test-tokenizer-0-falcon +/tests/test-tokenizer-1-llama +/tests/test-tokenizer-1-bpe +/tests/test-rope diff --git a/Makefile b/Makefile index 95d85236f8f241..22132ae2378968 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ - tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe + tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -648,7 +648,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) finetune: examples/finetune/finetune.cpp 
ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) @@ -701,28 +701,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) 
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) @@ -737,5 +737,8 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + tests/test-c.o: tests/test-c.c llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 71891edc3cc1f0..b5d5453d2d357f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -11,7 +11,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") if(NOT IS_DIRECTORY "${GIT_DIR}") file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}") + string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS) + if (SLASH_POS EQUAL 0) + set(GIT_DIR "${REAL_GIT_DIR}") + else() + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}") + endif() endif() set(GIT_INDEX "${GIT_DIR}/index") diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 202bcb4853893c..496f9cdca542d3 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,20 +1,18 @@ +#include "ggml.h" #include "ggml-opencl.h" #include #include +#include +#include +#include +#include #include #include -#include #define CL_TARGET_OPENCL_VERSION 110 #include -#include -#include -#include - -#include "ggml.h" - #if defined(_MSC_VER) #pragma 
warning(disable: 4244 4267) // possible loss of data #endif diff --git a/llama.cpp b/llama.cpp index 26754ef7246260..1e00ea4a97870d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -46,7 +46,6 @@ #endif #include #include - #include // for _fseeki64 #endif #include From d2809a3ba2780e00fce5a6149a7eda09f1c0e906 Mon Sep 17 00:00:00 2001 From: WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> Date: Thu, 30 Nov 2023 17:23:44 -0500 Subject: [PATCH 25/27] make : fix Apple clang determination bug (#4272) Co-authored-by: Will Findley --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 22132ae2378968..25b113e0a93069 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))' CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }') else CC_IS_CLANG=1 - ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))' + ifeq '' '$(findstring Apple,$(shell $(CC) --version))' CC_IS_LLVM_CLANG=1 else CC_IS_APPLE_CLANG=1 From f43f09366dfd018e4568e23a232aaa8c4f7cfc78 Mon Sep 17 00:00:00 2001 From: Ziad Ben Hadj-Alouane Date: Thu, 30 Nov 2023 17:25:04 -0500 Subject: [PATCH 26/27] server : add single-client multi-prompt support (#4232) * * add multiprompt support * * cleanup * * more cleanup * * remove atomicity of id_gen, and change lock_guard to unique_lock on completion requests * * remove all references to mutex_multitasks * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel * * change to set --------- Co-authored-by: Jared Van Bortel --- examples/server/server.cpp | 139 ++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index 50f124b13e8492..5edb3678efe092 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -155,15 +155,23 @@ struct task_server { json data; bool infill_mode = false; bool embedding_mode = false; + int multitask_id = -1; }; struct task_result { int id; + int multitask_id = -1; bool stop; bool error; json result_json; }; +struct task_multi { + int id; + std::set subtasks_remaining{}; + std::vector results{}; +}; + // TODO: can become bool if we can't find use of more states enum slot_state { @@ -406,6 +414,9 @@ struct llama_client_slot double t_prompt_processing; // ms double t_token_generation; // ms + // multitasks + int multitask_id = -1; + void reset() { num_prompt_tokens = 0; generated_text = ""; @@ -529,7 +540,8 @@ struct llama_server_context std::vector queue_tasks; std::vector queue_results; - std::mutex mutex_tasks; + std::vector queue_multitasks; + std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks std::mutex mutex_results; ~llama_server_context() @@ -1112,17 +1124,40 @@ struct llama_server_context return slot.images.size() > 0; } - void send_error(int id, std::string error) + void send_error(task_server& task, std::string error) { std::lock_guard lock(mutex_results); task_result res; - res.id = id; + res.id = task.id; + res.multitask_id = task.multitask_id; res.stop = false; res.error = true; res.result_json = { { "content", error } }; queue_results.push_back(res); } + void add_multi_task(int id, std::vector& sub_ids) + { + std::lock_guard lock(mutex_tasks); + task_multi multi; + multi.id = id; + std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); + queue_multitasks.push_back(multi); + } + + void update_multi_task(int multitask_id, int subtask_id, task_result& result) + { + std::lock_guard lock(mutex_tasks); + for (auto& multitask : queue_multitasks) + { + if (multitask.id == multitask_id) + { + 
multitask.subtasks_remaining.erase(subtask_id); + multitask.results.push_back(result); + } + } + } + json get_model_props() { return get_formated_generation(slots[0]); @@ -1167,6 +1202,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = slot.multitask_id; res.error = false; res.stop = false; @@ -1206,6 +1242,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = slot.multitask_id; res.error = false; res.stop = true; @@ -1251,6 +1288,12 @@ struct llama_server_context res.result_json["model"] = slot.oaicompat_model; } + // parent multitask, if any, needs to be updated + if (slot.multitask_id != -1) + { + update_multi_task(slot.multitask_id, slot.task_id, res); + } + queue_results.push_back(res); } @@ -1259,6 +1302,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = slot.multitask_id; res.error = false; res.stop = true; @@ -1285,9 +1329,9 @@ struct llama_server_context queue_results.push_back(res); } - int request_completion(json data, bool infill, bool embedding) + int request_completion(json data, bool infill, bool embedding, int multitask_id) { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; task.target_id = 0; @@ -1295,6 +1339,16 @@ struct llama_server_context task.infill_mode = infill; task.embedding_mode = embedding; task.type = COMPLETION_TASK; + task.multitask_id = multitask_id; + + // when a completion task's prompt array is not a singleton, we split it into multiple requests + if (task.data.at("prompt").size() > 1) + { + lock.unlock(); // entering new func scope + return split_multiprompt_task(task); + } + + // otherwise, it's a single-prompt task, we actually queue it queue_tasks.push_back(task); return task.id; } @@ -1313,8 +1367,17 @@ struct llama_server_context 
for (int i = 0; i < (int) queue_results.size(); i++) { + // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result + if (queue_results[i].multitask_id == task_id) + { + update_multi_task(task_id, queue_results[i].id, queue_results[i]); + queue_results.erase(queue_results.begin() + i); + continue; + } + if (queue_results[i].id == task_id) { + assert(queue_results[i].multitask_id == -1); task_result res = queue_results[i]; queue_results.erase(queue_results.begin() + i); return res; @@ -1404,6 +1467,27 @@ struct llama_server_context queue_tasks.push_back(task); } + int split_multiprompt_task(task_server& multiprompt_task) + { + auto prompt_count = multiprompt_task.data.at("prompt").size(); + assert(prompt_count > 1); + + int multitask_id = id_gen++; + std::vector subtask_ids(prompt_count); + for (int i = 0; i < prompt_count; i++) + { + json subtask_data = multiprompt_task.data; + subtask_data["prompt"] = subtask_data["prompt"][i]; + + // subtasks inherit everything else (infill mode, embedding mode, etc.) 
+ subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id); + } + + // queue up the multitask so we can track its subtask progression + add_multi_task(multitask_id, subtask_ids); + return multitask_id; + } + void process_tasks() { std::lock_guard lock(mutex_tasks); @@ -1419,7 +1503,7 @@ struct llama_server_context { LOG_TEE("slot unavailable\n"); // send error result - send_error(task.id, "slot unavailable"); + send_error(task, "slot unavailable"); return; } @@ -1433,11 +1517,12 @@ struct llama_server_context slot->infill = task.infill_mode; slot->embedding = task.embedding_mode; slot->task_id = task.id; + slot->multitask_id = task.multitask_id; if (!launch_slot_with_data(slot, task.data)) { // send error result - send_error(task.id, "internal_error"); + send_error(task, "internal_error"); break; } } break; @@ -1453,6 +1538,38 @@ struct llama_server_context } break; } } + + // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue + auto queue_iterator = queue_multitasks.begin(); + while (queue_iterator != queue_multitasks.end()) + { + if (queue_iterator->subtasks_remaining.empty()) + { + // all subtasks done == multitask is done + task_result aggregate_result; + aggregate_result.id = queue_iterator->id; + aggregate_result.stop = true; + aggregate_result.error = false; + + // collect json results into one json result + std::vector result_jsons; + for (auto& subres : queue_iterator->results) + { + result_jsons.push_back(subres.result_json); + aggregate_result.error = aggregate_result.error && subres.error; + } + aggregate_result.result_json = json{ "results", result_jsons }; + + std::lock_guard lock(mutex_results); + queue_results.push_back(aggregate_result); + + queue_iterator = queue_multitasks.erase(queue_iterator); + } + else + { + ++queue_iterator; + } + } } bool update_slots() { @@ -2596,7 +2713,7 @@ int main(int argc, char **argv) 
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); - const int task_id = llama.request_completion(data, false, false); + const int task_id = llama.request_completion(data, false, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -2685,7 +2802,7 @@ int main(int argc, char **argv) { json data = oaicompat_completion_params_parse(json::parse(req.body)); - const int task_id = llama.request_completion(data, false, false); + const int task_id = llama.request_completion(data, false, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; @@ -2754,7 +2871,7 @@ int main(int argc, char **argv) svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); - const int task_id = llama.request_completion(data, true, false); + const int task_id = llama.request_completion(data, true, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -2858,7 +2975,7 @@ int main(int argc, char **argv) { prompt = ""; } - const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true); + const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); task_result result = llama.next_result(task_id); return res.set_content(result.result_json.dump(), "application/json"); }); From 1d144112c0fbbb4ecc07dbcf4f05a380148bd6de Mon Sep 17 00:00:00 2001 From: Ziad Ben Hadj-Alouane Date: Thu, 30 Nov 2023 17:25:49 -0500 Subject: [PATCH 27/27] server : add --log-disable to disable logging to file (#4260) * * add --log-disable to disable logging to file in the server example * * typo fix --- examples/server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index 5edb3678efe092..a65344b9207f43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1961,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params, printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf(" --log-disable disables logging to a file.\n"); printf("\n"); } @@ -2315,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.mmproj = argv[i]; } + else if (arg == "--log-disable") + { + log_set_target(stdout); + LOG_INFO("logging to file is disabled.", {}); + } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());