diff --git a/ggml-backend.h b/ggml-backend.h
index f13c69bffb98c..c097cbcbe0cd8 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -80,7 +80,6 @@ extern "C" {
     //
     // CPU backend
    //
-
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
diff --git a/llama.cpp b/llama.cpp
index 28430254f698f..e1ac2600d595f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1948,6 +1948,9 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
 
+    ggml_abort_callback abort_callback      = nullptr;
+    void *              abort_callback_data = nullptr;
+
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
     ggml_context * ctx_input = nullptr;
@@ -7847,6 +7850,7 @@ static void llama_graph_compute(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 
     ggml_backend_sched_graph_compute(lctx.sched, gf);
@@ -11644,6 +11648,8 @@ struct llama_context_params llama_context_default_params() {
        /*.embedding            =*/ false,
        /*.offload_kqv          =*/ true,
        /*.do_pooling           =*/ true,
+        /*.abort_callback       =*/ nullptr,
+        /*.abort_callback_data  =*/ nullptr,
     };
 
     return result;
@@ -11835,8 +11841,11 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",   __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",     __func__, cparams.rope_freq_scale);
 
-    ctx->rng        = std::mt19937(params.seed);
-    ctx->logits_all = params.logits_all;
+    ctx->abort_callback      = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng                 = std::mt19937(params.seed);
+    ctx->logits_all          = params.logits_all;
 
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;
@@ -12809,6 +12818,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
                  int32_t   n_tokens,
diff --git a/llama.h b/llama.h
index ff131996d9a38..d1ebcdeaff5fd 100644
--- a/llama.h
+++ b/llama.h
@@ -256,6 +256,9 @@ extern "C" {
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     // model quantization parameters
@@ -661,6 +664,9 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
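
Usage sketch (not part of the patch): one way an application could drive the new abort
mechanism, either by filling the new llama_context_params fields before context creation
or by installing a callback later with llama_set_abort_callback(). Only abort_callback,
abort_callback_data and llama_set_abort_callback come from this diff; g_stop, should_abort
and configure_abort are illustrative names.

#include <atomic>
#include "llama.h"

// Set from elsewhere (e.g. a SIGINT handler or a UI thread) to request cancellation.
static std::atomic<bool> g_stop{false};

// Matches the ggml_abort_callback signature bool(void *): returning true asks the
// CPU backend to abort the in-flight graph computation.
static bool should_abort(void * /*user_data*/) {
    return g_stop.load(std::memory_order_relaxed);
}

static void configure_abort(struct llama_context * ctx) {
    // Option A: pass the callback at context creation time.
    //   llama_context_params cparams = llama_context_default_params();
    //   cparams.abort_callback      = should_abort;
    //   cparams.abort_callback_data = nullptr;
    //   ctx = llama_new_context_with_model(model, cparams);

    // Option B: install (or swap) the callback on an existing context.
    llama_set_abort_callback(ctx, should_abort, /*abort_callback_data=*/nullptr);
}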
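
A second sketch showing abort_callback_data carrying per-context state, here a wall-clock
deadline, so the callback stays a plain function with no globals. deadline_state,
deadline_abort and limit_compute_time are illustrative names, not part of the patch.

#include <chrono>
#include "llama.h"

struct deadline_state {
    std::chrono::steady_clock::time_point deadline;
};

// The data pointer passed to llama_set_abort_callback() is handed back as the
// callback's argument on every check.
static bool deadline_abort(void * data) {
    const auto * st = static_cast<const deadline_state *>(data);
    return std::chrono::steady_clock::now() >= st->deadline;
}

// Give subsequent compute calls at most `budget` of wall-clock time before aborting.
static void limit_compute_time(struct llama_context * ctx, deadline_state & st,
                               std::chrono::milliseconds budget) {
    st.deadline = std::chrono::steady_clock::now() + budget;
    llama_set_abort_callback(ctx, deadline_abort, &st);
}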