& pcmf32, std::vector
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
+ else if (is_wav_buffer(fname)) {
+ if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
+ fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
+ return false;
+ }
+ }
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
diff --git a/examples/common.h b/examples/common.h
index 005137107d5..b7050482394 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -135,7 +135,11 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
// Audio utils
//
+// Check if a buffer is a WAV audio file
+bool is_wav_buffer(const std::string buf);
+
// Read WAV audio file and store the PCM data into pcmf32
+// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
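A minimal usage sketch of the updated helper (it links against `examples/common.cpp`). The rest of the `read_wav` signature is assumed from the comments above, since the hunk cuts the declaration short; the file name and the mono/stereo flag are illustrative only:

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// declarations from examples/common.h; the read_wav parameter order is assumed
bool is_wav_buffer(const std::string buf);
bool read_wav(const std::string & fname, std::vector<float> & pcmf32,
              std::vector<std::vector<float>> & pcmf32s, bool stereo);

int main() {
    // load a WAV file fully into memory, e.g. as received in an HTTP request body
    std::ifstream f("sample.wav", std::ios::binary);
    std::stringstream ss;
    ss << f.rdbuf();
    const std::string wav_bytes = ss.str();

    std::vector<float> pcmf32;                 // mono PCM
    std::vector<std::vector<float>> pcmf32s;   // per-channel PCM when stereo == true

    if (is_wav_buffer(wav_bytes)) {
        // same call as with a filename -- the buffer is detected and parsed in memory
        read_wav(wav_bytes, pcmf32, pcmf32s, false);
    }
    return 0;
}
```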
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8b6e4695259..69c04bf3a0a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -18,7 +18,7 @@
#endif
using namespace httplib;
-using json = nlohmann::json;
+using json = nlohmann::ordered_json;
namespace {
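The switch to `nlohmann::ordered_json` matters because plain `nlohmann::json` stores object keys in sorted order, while `ordered_json` preserves insertion order, so the server's responses keep their fields in the order the code emits them. A small standalone sketch (assumes only the nlohmann/json single header):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::json         a = { {"task", "transcribe"}, {"language", "en"} };
    nlohmann::ordered_json b = { {"task", "transcribe"}, {"language", "en"} };

    std::cout << a.dump() << "\n"; // {"language":"en","task":"transcribe"}  (keys sorted)
    std::cout << b.dump() << "\n"; // {"task":"transcribe","language":"en"}  (insertion order)
    return 0;
}
```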
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});
- std::string const default_content = "hello";
+    std::string const default_content = R"(
+    <html>
+    <head>
+        <title>Whisper.cpp Server</title>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width">
+    </head>
+    <body>
+        <h1>Whisper.cpp Server</h1>
+
+        <h2>/inference</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
+    -H "Content-Type: multipart/form-data" \
+    -F file="@<file-path>" \
+    -F temperature="0.0" \
+    -F temperature_inc="0.2" \
+    -F response_format="json"
+        </pre>
+
+        <h2>/load</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
+    -H "Content-Type: multipart/form-data" \
+    -F model="<path-to-model-file>"
+        </pre>
+
+        <h2>Try it out</h2>
+    </body>
+    </html>
+    )";
// store default params so we can reset after each inference request
whisper_params default_params = params;
@@ -556,7 +625,7 @@ int main(int argc, char ** argv) {
svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock
- whisper_mutex.lock();
+        std::lock_guard<std::mutex> lock(whisper_mutex);
// first check user requested fields of the request
if (!req.has_file("file"))
@@ -564,7 +633,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: no 'file' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
res.set_content(error_resp, "application/json");
- whisper_mutex.unlock();
return;
}
auto audio_file = req.get_file_value("file");
@@ -579,35 +647,42 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32;               // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
- // write to temporary file
- const std::string temp_filename = "whisper_server_temp_file.wav";
- std::ofstream temp_file{temp_filename, std::ios::binary};
- temp_file << audio_file.content;
- temp_file.close();
-
- // if file is not wav, convert to wav
-
if (sparams.ffmpeg_converter) {
+ // if file is not wav, convert to wav
+ // write to temporary file
+ const std::string temp_filename = "whisper_server_temp_file.wav";
+ std::ofstream temp_file{temp_filename, std::ios::binary};
+ temp_file << audio_file.content;
+ temp_file.close();
+
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
const bool is_converted = convert_to_wav(temp_filename, error_resp);
if (!is_converted) {
res.set_content(error_resp, "application/json");
- whisper_mutex.unlock();
return;
}
- }
- // read wav content into pcmf32
- if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
- fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
- const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
- res.set_content(error_resp, "application/json");
+ // read wav content into pcmf32
+ if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
+ {
+ fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+ const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+ res.set_content(error_resp, "application/json");
+ std::remove(temp_filename.c_str());
+ return;
+ }
+ // remove temp file
std::remove(temp_filename.c_str());
- whisper_mutex.unlock();
- return;
+ } else {
+ if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
+ {
+ fprintf(stderr, "error: failed to read WAV file\n");
+ const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+ res.set_content(error_resp, "application/json");
+ return;
+ }
}
- // remove temp file
- std::remove(temp_filename.c_str());
+
printf("Successfully loaded %s\n", filename.c_str());
@@ -681,6 +756,7 @@ int main(int argc, char ** argv) {
wparams.logprob_thold = params.logprob_thold;
wparams.no_timestamps = params.no_timestamps;
+ wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
@@ -724,7 +800,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
res.set_content(error_resp, "application/json");
- whisper_mutex.unlock();
return;
}
}
@@ -778,6 +853,59 @@ int main(int argc, char ** argv) {
ss << speaker << text << "\n\n";
}
res.set_content(ss.str(), "text/vtt");
+ } else if (params.response_format == vjson_format) {
+ /* try to match openai/whisper's Python format */
+ std::string results = output_str(ctx, params, pcmf32s);
+ json jres = json{
+ {"task", params.translate ? "translate" : "transcribe"},
+ {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+ {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+ {"text", results},
+ {"segments", json::array()}
+ };
+ const int n_segments = whisper_full_n_segments(ctx);
+ for (int i = 0; i < n_segments; ++i)
+ {
+ json segment = json{
+ {"id", i},
+ {"text", whisper_full_get_segment_text(ctx, i)},
+ };
+
+ if (!params.no_timestamps) {
+ segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
+ segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
+ }
+
+ float total_logprob = 0;
+ const int n_tokens = whisper_full_n_tokens(ctx, i);
+ for (int j = 0; j < n_tokens; ++j) {
+ whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+ if (token.id >= whisper_token_eot(ctx)) {
+ continue;
+ }
+
+ segment["tokens"].push_back(token.id);
+ json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
+ if (!params.no_timestamps) {
+ word["start"] = token.t0 * 0.01;
+ word["end"] = token.t1 * 0.01;
+ }
+ word["probability"] = token.p;
+ total_logprob += token.plog;
+ segment["words"].push_back(word);
+ }
+
+ segment["temperature"] = params.temperature;
+ segment["avg_logprob"] = total_logprob / n_tokens;
+
+ // TODO compression_ratio and no_speech_prob are not implemented yet
+ // segment["compression_ratio"] = 0;
+ // segment["no_speech_prob"] = 0;
+
+ jres["segments"].push_back(segment);
+ }
+ res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
+ "application/json");
}
// TODO add more output formats
else
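For reference, a hedged client-side sketch of consuming the `verbose_json` output built above; the field names mirror this hunk, while the concrete values and the parsing code are illustrative:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    // a minimal response of the shape produced by the vjson_format branch
    const std::string body = R"({
        "task": "transcribe",
        "language": "english",
        "duration": 3.5,
        "text": "hello world",
        "segments": [
            { "id": 0, "text": "hello world", "start": 0.0, "end": 3.5,
              "tokens": [123, 456],
              "words": [ { "word": "hello", "start": 0.0, "end": 1.2, "probability": 0.98 } ],
              "temperature": 0.0, "avg_logprob": -0.05 }
        ]
    })";

    const auto res = nlohmann::json::parse(body);
    std::cout << res["language"] << " " << res["duration"] << "\n";
    for (const auto & seg : res["segments"]) {
        std::cout << seg["start"] << "-" << seg["end"] << ": " << seg["text"] << "\n";
    }
    return 0;
}
```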
@@ -792,18 +920,14 @@ int main(int argc, char ** argv) {
// reset params to their defaults
params = default_params;
-
- // return whisper model mutex lock
- whisper_mutex.unlock();
});
svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
- whisper_mutex.lock();
+        std::lock_guard<std::mutex> lock(whisper_mutex);
if (!req.has_file("model"))
{
fprintf(stderr, "error: no 'model' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
res.set_content(error_resp, "application/json");
- whisper_mutex.unlock();
return;
}
std::string model = req.get_file_value("model").content;
@@ -812,7 +936,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
const std::string error_resp = "{\"error\":\"model not found!\"}";
res.set_content(error_resp, "application/json");
- whisper_mutex.unlock();
return;
}
@@ -835,7 +958,6 @@ int main(int argc, char ** argv) {
res.set_content(success, "application/text");
// check if the model is in the file system
- whisper_mutex.unlock();
});
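The handlers above replace manual `lock`/`unlock` pairs with `std::lock_guard`, so every early `return` releases the mutex automatically. A standalone sketch of the pattern (names are illustrative, not from the patch):

```cpp
#include <mutex>

std::mutex whisper_mutex;

bool handle_request(bool bad_input) {
    std::lock_guard<std::mutex> lock(whisper_mutex); // acquired here
    if (bad_input) {
        return false;  // mutex released automatically, no explicit unlock needed
    }
    // ... run inference while holding the lock ...
    return true;       // released automatically here as well
}
```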
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
diff --git a/examples/stream/README.md b/examples/stream/README.md
index 124e7a6d779..eeae3277813 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -4,7 +4,7 @@ This is a naive example of performing real-time inference on audio from your mic
The `stream` tool samples the audio every half a second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
-```java
+```bash
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
@@ -14,7 +14,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
Setting the `--step` argument to `0` enables the sliding window mode:
-```java
+```bash
./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```
@@ -39,8 +39,8 @@ brew install sdl2
make stream
```
-Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
-as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
+Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
+as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
`examples/stream` means your compiler cannot find them and it gives an error that it cannot find the file.
```bash
diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp
index 7af38718c41..f7d054c577a 100644
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
@@ -11,6 +11,10 @@
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL
@@ -52,6 +56,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -192,8 +197,11 @@ enum llm_arch {
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
+ LLM_ARCH_QWEN2,
LLM_ARCH_PHI2,
LLM_ARCH_PLAMO,
+ LLM_ARCH_CODESHELL,
+ LLM_ARCH_ORION,
LLM_ARCH_UNKNOWN,
};
@@ -211,8 +219,11 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
};
enum llm_kv {
@@ -566,6 +577,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_QWEN2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_PHI2,
{
@@ -600,6 +628,45 @@ static std::map> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_CODESHELL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_ORION,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
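The `blk.%d.*` patterns registered above for the new architectures are expanded per layer when tensors are created (e.g. `tn(LLM_TENSOR_ATTN_Q, "weight", i)` further down in the patch). A standalone illustration of that expansion, not the patch's own helper:

```cpp
#include <cstdio>
#include <string>

// hypothetical helper: format a "blk.%d.*" pattern for layer il and append a suffix
static std::string tensor_name(const char * fmt, const char * suffix, int il) {
    char base[256];
    std::snprintf(base, sizeof(base), fmt, il);   // "blk.%d.attn_q" -> "blk.7.attn_q"
    return std::string(base) + "." + suffix;      // -> "blk.7.attn_q.weight"
}

int main() {
    std::printf("%s\n", tensor_name("blk.%d.attn_q",   "weight", 7).c_str());
    std::printf("%s\n", tensor_name("blk.%d.ffn_down", "weight", 7).c_str());
    return 0;
}
```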
@@ -1215,8 +1282,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
}
+#elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+ if (host_buffer) {
+ buft = ggml_backend_vk_host_buffer_type();
+ }
#endif
if (buft == nullptr) {
@@ -1234,6 +1307,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS)
buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#endif
@@ -1284,11 +1361,14 @@ static llama_state g_state;
// available llama models
enum e_model {
MODEL_UNKNOWN,
+ MODEL_0_5B,
MODEL_1B,
MODEL_3B,
+ MODEL_4B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
+ MODEL_14B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
@@ -1393,6 +1473,9 @@ struct llama_cparams {
bool mul_mat_q;
bool offload_kqv;
+
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
};
struct llama_layer {
@@ -1596,7 +1679,7 @@ struct llama_model {
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
- llama_mlock mlock_buf;
+    std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
llama_mlock mlock_mmap;
// for quantize-stats only
@@ -1623,6 +1706,9 @@ struct llama_context {
for (ggml_backend_t backend : backends) {
ggml_backend_free(backend);
}
+
+ ggml_backend_buffer_free(buf_input);
+ ggml_free(ctx_input);
}
llama_cparams cparams;
@@ -1669,8 +1755,14 @@ struct llama_context {
// allocator for the input tensors
ggml_tallocr * alloc = nullptr;
- // temporary buffer for copying data to/from the backend
- std::vector> buf_copy;
+ // input tensors
+ ggml_backend_buffer_t buf_input = nullptr;
+ ggml_context * ctx_input = nullptr;
+ struct ggml_tensor * inp_tokens; // I32 [n_batch]
+ struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
+ struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+ struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
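The new `inp_*` members hold input tensors that are allocated once with the context; graph construction then takes views sized to the current batch instead of creating fresh tensors each time (see the `ggml_view_1d`/`ggml_view_2d` calls later in this patch). A minimal ggml sketch of that pattern, assuming only the public ggml API:

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // allocated once, sized for the largest batch the context will see
    const int n_batch = 512;
    struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_batch);

    // per graph build: a view over just the tokens of the current batch
    const int n_tokens = 32;
    struct ggml_tensor * view = ggml_view_1d(ctx, inp_tokens, n_tokens, 0);

    printf("view has %lld elements out of %lld allocated\n",
           (long long) view->ne[0], (long long) inp_tokens->ne[0]);

    ggml_free(ctx);
    return 0;
}
```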
@@ -2254,18 +2346,18 @@ struct llama_model_loader {
}
switch (type_max) {
- case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
- case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
- case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
- case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
- case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
- case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
- case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
- case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
- case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
- case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
- case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
- case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+ case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
+ case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
+ case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
+ case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
+ case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+ case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
+ case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
+ case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
+ case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
+ case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
+ case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
default:
@@ -2615,6 +2707,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
default: return "unknown, may not work";
}
@@ -2627,6 +2720,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_7B: return "7B";
case MODEL_8B: return "8B";
case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
case MODEL_15B: return "15B";
case MODEL_30B: return "30B";
case MODEL_34B: return "34B";
@@ -2830,6 +2924,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
@@ -2844,6 +2939,17 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_QWEN2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_PHI2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2874,7 +2980,23 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_CODESHELL:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 42: model.type = e_model::MODEL_SMALL; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_14B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
default: (void)0;
}
@@ -3435,7 +3557,12 @@ static bool llm_load_tensors(
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ } else {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+ ml.n_created--; // artificial tensor
+ }
}
for (int i = 0; i < n_layer; ++i) {
@@ -3629,6 +3756,11 @@ static bool llm_load_tensors(
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ // optional bias tensors, present in Stable LM 2 1.6B
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
@@ -3666,6 +3798,41 @@ static bool llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
}
} break;
+ case LLM_ARCH_QWEN2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
case LLM_ARCH_PHI2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -3776,6 +3943,74 @@ static bool llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
+ case LLM_ARCH_CODESHELL:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+
+
default:
throw std::runtime_error("unknown architecture");
}
@@ -3812,8 +4047,10 @@ static bool llm_load_tensors(
else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
- model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
+ model.mlock_bufs.emplace_back(new llama_mlock);
+ auto & mlock_buf = model.mlock_bufs.back();
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
}
}
if (buf == nullptr) {
@@ -3939,22 +4176,24 @@ static struct ggml_tensor * llm_build_inp_embd(
const llama_hparams & hparams,
const llama_batch & batch,
struct ggml_tensor * tok_embd,
+ struct ggml_tensor * inp_tokens,
+ struct ggml_tensor * inp_embd,
const llm_build_cb & cb) {
const int64_t n_embd = hparams.n_embd;
struct ggml_tensor * inpL;
if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+ struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
cb(inp_tokens, "inp_tokens", -1);
- inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
+ inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
- inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+ inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
}
return inpL;
@@ -3968,6 +4207,7 @@ static void llm_build_k_shift(
const llama_cparams & cparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
+ struct ggml_tensor * K_shift,
llm_rope_type type,
int64_t n_ctx,
float freq_base,
@@ -3984,9 +4224,6 @@ static void llm_build_k_shift(
const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow;
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
- cb(K_shift, "K_shift", -1);
-
int rope_type = 0;
switch (type) {
@@ -4174,6 +4411,7 @@ static struct ggml_tensor * llm_build_kqv(
const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
+ struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur,
@@ -4252,6 +4490,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
+ ggml_build_forward_expand(graph, cur);
+
cur = ggml_mul_mat(ctx, wo, cur);
if (wo_b) {
cb(cur, "kqv_wo", il);
@@ -4264,8 +4504,47 @@ static struct ggml_tensor * llm_build_kqv(
return cur;
}
+static struct ggml_tensor * llm_build_kv(
+ struct ggml_context * ctx,
+ const llama_model & model,
+ const llama_hparams & hparams,
+ const llama_kv_cache & kv,
+ struct ggml_cgraph * graph,
+ struct ggml_tensor * wo,
+ struct ggml_tensor * wo_b,
+ struct ggml_tensor * k_cur,
+ struct ggml_tensor * v_cur,
+ struct ggml_tensor * q_cur,
+ struct ggml_tensor * kq_mask,
+ int64_t n_ctx,
+ int32_t n_tokens,
+ int32_t kv_head,
+ int32_t n_kv,
+ float max_alibi_bias,
+ float kq_scale,
+ const llm_build_cb & cb,
+ int il) {
+
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(graph, q_cur);
+ ggml_build_forward_expand(graph, k_cur);
+ ggml_build_forward_expand(graph, v_cur);
+
+ llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+ struct ggml_tensor * cur;
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+ wo, wo_b,
+ q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+ cb(cur, "kqv_out", il);
+
+ return cur;
+}
+
struct llm_build_context {
const llama_model & model;
+ const llama_context & lctx;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
@@ -4312,6 +4591,7 @@ struct llm_build_context {
const llm_build_cb & cb,
bool worst_case) :
model (lctx.model),
+ lctx (lctx),
hparams (model.hparams),
cparams (lctx.cparams),
batch (batch),
@@ -4361,8 +4641,7 @@ struct llm_build_context {
ctx0 = nullptr;
}
}
-
- struct ggml_cgraph * build_llama() {
+ struct ggml_cgraph * build_orion() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -4372,20 +4651,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -4393,8 +4672,8 @@ struct llm_build_context {
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, cb, il);
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
cb(cur, "attn_norm", il);
// self-attention
@@ -4402,50 +4681,42 @@ struct llm_build_context {
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
+ // if (model.layers[il].bq) {
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ // cb(Qcur, "Qcur", il);
+ // }
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
+ // if (model.layers[il].bk) {
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ // cb(Kcur, "Kcur", il);
+ // }
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
-
- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
+ // if (model.layers[il].bv) {
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ // cb(Vcur, "Vcur", il);
+ // }
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4453,22 +4724,143 @@ struct llm_build_context {
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+
+
+ struct ggml_cgraph * build_llama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
@@ -4564,20 +4956,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -4622,14 +5014,13 @@ struct llm_build_context {
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
// apply ALiBi for 13B model
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4686,20 +5077,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -4751,11 +5142,9 @@ struct llm_build_context {
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4810,15 +5199,15 @@ struct llm_build_context {
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -4852,11 +5241,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4909,19 +5296,19 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -5059,12 +5446,9 @@ struct llm_build_context {
);
cb(Vcur, "Vcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- // TODO: not tested, could be broken
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5119,11 +5503,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
@@ -5151,11 +5535,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5211,11 +5593,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -5249,11 +5631,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5306,11 +5686,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
@@ -5344,11 +5724,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5404,20 +5782,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -5435,12 +5813,24 @@ struct llm_build_context {
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -5456,11 +5846,9 @@ struct llm_build_context {
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5517,20 +5905,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -5573,11 +5961,9 @@ struct llm_build_context {
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5622,6 +6008,126 @@ struct llm_build_context {
return gf;
}
+
+ struct ggml_cgraph * build_qwen2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, Qcur);
+ ggml_build_forward_expand(gf, Kcur);
+ ggml_build_forward_expand(gf, Vcur);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -5634,20 +6140,20 @@ struct llm_build_context {
struct ggml_tensor * ffn_output;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -5703,11 +6209,9 @@ struct llm_build_context {
);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
@@ -5758,20 +6262,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@@ -5808,46 +6312,142 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;
cur = attention_norm;
- // feed-forward network
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_gpt2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * pos;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
{
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
- cur = ggml_add(ctx0, cur, sa_out);
- cb(cur, "l_out", il);
-
- cur = ggml_add(ctx0, cur, inpL);
- cb(cur, "l_out", il);
-
- // input for next layer
- inpL = cur;
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
}
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
- // lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -5856,33 +6456,32 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_gpt2() {
+ struct ggml_cgraph * build_codeshell() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
- struct ggml_tensor * pos;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
- cb(pos, "pos_embd", -1);
-
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ }
for (int il = 0; il < n_layer; ++il) {
cur = llm_build_norm(ctx0, inpL, hparams,
@@ -5899,21 +6498,31 @@ struct llm_build_context {
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
+ cb(tmpq, "tmpq", il);
+ cb(tmpk, "tmpk", il);
cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ struct ggml_tensor * Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+ struct ggml_tensor * Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5965,15 +6574,7 @@ static struct ggml_cgraph * llama_build_graph(
// check if we should build the worst-case graph (for memory measurement)
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
- // keep track of the input that has already been allocated
- bool alloc_inp_tokens = false;
- bool alloc_inp_embd = false;
- bool alloc_inp_pos = false;
- bool alloc_inp_KQ_mask = false;
- bool alloc_inp_K_shift = false;
-
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
- // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
@@ -5981,118 +6582,78 @@ static struct ggml_cgraph * llama_build_graph(
ggml_set_name(cur, name);
}
- //
- // allocate input tensors and set input data
- //
-
- if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
- ggml_tallocr_alloc(lctx.alloc, cur);
-
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
- const int64_t n_tokens = cur->ne[0];
-
- ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
+ if (!lctx.cparams.offload_kqv) {
+ if (strcmp(name, "kqv_merged_cont") == 0) {
+ // all nodes between the KV store and the attention output are run on the CPU
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
}
-
- alloc_inp_tokens = true;
}
+ };
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
- ggml_tallocr_alloc(lctx.alloc, cur);
+ struct ggml_cgraph * result = NULL;
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
- const int64_t n_embd = cur->ne[0];
- const int64_t n_tokens = cur->ne[1];
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
- ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
- }
+ //
+ // set input data
+ //
- alloc_inp_embd = true;
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
+ if (batch.token) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
}
- if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
- ggml_tallocr_alloc(lctx.alloc, cur);
+ if (batch.embd) {
+ const int64_t n_embd = llm.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
- const int64_t n_tokens = cur->ne[0];
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
- static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
- ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
- }
+ if (batch.pos) {
+ const int64_t n_tokens = batch.n_tokens;
- alloc_inp_pos = true;
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
}
- if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
- ggml_tallocr_alloc(lctx.alloc, cur);
+ {
+ const int64_t n_kv = llm.n_kv;
+ const int64_t n_tokens = batch.n_tokens;
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
- const int64_t n_kv = cur->ne[0];
- const int64_t n_tokens = cur->ne[1];
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+ float * data = (float *) lctx.inp_KQ_mask->data;
- float * data;
- if (ggml_backend_buffer_is_host(cur->buffer)) {
- data = (float *) cur->data;
- } else {
- lctx.buf_copy.resize(ggml_nbytes(cur));
- data = (float *) lctx.buf_copy.data();
- }
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j][0];
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- float f;
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
- f = -INFINITY;
- } else {
- f = 0;
- }
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+ for (int i = 0; i < n_kv; ++i) {
+ float f;
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+ f = -INFINITY;
+ } else {
+ f = 0;
}
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
}
-
- if (data != cur->data) {
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
- }
}
-
- alloc_inp_KQ_mask = true;
}
- if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
- ggml_tallocr_alloc(lctx.alloc, cur);
-
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
- const int64_t n_ctx = cur->ne[0];
-
- int32_t * data;
- if (ggml_backend_buffer_is_host(cur->buffer)) {
- data = (int32_t *) cur->data;
- } else {
- lctx.buf_copy.resize(ggml_nbytes(cur));
- data = (int32_t *) lctx.buf_copy.data();
- }
+ if (llm.do_rope_shift) {
+ const int64_t n_ctx = llm.n_ctx;
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = lctx.kv_self.cells[i].delta;
- }
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
- if (data != cur->data) {
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
- }
+ for (int i = 0; i < n_ctx; ++i) {
+ data[i] = lctx.kv_self.cells[i].delta;
}
-
- alloc_inp_K_shift = true;
}
- };
-
- struct ggml_cgraph * result = NULL;
-
- struct llm_build_context llm(lctx, batch, cb, worst_case);
+ }
llm.init();
@@ -6137,6 +6698,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen();
} break;
+ case LLM_ARCH_QWEN2:
+ {
+ result = llm.build_qwen2();
+ } break;
case LLM_ARCH_PHI2:
{
result = llm.build_phi2();
@@ -6149,6 +6714,14 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gpt2();
} break;
+ case LLM_ARCH_CODESHELL:
+ {
+ result = llm.build_codeshell();
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ result = llm.build_orion();
+ } break;
default:
GGML_ASSERT(false);
}
@@ -6254,6 +6827,7 @@ static int llama_decode_internal(
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
ggml_backend_sched_reset(lctx.sched);
+ ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
ggml_cgraph * gf = llama_build_graph(lctx, batch);
@@ -6280,7 +6854,7 @@ static int llama_decode_internal(
}
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
- if (ggml_cpu_has_cublas() && fully_offloaded) {
+ if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
n_threads = 1;
}
@@ -7574,6 +8148,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
}
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+ // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+ // if (k >= (int32_t)candidates->size) {
+ // return;
+ // }
+
const int64_t t_start_sample_us = ggml_time_us();
k = std::max(k, (int) min_keep);
@@ -7584,10 +8163,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
- if (k == (int) candidates->size) {
- std::sort(candidates->data, candidates->data + candidates->size, comp);
- } else {
+ if (k <= 128) {
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+ } else {
+ constexpr int nbuckets = 128;
+ constexpr float bucket_low = -10.0f;
+ constexpr float bucket_high = 10.0f;
+ constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+ constexpr float bucker_inter = -bucket_low * bucket_scale;
+
+ std::vector<int> bucket_idx(candidates->size);
+ std::vector<int> histo(nbuckets, 0);
+
+ for (int i = 0; i < (int)candidates->size; ++i) {
+ const float val = candidates->data[i].logit;
+ int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+ ib = std::max(0, std::min(nbuckets-1, ib));
+ bucket_idx[i] = ib;
+ ++histo[ib];
+ }
+ int nhave = 0;
+ int ib = nbuckets - 1;
+ for ( ; ib >= 0; --ib) {
+ nhave += histo[ib];
+ if (nhave >= k) break;
+ }
+ std::vector<llama_token_data> tmp_tokens(nhave);
+ auto ptr = tmp_tokens.data();
+ std::vector<llama_token_data*> bucket_ptrs;
+ bucket_ptrs.reserve(nbuckets - ib);
+ for (int j = nbuckets - 1; j >= ib; --j) {
+ bucket_ptrs.push_back(ptr);
+ ptr += histo[j];
+ }
+ for (int i = 0; i < (int)candidates->size; ++i) {
+ int j = bucket_idx[i];
+ if (j >= ib) {
+ *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+ }
+ }
+
+ ptr = tmp_tokens.data();
+ int ndone = 0;
+ for (int j = nbuckets-1; j > ib; --j) {
+ std::sort(ptr, ptr + histo[j], comp);
+ ptr += histo[j];
+ ndone += histo[j];
+ }
+ std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+ std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
}
candidates->sorted = true;
}
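For readers skimming the hunk above: when k is large, the new path histograms the logits into 128 fixed buckets, keeps only the top buckets that can cover the k best entries, and sorts just those instead of the whole candidate array. Below is a minimal standalone sketch of that idea over plain floats; it is illustrative only (the helper name and the bucket range are assumptions, not llama.cpp API) and it skips the per-bucket sorting refinement the real code does.

```cpp
// Illustrative sketch of bucketed top-k selection (not the llama.cpp implementation).
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

static std::vector<float> top_k_bucketed(const std::vector<float> & vals, int k) {
    constexpr int   nbuckets     = 128;
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets / (bucket_high - bucket_low);

    std::vector<int> histo(nbuckets, 0);
    std::vector<int> bucket_idx(vals.size());

    // histogram pass: assign each value to a bucket, clamping outliers
    for (size_t i = 0; i < vals.size(); ++i) {
        int ib = int(bucket_scale * (vals[i] - bucket_low));
        ib = std::max(0, std::min(nbuckets - 1, ib));
        bucket_idx[i] = ib;
        ++histo[ib];
    }

    // walk buckets from the top until they cover at least k elements
    int nhave = 0;
    int ib    = nbuckets - 1;
    for (; ib >= 0; --ib) {
        nhave += histo[ib];
        if (nhave >= k) break;
    }

    // collect only the candidates from the surviving buckets and sort those
    std::vector<float> out;
    out.reserve(nhave);
    for (size_t i = 0; i < vals.size(); ++i) {
        if (bucket_idx[i] >= ib) {
            out.push_back(vals[i]);
        }
    }
    std::partial_sort(out.begin(), out.begin() + k, out.end(), std::greater<float>());
    out.resize(k);
    return out;
}

int main() {
    const std::vector<float> logits = { 0.1f, 3.2f, -1.5f, 7.9f, 2.2f, 5.5f, -0.3f, 4.1f };
    for (float v : top_k_bucketed(logits, 3)) {
        printf("%.1f\n", v); // prints 7.9, 5.5, 4.1
    }
}
```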
@@ -7635,21 +8261,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
return;
}
- llama_sample_softmax(ctx, candidates);
-
const int64_t t_start_sample_us = ggml_time_us();
- float scale = candidates->data[0].p; // scale by max prob
- size_t i = 1; // first token always matches
+ bool min_p_applied = false;
+
+ // if the candidates aren't sorted, try the unsorted implementation first
+ if (!candidates->sorted) {
+ std::vector<llama_token_data> filtered_tokens;
+
+ float max_logit = -FLT_MAX;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ max_logit = std::max(max_logit, candidates->data[i].logit);
+ }
+ const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ if (candidates->data[i].logit >= min_logit) {
+ filtered_tokens.push_back(candidates->data[i]);
+ }
+ }
- for (; i < candidates->size; ++i) {
- if (candidates->data[i].p < p * scale && i >= min_keep) {
- break; // prob too small
+ // if we have enough values the operation was a success
+ if (filtered_tokens.size() >= min_keep) {
+ memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+ candidates->size = filtered_tokens.size();
+ min_p_applied = true;
}
}
- // Resize the output vector to keep only the matching tokens
- candidates->size = i;
+ // if the candidates are sorted or the unsorted implementation failed, use this implementation
+ if (!min_p_applied) {
+ // Sort the logits in descending order
+ if (!candidates->sorted) {
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+ candidates->sorted = true;
+ }
+
+ const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+ size_t i = 1; // first token always matches
+
+ for (; i < candidates->size; ++i) {
+ if (candidates->data[i].logit < min_logit && i >= min_keep) {
+ break; // prob too small
+ }
+ }
+
+ // Resize the output vector to keep only the matching tokens
+ candidates->size = i;
+ }
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
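The unsorted fast path above avoids the softmax-and-sort entirely by using the equivalence p_i >= p * p_max  <=>  logit_i >= logit_max + log(p). A hedged standalone sketch of that filter over raw logits follows; the function and variable names are illustrative, not part of the llama.cpp API.

```cpp
// Illustrative sketch of the unsorted min-p filter over raw logits.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// keep only entries whose softmax probability would be >= p * (max probability)
static std::vector<float> min_p_filter(const std::vector<float> & logits, float p) {
    float max_logit = -INFINITY;
    for (float l : logits) {
        max_logit = std::max(max_logit, l);
    }
    // p_i >= p * p_max  <=>  logit_i >= max_logit + log(p), since softmax is monotone
    const float min_logit = max_logit + std::log(p);

    std::vector<float> kept;
    for (float l : logits) {
        if (l >= min_logit) {
            kept.push_back(l);
        }
    }
    return kept;
}

int main() {
    const std::vector<float> logits = { 1.0f, 4.0f, 3.5f, -2.0f };
    for (float l : min_p_filter(logits, 0.1f)) {
        printf("%.1f\n", l); // 4.0 and 3.5 survive; 1.0 and -2.0 fall below the threshold
    }
}
```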
@@ -7779,6 +8440,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
}
}
+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // no need to do anything if there is only one (or zero) candidates
+ if(candidates_p->size <= 1) {
+ return;
+ }
+
+ // Calculate maximum possible entropy
+ float max_entropy = -logf(1.0f / candidates_p->size);
+
+ llama_sample_softmax(nullptr, candidates_p);
+
+ // Calculate entropy of the softmax probabilities
+ float entropy = 0.0f;
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ float prob = candidates_p->data[i].p;
+ if (prob > 0.0f) { // Ensure no log(0)
+ entropy -= prob * logf(prob);
+ }
+ }
+
+ // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
+ float normalized_entropy = entropy / max_entropy;
+
+ // Map the normalized entropy to the desired temperature range using the power function
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+#ifdef DEBUG
+ LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+#endif
+
+ // Apply the dynamically calculated temperature scaling
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].logit /= dyn_temp;
+ }
+
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
+ double max_l_double = candidates_p->data[0].logit;
+ double cum_sum_double = 0.0;
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ double p = exp(candidates_p->data[i].logit - max_l_double);
+ candidates_p->data[i].p = p; // Store the scaled probability
+ cum_sum_double += p;
+ }
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+ }
+
+#ifdef DEBUG
+ // Print the updated top 25 probabilities after temperature scaling
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+ for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
+ }
+#endif
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+}
+
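llama_sample_entropy boils down to mapping the normalized entropy of the candidate distribution into [min_temp, max_temp] through a power curve: dyn_temp = min_temp + (max_temp - min_temp) * (H / log n)^exponent. A small self-contained sketch of just that mapping, assuming the probabilities are already normalized; the helper name below is made up for the example.

```cpp
// Illustrative sketch: map the entropy of a probability distribution to a temperature.
#include <cmath>
#include <cstdio>
#include <vector>

static float dynamic_temperature(const std::vector<float> & probs,
                                 float min_temp, float max_temp, float exponent) {
    if (probs.size() <= 1) {
        return min_temp;
    }
    // entropy of the distribution and its maximum possible value log(n)
    float entropy = 0.0f;
    for (float p : probs) {
        if (p > 0.0f) {
            entropy -= p * std::log(p);
        }
    }
    const float max_entropy = std::log((float) probs.size());
    const float normalized  = entropy / max_entropy;

    // low entropy (confident model)  -> temperature near min_temp
    // high entropy (uncertain model) -> temperature near max_temp
    return min_temp + (max_temp - min_temp) * std::pow(normalized, exponent);
}

int main() {
    printf("peaked : %.3f\n", dynamic_temperature({0.97f, 0.01f, 0.01f, 0.01f}, 0.0f, 2.0f, 1.0f));
    printf("uniform: %.3f\n", dynamic_temperature({0.25f, 0.25f, 0.25f, 0.25f}, 0.0f, 2.0f, 1.0f));
}
```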
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
const int64_t t_start_sample_us = ggml_time_us();
@@ -7898,39 +8626,59 @@ static void llama_log_softmax(float * array, size_t size) {
}
}
+void llama_sample_apply_guidance(
+ struct llama_context * ctx,
+ float * logits,
+ float * logits_guidance,
+ float scale) {
+ GGML_ASSERT(ctx);
+
+ const auto t_start_sample_us = ggml_time_us();
+ const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ llama_log_softmax(logits, n_vocab);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ auto & l = logits[i];
+ const auto & g = logits_guidance[i];
+
+ l = scale * (l - g) + g;
+ }
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
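The new llama_sample_apply_guidance blends the main and guidance distributions in log-softmax space as l' = g + scale * (l - g), so scale = 1 leaves the logits unchanged and larger values push further away from the guidance context. A minimal numeric sketch of that blend; the local log_softmax below is a stand-in written for this example, not the internal llama_log_softmax.

```cpp
// Illustrative sketch of classifier-free guidance blending in log-softmax space.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void log_softmax(std::vector<float> & v) {
    float max_l = v[0];
    for (float x : v) max_l = std::max(max_l, x);
    float sum = 0.0f;
    for (float x : v) sum += std::exp(x - max_l);
    const float log_sum = std::log(sum);
    for (float & x : v) x = x - max_l - log_sum;
}

int main() {
    std::vector<float> logits          = { 2.0f, 1.0f, 0.5f };  // main context
    std::vector<float> logits_guidance = { 1.0f, 2.0f, 0.5f };  // negative-prompt context
    const float scale = 1.5f; // > 1.0 pushes further away from the guidance distribution

    log_softmax(logits);
    log_softmax(logits_guidance);

    for (size_t i = 0; i < logits.size(); ++i) {
        logits[i] = scale * (logits[i] - logits_guidance[i]) + logits_guidance[i];
        printf("blended logit %zu: %.3f\n", i, logits[i]);
    }
}
```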
void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale) {
- int64_t t_start_sample_us = ggml_time_us();
-
GGML_ASSERT(ctx);
+ int64_t t_start_sample_us;
- auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+ t_start_sample_us = ggml_time_us();
+ const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
- GGML_ASSERT(n_vocab == (int)candidates->size);
+ GGML_ASSERT(n_vocab == candidates->size);
GGML_ASSERT(!candidates->sorted);
- std::vector<float> logits_base;
- logits_base.reserve(candidates->size);
- for (size_t i = 0; i < candidates->size; ++i) {
- logits_base.push_back(candidates->data[i].logit);
+ std::vector<float> logits_base(n_vocab);
+ for (size_t i = 0; i < n_vocab; ++i) {
+ logits_base[i] = candidates->data[i].logit;
}
- llama_log_softmax(logits_base.data(), candidates->size);
- float* logits_guidance = llama_get_logits(guidance_ctx);
- llama_log_softmax(logits_guidance, n_vocab);
+ float * logits_guidance = llama_get_logits(guidance_ctx);
- for (int i = 0; i < n_vocab; ++i) {
- float logit_guidance = logits_guidance[i];
- float logit_base = logits_base[i];
- candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
+ t_start_sample_us = ggml_time_us();
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ for (size_t i = 0; i < n_vocab; ++i) {
+ candidates->data[i].logit = logits_base[i];
}
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
@@ -8347,13 +9095,19 @@ struct quantize_state_internal {
const llama_model_quantize_params * params;
int n_attention_wv = 0;
- int n_feed_forward_w2 = 0;
+ int n_ffn_down = 0;
+ int n_ffn_gate = 0;
+ int n_ffn_up = 0;
int i_attention_wv = 0;
- int i_feed_forward_w2 = 0;
+ int i_ffn_down = 0;
+ int i_ffn_gate = 0;
+ int i_ffn_up = 0;
int n_k_quantized = 0;
int n_fallback = 0;
+ bool has_imatrix = false;
+
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
@@ -8431,6 +9185,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
};
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+ if (n_expert > 1) {
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+ // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
+ // tensor name.
+ n_layer /= n_expert;
+ if (sscanf(name, "blk.%d.", &i_layer) != 1) {
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+ }
+ if (i_layer < 0 || i_layer >= n_layer) {
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+ }
+ }
+ return std::make_pair(i_layer, n_layer);
+ };
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
@@ -8450,12 +9221,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
++qs.i_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
- ++qs.i_feed_forward_w2;
+ if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+ ++qs.i_ffn_down;
}
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
} else if (name.find("attn_v.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q4_K;
+ }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
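The layer_info lambda added above falls back to parsing the tensor name because expert tensors in Mixtral-style models are not laid out per layer, so the running i_ffn_* counters cannot simply be divided by n_expert. A tiny standalone illustration of that "blk.%d." parse; the tensor name in main() is hypothetical.

```cpp
// Illustrative sketch: extract the layer index from a "blk.<N>." style tensor name.
#include <cstdio>
#include <stdexcept>
#include <string>

static int layer_from_name(const std::string & name, int n_layer) {
    int i_layer = -1;
    if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
        throw std::runtime_error("failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("bad layer index for tensor " + name);
    }
    return i_layer;
}

int main() {
    // hypothetical Mixtral-style expert tensor name
    printf("layer = %d\n", layer_from_name("blk.17.ffn_down.3.weight", 32)); // layer = 17
}
```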
@@ -8483,27 +9259,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
- } else if (name.find("ffn_down") != std::string::npos) {
- const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
- int i_layer, n_layer;
- if (n_expert == 1) {
- i_layer = qs.i_feed_forward_w2;
- n_layer = qs.n_feed_forward_w2;
- } else {
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
- // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
- // for getting the current layer as I initially thought, and we need to resort to parsing the
- // tensor name.
- n_layer = qs.n_feed_forward_w2 / n_expert;
- if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
- throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
- }
- if (i_layer < 0 || i_layer >= n_layer) {
- throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
- }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+ new_type = GGML_TYPE_Q2_K;
}
+ } else if (name.find("ffn_down") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8526,11 +9289,19 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
- ++qs.i_feed_forward_w2;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+ && qs.has_imatrix && i_layer < n_layer/8) {
+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+ }
+ ++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
new_type = GGML_TYPE_Q5_K;
}
@@ -8548,6 +9319,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
+ else if (name.find("ffn_gate") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ ++qs.i_ffn_gate;
+ }
+ else if (name.find("ffn_up") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ ++qs.i_ffn_up;
+ }
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ //}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -8559,7 +9348,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
//}
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
@@ -8571,6 +9361,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
if (convert_incompatible_tensor) {
switch (new_type) {
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -8599,8 +9391,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
// K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -8646,6 +9439,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+ qs.has_imatrix = true;
}
}
@@ -8667,12 +9461,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
++qs.n_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
- ++qs.n_feed_forward_w2;
+ ++qs.n_ffn_down;
+ }
+ else if (name.find("ffn_gate") != std::string::npos) {
+ ++qs.n_ffn_gate;
+ }
+ else if (name.find("ffn_up") != std::string::npos) {
+ ++qs.n_ffn_up;
}
}
- if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
- __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+ if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+ __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
}
size_t total_size_org = 0;
@@ -8705,8 +9505,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// placeholder for the meta data
::zeros(fout, meta_size);
- std::set<ggml_type> used_iq2;
-
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
@@ -8759,11 +9557,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
const size_t nelements = ggml_nelements(tensor);
- if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
- ggml_init_iq2_quantization(new_type);
- used_iq2.insert(new_type);
- }
-
const float * imatrix = nullptr;
if (imatrix_data) {
auto it = imatrix_data->find(tensor->name);
@@ -8889,10 +9682,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
fout.close();
- for (auto type : used_iq2) {
- ggml_deinit_iq2_quantization(type);
- }
-
gguf_free(ctx_out);
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9238,6 +10027,8 @@ struct llama_context_params llama_context_default_params() {
/*.yarn_beta_fast =*/ 32.0f,
/*.yarn_beta_slow =*/ 1.0f,
/*.yarn_orig_ctx =*/ 0,
+ /*.cb_eval =*/ nullptr,
+ /*.cb_eval_user_data =*/ nullptr,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
/*.mul_mat_q =*/ true,
@@ -9298,6 +10089,7 @@ void llama_backend_free(void) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();
#endif
+ ggml_quantize_free();
}
int64_t llama_time_us(void) {
@@ -9378,6 +10170,9 @@ struct llama_context * llama_new_context_with_model(
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
hparams.n_ctx_train;
+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
+
auto rope_scaling_type = params.rope_scaling_type;
if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
rope_scaling_type = hparams.rope_scaling_type_train;
@@ -9444,6 +10239,26 @@ struct llama_context * llama_new_context_with_model(
}
}
}
+#elif defined(GGML_USE_VULKAN)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_vk_init();
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+#elif defined(GGML_USE_SYCL)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
@@ -9485,6 +10300,35 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd);
}
+ // graph inputs
+ {
+ ggml_init_params init_params = {
+ /* .mem_size */ ggml_tensor_overhead()*5,
+ /* .mem_buffer */ nullptr,
+ /* .no_alloc */ true,
+ };
+ ctx->ctx_input = ggml_init(init_params);
+
+ ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+ ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
+ ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+ ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+ ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+
+ ggml_set_name(ctx->inp_tokens, "inp_tokens");
+ ggml_set_name(ctx->inp_embd, "inp_embd");
+ ggml_set_name(ctx->inp_pos, "inp_pos");
+ ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+ ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+
+ ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+
+ LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buffer_name(ctx->buf_input),
+ ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+ }
+
+ // scheduler and compute buffers
{
// buffer types used for the compute buffer of each backend
std::vector<ggml_backend_buffer_type_t> backend_buft;
@@ -9511,9 +10355,6 @@ struct llama_context * llama_new_context_with_model(
// initialize scheduler with the worst-case graph
ggml_backend_sched_init_measure(ctx->sched, gf);
- // note: the number of splits during measure is higher than during inference due to the kv shift
- int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
- LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
for (ggml_backend_t backend : ctx->backends) {
@@ -9522,6 +10363,10 @@ struct llama_context * llama_new_context_with_model(
ggml_backend_buffer_name(buf),
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
+
+ // note: the number of splits during measure is higher than during inference due to the kv shift
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
}
}
diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h
index 79c8335b66b..3e33072c68c 100644
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@@ -2,9 +2,13 @@
#define LLAMA_H
#include "ggml.h"
+#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
@@ -45,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
@@ -106,6 +110,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@@ -231,6 +236,9 @@ extern "C" {
float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
+
enum ggml_type type_k; // data type for K cache
enum ggml_type type_v; // data type for V cache
@@ -714,14 +722,21 @@ extern "C" {
float penalty_present);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
- /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
- /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
- /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- LLAMA_API void llama_sample_classifier_free_guidance(
+ /// @param logits Logits extracted from the original generation context.
+ /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+ /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+ LLAMA_API void llama_sample_apply_guidance(
+ struct llama_context * ctx,
+ float * logits,
+ float * logits_guidance,
+ float scale);
+
+ LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
- float scale);
+ float scale),
+ "use llama_sample_apply_guidance() instead");
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(
@@ -763,6 +778,14 @@ extern "C" {
float p,
size_t min_keep);
+ /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+ LLAMA_API void llama_sample_entropy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates_p,
+ float min_temp,
+ float max_temp,
+ float exponent_val);
+
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp
index 5eef1f4e619..d418d0c32fc 100644
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -14,6 +14,7 @@
#include
#include
#include
+#include <sstream>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
auto * model = llama_get_model(ctx);
@@ -68,6 +69,8 @@ struct whisper_params {
std::string person = "Georgi";
std::string bot_name = "LLaMA";
+ std::string wake_cmd = "";
+ std::string heard_ok = "";
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
@@ -104,6 +107,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-bn" || arg == "--bot-name") { params.bot_name = argv[++i]; }
else if (arg == "--session") { params.path_session = argv[++i]; }
+ else if (arg == "-w" || arg == "--wake-command") { params.wake_cmd = argv[++i]; }
+ else if (arg == "-ho" || arg == "--heard-ok") { params.heard_ok = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
@@ -149,6 +154,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -bn NAME, --bot-name NAME [%-7s] bot name (to display)\n", params.bot_name.c_str());
+ fprintf(stderr, " -w TEXT, --wake-command T [%-7s] wake-up command to listen for\n", params.wake_cmd.c_str());
+ fprintf(stderr, " -ho TEXT, --heard-ok TEXT [%-7s] said by TTS before generating reply\n", params.heard_ok.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
@@ -227,6 +234,18 @@ std::string transcribe(
return result;
}
+std::vector<std::string> get_words(const std::string &txt) {
+ std::vector<std::string> words;
+
+ std::istringstream iss(txt);
+ std::string word;
+ while (iss >> word) {
+ words.push_back(word);
+ }
+
+ return words;
+}
+
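Further down in this file, the wake-word handling splits the transcript returned by whisper into the words that must match the wake command and the remainder that is forwarded to the LLM. A hedged usage sketch that reuses the get_words helper from this diff; the wake command and transcript strings are made up.

```cpp
// Illustrative sketch: split a transcript into a wake-command prefix and the remaining text.
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> get_words(const std::string & txt) {
    std::vector<std::string> words;
    std::istringstream iss(txt);
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }
    return words;
}

int main() {
    const std::string wake_cmd  = "hey assistant";                  // hypothetical wake command
    const std::string all_heard = "hey assistant what time is it";  // hypothetical transcript

    const size_t wake_len = get_words(wake_cmd).size();
    const auto   words    = get_words(all_heard);

    std::string wake_cmd_heard, text_heard;
    for (size_t i = 0; i < words.size(); ++i) {
        (i < wake_len ? wake_cmd_heard : text_heard) += words[i] + " ";
    }

    printf("wake part: '%s'\n", wake_cmd_heard.c_str()); // "hey assistant "
    printf("text part: '%s'\n", text_heard.c_str());     // "what time is it "
}
```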
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
@@ -441,6 +460,16 @@ int main(int argc, char ** argv) {
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
printf("%s : done! start speaking in the microphone\n", __func__);
+
+ // show wake command if enabled
+ const std::string wake_cmd = params.wake_cmd;
+ const int wake_cmd_length = get_words(wake_cmd).size();
+ const bool use_wake_cmd = wake_cmd_length > 0;
+
+ if (use_wake_cmd) {
+ printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
+ }
+
printf("\n");
printf("%s%s", params.person.c_str(), chat_symb.c_str());
fflush(stdout);
@@ -486,10 +515,41 @@ int main(int argc, char ** argv) {
audio.get(params.voice_ms, pcmf32_cur);
- std::string text_heard;
+ std::string all_heard;
if (!force_speak) {
- text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
+ all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
+ }
+
+ const auto words = get_words(all_heard);
+
+ std::string wake_cmd_heard;
+ std::string text_heard;
+
+ for (int i = 0; i < (int) words.size(); ++i) {
+ if (i < wake_cmd_length) {
+ wake_cmd_heard += words[i] + " ";
+ } else {
+ text_heard += words[i] + " ";
+ }
+ }
+
+ // check if audio starts with the wake-up command if enabled
+ if (use_wake_cmd) {
+ const float sim = similarity(wake_cmd_heard, wake_cmd);
+
+ if ((sim < 0.7f) || (text_heard.empty())) {
+ audio.clear();
+ continue;
+ }
+ }
+
+ // optionally give audio feedback that the current text is being processed
+ if (!params.heard_ok.empty()) {
+ int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str());
+ if (ret != 0) {
+ fprintf(stderr, "%s: failed to speak\n", __func__);
+ }
}
// remove text between brackets using regex
diff --git a/examples/talk-llama/unicode.h b/examples/talk-llama/unicode.h
index aeca879ea68..844eff3dad1 100644
--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@@ -2,8 +2,9 @@
#include
#include
-#include
+#include
#include
+#include
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
diff --git a/examples/whisper.android/lib/src/main/jni/whisper/jni.c b/examples/whisper.android/lib/src/main/jni/whisper/jni.c
index 08825ed94c3..7f9d724617d 100644
--- a/examples/whisper.android/lib/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/lib/src/main/jni/whisper/jni.c
@@ -228,6 +228,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, j
UNUSED(thiz);
const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
+ return string;
}
JNIEXPORT jstring JNICALL
@@ -236,4 +237,5 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *en
UNUSED(thiz);
const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
+ return string;
}
diff --git a/examples/whisper.objc/README.md b/examples/whisper.objc/README.md
index bb55653dcef..ece74aed29f 100644
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@@ -11,11 +11,11 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b
## Usage
-```java
+```bash
git clone https://github.com/ggerganov/whisper.cpp
open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
-// If you don't want to convert a Core ML model, you can skip this step by create dummy model
+# if you don't want to convert a Core ML model, you can skip this step by creating a dummy model
mkdir models/ggml-base.en-encoder.mlmodelc
```
diff --git a/extra/sync-ggml.last b/extra/sync-ggml.last
index 7082f05c7f3..b559c8dd106 100644
--- a/extra/sync-ggml.last
+++ b/extra/sync-ggml.last
@@ -1 +1 @@
-bca51b528820d28f54ea092fd4deaafc812f39d9
+6b14d738d9100c50c199a3b1aaa960f633904476
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 89b85d34870..f9be6e1cbc8 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
- fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
- __func__, size, max_avail);
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+ __func__, tensor->name, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
}
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
}
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
- return alloc->max_size;
+ // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+ // to avoid this, we add a 10% margin to the buffer size
+ return alloc->max_size + alloc->max_size/10;
}
// graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
}
// utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
- size_t alignment = ggml_backend_buft_get_alignment(buft);
-
- size_t nbytes = 0;
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- if (t->data == NULL && t->view_src == NULL) {
- nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
- }
- }
-
- if (nbytes == 0) {
- // all the tensors in the context are already allocated
-#ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
- return NULL;
- }
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+ struct ggml_tensor * first, struct ggml_tensor * last,
+ ggml_backend_buffer_type_t buft, size_t size,
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
- // failed to allocate buffer
#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
- return NULL;
+ for (size_t i = 0; i < *n_buffers; i++) {
+ ggml_backend_buffer_free((*buffers)[i]);
+ }
+ free(*buffers);
+ return false;
}
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
ggml_tallocr_free(tallocr);
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+ (*buffers)[(*n_buffers)++] = buffer;
+
+ return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+ ggml_backend_buffer_t * buffers = NULL;
+ size_t n_buffers = 0;
+
+ size_t cur_buf_size = 0;
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ size_t this_size = 0;
+ if (t->data == NULL && t->view_src == NULL) {
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+ }
+
+ if (this_size > max_size) {
+ // tensor is too large to fit in a single buffer
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ __func__, t->name,
+ ggml_backend_buft_name(buft),
+ this_size, max_size);
+ for (size_t i = 0; i < n_buffers; i++) {
+ ggml_backend_buffer_free(buffers[i]);
+ }
+ free(buffers);
+ return NULL;
+ }
+
+ if ((cur_buf_size + this_size) > max_size) {
+ // allocate tensors in the current buffer
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ first = t;
+ cur_buf_size = this_size;
+ } else {
+ cur_buf_size += this_size;
+ }
+ }
+
+ // allocate remaining tensors
+ if (cur_buf_size > 0) {
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ }
+
+ if (n_buffers == 0) {
+ // all the tensors in the context are already allocated
+#ifndef NDEBUG
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+ return NULL;
+ }
+
+ ggml_backend_buffer_t buffer;
+ if (n_buffers == 1) {
+ buffer = buffers[0];
+ } else {
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+ }
+ free(buffers);
return buffer;
}
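
The rewritten ggml_backend_alloc_ctx_tensors_from_buft above groups a context's tensors into as many buffers as the buffer type's max size requires and hands back a single handle (possibly a multi-buffer). A minimal caller-side sketch, using only the public ggml/ggml-backend API that appears in this patch; the function name alloc_weights_example and the tensor shapes are illustrative:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Illustrative only: create a no-alloc context, define a couple of tensors,
// and let ggml_backend_alloc_ctx_tensors_from_buft place them in one or more
// backend buffers, returned transparently as a single handle.
static ggml_backend_buffer_t alloc_weights_example(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16*ggml_tensor_overhead(), // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,                      // required: data is allocated by the backend
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    // may internally allocate several buffers if the buffer type reports a max size
    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

    // ... use the tensors, then free with ggml_backend_buffer_free(buf) and ggml_free(ctx)
    return buf;
}
```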
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 1db32901fe6..f95df47f72b 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -16,14 +16,15 @@ extern "C" {
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
- const char * (*get_name) (ggml_backend_buffer_type_t buft);
- ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
- size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
- size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
- bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+ ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+ bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
- bool (*is_host) (ggml_backend_buffer_type_t buft);
+ bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
@@ -35,15 +36,15 @@ extern "C" {
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
- const char * (*get_name) (ggml_backend_buffer_t buffer);
- void (*free_buffer)(ggml_backend_buffer_t buffer);
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
- void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
- void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+ void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+ void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
+ void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
@@ -54,7 +55,7 @@ extern "C" {
enum ggml_backend_buffer_usage usage;
};
- ggml_backend_buffer_t ggml_backend_buffer_init(
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
@@ -63,6 +64,11 @@ extern "C" {
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+ // buffer that contains a collection of buffers
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
//
// Backend
//
@@ -70,31 +76,31 @@ extern "C" {
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
- const char * (*get_name)(ggml_backend_t backend);
+ const char * (*GGML_CALL get_name)(ggml_backend_t backend);
- void (*free)(ggml_backend_t backend);
+ void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
- ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+ ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
- void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
- void (*synchronize)(ggml_backend_t backend);
+ void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan
- ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
- void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+ void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
- bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
- bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
};
struct ggml_backend {
@@ -107,9 +113,9 @@ extern "C" {
// Backend registry
//
- typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+ typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
diff --git a/ggml-backend.c b/ggml-backend.c
index 505dbba4762..0764dfebca6 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name(buft);
}
-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
return buft->iface.alloc_buffer(buft, size);
}
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
return buft->iface.get_alignment(buft);
}
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+ // get_max_size is optional, defaults to SIZE_MAX
+ if (buft->iface.get_max_size) {
+ return buft->iface.get_max_size(buft);
+ }
+ return SIZE_MAX;
+}
+
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
- return buft->iface.get_alloc_size(buft, tensor);
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
+ assert(size >= ggml_nbytes(tensor));
+ return size;
}
return ggml_nbytes(tensor);
}
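
Because get_max_size is optional and defaults to SIZE_MAX, callers can query it unconditionally. A short sketch of the kind of pre-flight check a caller might perform with the two accessors above; the helper name is hypothetical:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: check whether a single tensor can fit in one buffer
// of the given buffer type, mirroring the check done by the ctx allocator.
static bool tensor_fits_buft_example(ggml_backend_buffer_type_t buft, struct ggml_tensor * t) {
    size_t need = ggml_backend_buft_get_alloc_size(buft, t); // >= ggml_nbytes(t), includes padding
    size_t max  = ggml_backend_buft_get_max_size(buft);      // SIZE_MAX when the backend has no limit
    if (need > max) {
        fprintf(stderr, "tensor %s needs %zu bytes, but the max buffer size is %zu\n", t->name, need, max);
        return false;
    }
    return true;
}
```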
@@ -48,15 +58,13 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
// backend buffer
-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
- GGML_ASSERT(iface.get_base != NULL);
-
(*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface,
/* .buft = */ buft,
@@ -95,7 +103,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return base;
}
-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
// init_tensor is optional
if (buffer->iface.init_tensor) {
buffer->iface.init_tensor(buffer, tensor);
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage;
+
+ // FIXME: add a generic callback to the buffer interface
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
+ }
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -191,7 +212,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
}
}
-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -201,7 +222,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
}
-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -318,9 +339,9 @@ struct ggml_backend_reg {
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
static size_t ggml_backend_registry_count = 0;
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
-static void ggml_backend_registry_init(void) {
+GGML_CALL static void ggml_backend_registry_init(void) {
static bool initialized = false;
if (initialized) {
@@ -333,18 +354,33 @@ static void ggml_backend_registry_init(void) {
// add forward decls here to avoid including the backend headers
#ifdef GGML_USE_CUBLAS
- extern void ggml_backend_cuda_reg_devices(void);
+ extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
ggml_backend_cuda_reg_devices();
#endif
+#ifdef GGML_USE_SYCL
+ extern void ggml_backend_sycl_reg_devices(void);
+ ggml_backend_sycl_reg_devices();
+#endif
+
#ifdef GGML_USE_METAL
- extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
- extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+ extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+ extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif
+
+#ifdef GGML_USE_VULKAN
+ extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+ ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+ extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+ ggml_backend_kompute_reg_devices();
+#endif
}
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
size_t id = ggml_backend_registry_count;
@@ -439,33 +475,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
// backend CPU
-static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
return "CPU";
GGML_UNUSED(buffer);
}
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)buffer->context;
}
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer);
}
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
memcpy(dst->data, src->data, ggml_nbytes(src));
return true;
@@ -475,7 +511,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer);
}
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
@@ -506,13 +542,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU";
GGML_UNUSED(buft);
}
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
@@ -521,30 +557,31 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
}
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_cpu(backend);
GGML_UNUSED(buft);
}
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
GGML_UNUSED(buft);
}
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -561,23 +598,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
#include <hbwmalloc.h>
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
return "CPU_HBM";
GGML_UNUSED(buf);
}
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
//void * ptr = hbw_malloc(size);
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -617,20 +655,20 @@ struct ggml_backend_cpu_context {
size_t work_size;
};
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
return "CPU";
GGML_UNUSED(backend);
}
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
free(cpu_ctx->work_data);
free(cpu_ctx);
free(backend);
}
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
@@ -641,7 +679,7 @@ struct ggml_backend_plan_cpu {
struct ggml_cgraph cgraph;
};
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -656,7 +694,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
return cpu_plan;
}
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
free(cpu_plan->cplan.work_data);
@@ -665,7 +703,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
GGML_UNUSED(backend);
}
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -673,7 +711,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
GGML_UNUSED(backend);
}
-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -690,8 +728,10 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
return true;
}
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
switch (op->op) {
+ case GGML_OP_CPY:
+ return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
default:
@@ -732,7 +772,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
return cpu_backend;
}
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_cpu_name;
}
@@ -743,17 +783,91 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
}
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
return ggml_backend_cpu_init();
GGML_UNUSED(params);
GGML_UNUSED(user_data);
}
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+ ggml_backend_buffer_t * buffers;
+ size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_free(ctx->buffers[i]);
+ }
+
+ free(ctx->buffers);
+ free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
+ }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+ static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+ /* .get_base = */ NULL,
+ /* .init_tensor = */ NULL,
+ /* .set_tensor = */ NULL,
+ /* .get_tensor = */ NULL,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_multi_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+ ctx->n_buffers = n_buffers;
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+ size_t total_size = 0;
+ for (size_t i = 0; i < n_buffers; i++) {
+ ctx->buffers[i] = buffers[i];
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
+ }
+
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+ }
+}
+
// scheduler
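
The combined buffer produced by ggml_backend_multi_buffer_alloc_buffer behaves like a regular buffer for size, clear, usage and free (all forwarded to the children), but exposes no single base pointer. A brief caller-side sketch, assuming two buffers that have already been allocated; everything else comes from this patch:

```c
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-backend-impl.h" // ggml_backend_multi_buffer_alloc_buffer is declared here

// Illustrative sketch: wrap two pre-allocated buffers into one handle.
static void combine_buffers_example(ggml_backend_buffer_t buf_a, ggml_backend_buffer_t buf_b) {
    ggml_backend_buffer_t bufs[2] = { buf_a, buf_b };
    ggml_backend_buffer_t combined = ggml_backend_multi_buffer_alloc_buffer(bufs, 2); // takes ownership

    size_t total = ggml_backend_buffer_get_size(combined); // sum of the child buffer sizes
    (void) total;

    ggml_backend_buffer_clear(combined, 0);                                     // clears every child
    ggml_backend_buffer_set_usage(combined, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); // forwarded to every child
    ggml_backend_buffer_free(combined);                                         // frees the children, then the wrapper
}
```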
@@ -802,6 +916,9 @@ struct ggml_backend_sched {
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+ ggml_backend_sched_eval_callback callback_eval;
+ void * callback_eval_user_data;
};
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1186,6 +1303,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
ggml_tallocr_t src_allocr = node_allocr(src);
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
if (src_allocr != node_allocr) {
+ // create a copy of the input in the split's backend
+ size_t id = hash_id(src);
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
+ node_allocr(tensor_copy) = cur_allocr;
+ SET_CAUSE(tensor_copy, "4.cpy");
+
+ int n_inputs = sched->splits[cur_split].n_inputs++;
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+ sched->splits[cur_split].inputs[n_inputs] = src;
+ }
+ node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
// check if the input is already in the split
bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1201,19 +1336,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
-
- // create a copy of the input in the split's backend
- size_t id = hash_id(src);
- if (sched->node_copies[id][cur_backend_id] == NULL) {
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
- sched->node_copies[id][cur_backend_id] = tensor_copy;
- node_allocr(tensor_copy) = cur_allocr;
- SET_CAUSE(tensor_copy, "4.cpy");
- }
- node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
}
}
}
@@ -1324,9 +1447,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
ggml_graph_dump_dot(split->graph, NULL, split_filename);
#endif
+
uint64_t compute_start_us = ggml_time_us();
- ggml_backend_graph_compute(split_backend, &split->graph);
- //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+ if (!sched->callback_eval) {
+ ggml_backend_graph_compute(split_backend, &split->graph);
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+ } else {
+ // similar to ggml_backend_compare_graph_backend
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+ struct ggml_tensor * t = split->graph.nodes[j0];
+
+ // check if the user needs data from this node
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+ int j1 = j0;
+
+ // determine the range [j0, j1] of nodes that can be computed together
+ while (!need && j1 < split->graph.n_nodes - 1) {
+ t = split->graph.nodes[++j1];
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+ }
+
+ struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+ ggml_backend_graph_compute(split_backend, &gv);
+
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+ break;
+ }
+
+ j0 = j1;
+ }
+ }
uint64_t compute_end_us = ggml_time_us();
compute_us[split_backend_id] += compute_end_us - compute_start_us;
}
@@ -1431,6 +1583,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
sched_reset(sched);
}
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+ sched->callback_eval = callback;
+ sched->callback_eval_user_data = user_data;
+}
+
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
return sched->n_splits;
}
diff --git a/ggml-backend.h b/ggml-backend.h
index 4eb244af1d3..8b8160fcf66 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -17,12 +17,13 @@ extern "C" {
//
// buffer type
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
- GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
@@ -30,18 +31,19 @@ extern "C" {
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
- GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+ GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+ GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
@@ -54,12 +56,13 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+ GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@@ -80,13 +83,13 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
- GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
- GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+ GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
+ GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
// Create a backend buffer from an existing pointer
- GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -148,6 +151,14 @@ extern "C" {
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
+ // when ask == true, the scheduler wants to know if the user wants to observe this node
+ // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+ //
+ // when ask == false, the scheduler is passing the node tensor to the user for observation
+ // if the user returns false, the scheduler will cancel the graph compute
+ //
+ typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -168,6 +179,9 @@ extern "C" {
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+ // Set a callback to be called for each resulting node during graph compute
+ GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
//
// Utils
//
@@ -183,7 +197,7 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
- typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+ typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
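
The scheduler eval callback is driven in two phases: with ask == true the scheduler asks whether the user wants to observe a node (so it can batch uninteresting nodes into a single compute call), and with ask == false it hands over the computed tensor, aborting the rest of the graph if the callback returns false. A minimal sketch of a callback and its registration, assuming an existing ggml_backend_sched_t named sched; the callback name and the "attn" prefix filter are illustrative:

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative callback: observe every node whose name starts with "attn".
static bool observe_attn_nodes(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // phase 1: tell the scheduler whether this node's result should be observed
        return strncmp(t->name, "attn", 4) == 0;
    }
    // phase 2: the node has been computed on its backend; inspect it here
    fprintf(stderr, "computed %s: type=%s, ne0=%lld\n", t->name, ggml_type_name(t->type), (long long) t->ne[0]);
    return true; // return false here to cancel the remaining graph compute
}

// registration, e.g. right after creating the scheduler:
//   ggml_backend_sched_set_eval_callback(sched, observe_attn_nodes, NULL);
```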
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bd3814c72b4..949bc8a1c49 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -12,9 +12,10 @@
#include
#include