diff --git a/bindings/ios b/bindings/ios index b5a163decd5..c9d5095f0c6 160000 --- a/bindings/ios +++ b/bindings/ios @@ -1 +1 @@ -Subproject commit b5a163decd5290a99806957905639c4456de97f5 +Subproject commit c9d5095f0c64455b201f1cd0b547efcf093ee7c3 diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 33ae03ae10f..e69bd51000c 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -9,6 +9,11 @@ static const std::map GGML_FTYPE_MAP = { {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, + {"q2_k", GGML_FTYPE_MOSTLY_Q2_K}, + {"q3_k", GGML_FTYPE_MOSTLY_Q3_K}, + {"q4_k", GGML_FTYPE_MOSTLY_Q4_K}, + {"q5_k", GGML_FTYPE_MOSTLY_Q5_K}, + {"q6_k", GGML_FTYPE_MOSTLY_Q6_K}, }; void ggml_print_ftypes(FILE * fp) { @@ -48,15 +53,15 @@ bool ggml_common_quantize_0( case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break; + case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break; + case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break; + case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break; + case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_UNKNOWN: case GGML_FTYPE_ALL_F32: case GGML_FTYPE_MOSTLY_F16: case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: { fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); return false; @@ -167,24 +172,17 @@ bool ggml_common_quantize_0( switch ((ggml_type) ttype) { case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data()); } break; case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -192,11 +190,6 @@ bool ggml_common_quantize_0( case GGML_TYPE_I16: case GGML_TYPE_I32: case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: case GGML_TYPE_COUNT: { diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index 925f9b610b1..aae392bdc00 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -686,8 +686,8 @@ int main(int argc, char ** argv) { } } - text_to_speak = ::replace(text_to_speak, "\"", ""); - int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str()); + text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'"); + int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str()); if (ret != 0) { fprintf(stderr, "%s: failed to speak\n", __func__); } diff --git a/whisper.cpp b/whisper.cpp index acedee0d08c..e2197ff262c 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -6073,7 +6073,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations // 1GB array - const size_t size = arr*1e9; + const size_t size = arr*1e6; // single-thread {