teleprint-me · pull · Oct 16, 2024 · Oct 9, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/README.md b/README.md
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
 **UI:**
 

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
@@ -611,7 +611,7 @@ class SchemaConverter {
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
     }
 
     /*

diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
@@ -540,7 +540,7 @@ def join_seq():
         return self._add_rule(
             name,
             to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
 
 
     def _resolve_ref(self, ref):

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
@@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        LOG_ERR("%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
         return NULL;
     }
 

diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
@@ -529,7 +529,7 @@ export class SchemaConverter {
       return joinSeq();
     };
 
-    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
+    return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
   }
 
   _notStrings(strings) {

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1090,22 +1090,21 @@ struct server_context {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
 
             const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
+            bool send_text = true;
 
             size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
             if (stop_pos != std::string::npos) {
-                is_stop_full = true;
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            } else {
-                is_stop_full = false;
+            } else if (slot.has_next_token) {
                 stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
+                send_text = stop_pos == std::string::npos;
             }
 
             // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+            if (send_text) {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                 slot.n_sent_text += result.text_to_send.size();

diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
+GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
@@ -348,7 +348,6 @@ struct tensor_alloc {
 };
 
 struct leaf_alloc {
-    int buffer_id;
     struct tensor_alloc leaf;
 };
 
@@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
@@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -557,14 +561,17 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_METAL
         register_backend(ggml_backend_metal_reg());
 #endif
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
 
-        // TODO: sycl, vulkan, kompute, cann
+        // TODO: sycl, kompute, cann
 
         register_backend(ggml_backend_cpu_reg());
     }
@@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    auto alloc_size = size;
+    if (alloc_size == 0) {
+        alloc_size = 1;
+    }
+
+    void * data = ggml_aligned_malloc(alloc_size);
+
     if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
         return NULL;
     }
 
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
 }
 
 static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
@@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
         for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_cann_buffer_type_interface,
+                /* .device   = */ nullptr,
                 /* .context  = */
                  new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
     ACL_CHECK(aclrtCreateEvent(&event));
 
     return new ggml_backend_event{
-        /* .backend = */ backend,
+        /* .device = */ nullptr,
         /* .context = */ event,
     };
 }
@@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
  *
  * @param event Pointer to the event structure to be recorded.
  */
-static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
+static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
     ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)event->backend->context;
-
+        (ggml_backend_cann_context*)backend->context;
     ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
 }
 
@@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
                                          ggml_backend_event_t event) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
-    if (ggml_backend_is_cann(event->backend)) {
+    if (ggml_backend_is_cann(backend)) {
         ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
                                        (aclrtEvent)event->context));
     } else {

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
-Original file line number
+Diff line change
@@ Expand Up / @@ -611,7 +611,7 @@ class SchemaConverter { @@
                 }
                 return join_seq();
             };
-            return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+            return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
         }
         /*
@@ Expand Down @@