Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pull] master from ggerganov:master #146

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)

**UI:**

Expand Down
2 changes: 1 addition & 1 deletion common/json-schema-to-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ class SchemaConverter {
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
}

/*
Expand Down
2 changes: 1 addition & 1 deletion examples/json_schema_to_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ def join_seq():
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")


def _resolve_ref(self, ref):
Expand Down
2 changes: 1 addition & 1 deletion examples/llava/llava.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_ERR("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: couldn't embed the image\n", __func__);
return NULL;
}

Expand Down
2 changes: 1 addition & 1 deletion examples/server/public/json-schema-to-grammar.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ export class SchemaConverter {
return joinSeq();
};

return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
}

_notStrings(strings) {
Expand Down
9 changes: 4 additions & 5 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,22 +1090,21 @@ struct server_context {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());

const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
bool send_text = true;

size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else {
is_stop_full = false;
} else if (slot.has_next_token) {
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
send_text = stop_pos == std::string::npos;
}

// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
if (send_text) {
// do not send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
Expand Down
2 changes: 2 additions & 0 deletions ggml/include/ggml-vulkan.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);

#ifdef __cplusplus
}
#endif
2 changes: 0 additions & 2 deletions ggml/src/ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,6 @@ struct tensor_alloc {
};

struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};

Expand Down Expand Up @@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
Expand Down
26 changes: 18 additions & 8 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-metal.h"
#endif

#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
Expand All @@ -557,14 +561,17 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif

// TODO: sycl, vulkan, kompute, cann
// TODO: sycl, kompute, cann

register_backend(ggml_backend_cpu_reg());
}
Expand Down Expand Up @@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) {

// backend CPU

static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment

static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
return "CPU";

Expand All @@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
ggml_aligned_free(buffer->context, buffer->size);
}

static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
Expand Down Expand Up @@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
}

static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
auto alloc_size = size;
if (alloc_size == 0) {
alloc_size = 1;
}

void * data = ggml_aligned_malloc(alloc_size);

if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
return NULL;
}

return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
}

static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
Expand Down
11 changes: 5 additions & 6 deletions ggml/src/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ nullptr,
/* .context = */
new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)},
Expand Down Expand Up @@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
ACL_CHECK(aclrtCreateEvent(&event));

return new ggml_backend_event{
/* .backend = */ backend,
/* .device = */ nullptr,
/* .context = */ event,
};
}
Expand All @@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
*
* @param event Pointer to the event structure to be recorded.
*/
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)event->backend->context;

(ggml_backend_cann_context*)backend->context;
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
}

Expand All @@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;

if (ggml_backend_is_cann(event->backend)) {
if (ggml_backend_is_cann(backend)) {
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ extern "C" {
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32

// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
Expand Down Expand Up @@ -196,6 +199,11 @@ struct ggml_cgraph {

struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

// Memory allocation

void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size);

#ifdef __cplusplus
}
#endif
Loading
Loading