merge upstream #43

Merged: 80 commits, Oct 27, 2024
Changes from 1 commit

Commits (80)
c7499c5  examples : do not use common library in simple example (#9803)  (slaren, Oct 10, 2024)
cf8e0a3  musa: add docker image support (#9685)  (yeahdongcn, Oct 10, 2024)
0e9f760  rpc : add backend registry / device interfaces (#9812)  (slaren, Oct 10, 2024)
7eee341  common : use common_ prefix for common library functions (#9805)  (slaren, Oct 10, 2024)
9677640  ggml : move more prints to the ggml log system (#9839)  (slaren, Oct 11, 2024)
943d20b  musa : update doc (#9856)  (yeahdongcn, Oct 12, 2024)
11ac980  llama : improve infill support and special token detection (#9798)  (ggerganov, Oct 12, 2024)
95c76e8  server : remove legacy system_prompt feature (#9857)  (ggerganov, Oct 12, 2024)
1bde94d  server : remove self-extend features (#9860)  (ggerganov, Oct 12, 2024)
edc2656  server : add option to time limit the generation phase (#9865)  (ggerganov, Oct 12, 2024)
92be9f1  flake.lock: Update (#9870)  (ggerganov, Oct 13, 2024)
c7181bd  server : reuse cached context chunks (#9866)  (ggerganov, Oct 13, 2024)
d4c19c0  server : accept extra_context for the infill endpoint (#9874)  (ggerganov, Oct 13, 2024)
13dca2a  Vectorize load instructions in dmmv f16 CUDA kernel (#9816)  (agray3, Oct 14, 2024)
a89f75e  server : handle "logprobs" field with false value (#9871)  (VoidIsVoid, Oct 14, 2024)
4c42f93  readme : update bindings list (#9889)  (srgtuszy, Oct 15, 2024)
dcdd535  server : update preact (#9895)  (ggerganov, Oct 15, 2024)
fbc98b7  sampling : add XTC sampler (#9742)  (MaggotHATE, Oct 15, 2024)
223c25a  server : improve infill context reuse (#9894)  (ggerganov, Oct 15, 2024)
755a9b2  llama : add infill sampler (#9896)  (ggerganov, Oct 15, 2024)
becfd38  [CANN] Fix cann compilation error (#9891)  (leo-pony, Oct 16, 2024)
cd60b88  ggml-alloc : remove buffer_id from leaf_alloc (ggml/987)  (danbev, Oct 9, 2024)
0e41b30  sync : ggml  (ggerganov, Oct 16, 2024)
1f66b69  server : fix the disappearance of the end of the text (#9867)  (z80maniac, Oct 16, 2024)
10433e8  llama : add tensor name for "result_norm" (#9907)  (MollySophia, Oct 16, 2024)
66c2c93  grammar : fix JSON Schema for string regex with top-level alt. (#9903)  (jemc, Oct 16, 2024)
dbf18e4  llava : fix typo in error message [no ci] (#9884)  (danbev, Oct 16, 2024)
9e04102  llama : suppress conversion from 'size_t' to 'int' (#9046)  (danbev, Oct 16, 2024)
73afe68  fix: use `vm_allocate` to allocate CPU backend buffer on macOS (#9875)  (giladgd, Oct 16, 2024)
2194200  fix: allocating CPU buffer with size `0` (#9917)  (giladgd, Oct 16, 2024)
f010b77  vulkan : add backend registry / device interfaces (#9721)  (slaren, Oct 17, 2024)
3752217  readme : update bindings list (#9918)  (ShenghaiWang, Oct 17, 2024)
99bd4ac  llama : infill sampling handle very long tokens (#9924)  (ggerganov, Oct 17, 2024)
9f45fc1  llama : change warning to debug log  (ggerganov, Oct 17, 2024)
17bb928  readme : remove --memory-f32 references (#9925)  (ggerganov, Oct 17, 2024)
6f55bcc  llama : rename batch_all to batch (#8881)  (danbev, Oct 17, 2024)
8901755  server : add n_indent parameter for line indentation requirement (#9929)  (ggerganov, Oct 18, 2024)
60ce97c  add amx kernel for gemm (#8998)  (mingfeima, Oct 18, 2024)
87421a2  [SYCL] Add SYCL Backend registry, device and Event Interfaces (#9705)  (OuadiElfarouki, Oct 18, 2024)
afd9909  rpc : backend refactoring (#9912)  (rgerganov, Oct 18, 2024)
cda0e4b  llama : remove all_pos_0, all_pos_1, all_seq_id from llama_batch (#9745)  (ngxson, Oct 18, 2024)
7cab208  readme : update infra list (#9942)  (icppWorld, Oct 20, 2024)
45f0976  readme : update bindings list (#9951)  (lcarrere, Oct 20, 2024)
1db8c84  fix mul_mat_vec_q and *_vec_q error (#9939)  (NeoZhangJianyu, Oct 21, 2024)
bc21975  speculative : fix handling of some input params (#9963)  (ggerganov, Oct 21, 2024)
55e4778  llama : default sampling changes + greedy update (#9897)  (ggerganov, Oct 21, 2024)
d5ebd79  rpc : pack only RPC structs (#9959)  (rgerganov, Oct 21, 2024)
f594bc8  ggml : add asserts for type conversion in fattn kernels (#9971)  (ggerganov, Oct 21, 2024)
dbd5f2f  llama.vim : plugin for Neovim (#9787)  (ggerganov, Oct 21, 2024)
94008cc  arg : fix attention non-causal arg value hint (#9985)  (danbev, Oct 21, 2024)
994cfb1  readme : update UI list (#9972)  (a-ghorbani, Oct 21, 2024)
e01c67a  llama.vim : move info to the right of screen [no ci] (#9787)  (ggerganov, Oct 21, 2024)
e94a138  llama.vim : fix info text display [no ci] (#9787)  (ggerganov, Oct 21, 2024)
674804a  arg : fix typo in embeddings argument help [no ci] (#9994)  (danbev, Oct 22, 2024)
6b84473  [CANN] Adapt to dynamically loadable backends mechanism (#9970)  (leo-pony, Oct 22, 2024)
4ff7fe1  llama : add chat template for RWKV-World + fix EOT (#9968)  (MollySophia, Oct 22, 2024)
c421ac0  lora : warn user if new token is added in the adapter (#9948)  (ngxson, Oct 22, 2024)
11d4705  Rwkv chat template fix (#10001)  (MollySophia, Oct 22, 2024)
19d900a  llama : rename batch to ubatch (#9950)  (danbev, Oct 22, 2024)
c8c07d6  llama : fix empty batch causing llama_batch_allocr to crash (#9966)  (ngxson, Oct 22, 2024)
873279b  flake.lock: Update  (github-actions[bot], Oct 20, 2024)
4c9388f  metal : add POOL2D and fix IM2COL (#9943)  (junhee-yoo, Oct 23, 2024)
ac113a0  llama.vim : add classic vim support (#9995)  (m18coppola, Oct 23, 2024)
c19af0a  ggml : remove redundant set of contexts used field (ggml/978)  (danbev, Oct 16, 2024)
80273a3  CUDA: fix 1D im2col, add tests (ggml/993)  (JohannesGaessler, Oct 18, 2024)
2d3aba9  llama.vim : bump generation time limit to 3s [no ci]  (ggerganov, Oct 23, 2024)
190a37d  sync : ggml  (ggerganov, Oct 23, 2024)
0a1c750  server : samplers accept the prompt correctly (#10019)  (wwoodsTM, Oct 23, 2024)
c39665f  CUDA: fix MMQ for non-contiguous src0, add tests (#10021)  (JohannesGaessler, Oct 24, 2024)
167a515  CUDA: fix insufficient buffer clearing for MMQ (#10032)  (JohannesGaessler, Oct 24, 2024)
40f2555  ci : fix cmake flags for SYCL  (ggerganov, Oct 24, 2024)
958367b  server : refactor slot input data, move tokenizer to HTTP thread (#10…  (ngxson, Oct 24, 2024)
bc5ba00  server : check that the prompt fits in the slot's context (#10030)  (ggerganov, Oct 25, 2024)
2f8bd2b  llamafile : extend sgemm.cpp support for Q5_0 models (#10010)  (Srihari-mcw, Oct 25, 2024)
d80fb71  llama: string_split fix (#10022)  (Xarbirus, Oct 25, 2024)
ff252ea  llama : add DRY sampler (#9702)  (wwoodsTM, Oct 25, 2024)
6687503  metal : support permuted matrix multiplicaions (#10033)  (ggerganov, Oct 25, 2024)
9e4a256  scripts : fix amx sync [no ci]  (ggerganov, Oct 26, 2024)
8c60a8a  increase cuda_cpy block size (ggml/996)  (bssrdf, Oct 23, 2024)
cc2983d  sync : ggml  (ggerganov, Oct 26, 2024)
ggml : move more prints to the ggml log system (ggml-org#9839)
* ggml : move more prints to the ggml log system

* show BLAS OpenMP warnings in all builds using debug print
slaren authored Oct 11, 2024
commit 96776405a17034dcfd53d3ddf5d142d34bdbb657
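This commit replaces direct fprintf(stderr, ...) calls with the GGML_LOG_* macros, so messages are routed through ggml's log system instead of being written to stderr unconditionally. As rough orientation for the diff below, here is a minimal sketch of the callback-based pattern such a log system follows; the dispatcher names (my_log_set, my_log_internal, my_log_callback) are illustrative assumptions rather than the actual ggml internals, and only the GGML_LOG_DEBUG / GGML_LOG_ERROR macro names are taken from the diff itself.

#include <stdarg.h>
#include <stdio.h>

enum my_log_level { MY_LOG_LEVEL_DEBUG, MY_LOG_LEVEL_WARN, MY_LOG_LEVEL_ERROR };

typedef void (*my_log_callback)(enum my_log_level level, const char * text, void * user_data);

// default callback keeps the old behaviour: everything still goes to stderr
static void my_log_callback_default(enum my_log_level level, const char * text, void * user_data) {
    (void) level; (void) user_data;
    fputs(text, stderr);
}

static my_log_callback g_log_cb   = my_log_callback_default;
static void *          g_log_data = NULL;

// applications can install their own sink (hypothetical setter)
void my_log_set(my_log_callback cb, void * user_data) {
    g_log_cb   = cb ? cb : my_log_callback_default;
    g_log_data = user_data;
}

// format once, then hand the text and its severity to the installed callback
static void my_log_internal(enum my_log_level level, const char * fmt, ...) {
    char buf[1024];
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);
    g_log_cb(level, buf, g_log_data);
}

// the call sites in the diff below use macros of this shape instead of fprintf(stderr, ...)
#define GGML_LOG_DEBUG(...) my_log_internal(MY_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define GGML_LOG_ERROR(...) my_log_internal(MY_LOG_LEVEL_ERROR, __VA_ARGS__)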
34 changes: 17 additions & 17 deletions ggml/src/ggml-alloc.c
@@ -14,7 +14,7 @@

//#define GGML_ALLOCATOR_DEBUG

-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
-fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
}
}
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
-fprintf(stderr, "\n");
+GGML_LOG_DEBUG("\n");
}
#endif

@@ -768,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif

ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +825,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
return true;
}

if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
return true;
}
@@ -843,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph

if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
return true;
}
@@ -855,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
}
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
return true;
}
@@ -869,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
return false;
}
} else {
#ifndef NDEBUG
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
return false;
}
@@ -940,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}

if (this_size > max_size) {
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
@@ -1022,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

if (n_buffers == 0) {
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
32 changes: 16 additions & 16 deletions ggml/src/ggml-backend.cpp
@@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
@@ -571,7 +571,7 @@ struct ggml_backend_registry {

void register_backend(ggml_backend_reg_t reg) {
#ifndef NDEBUG
fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back(reg);
@@ -582,7 +582,7 @@ struct ggml_backend_registry {

void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
@@ -773,7 +773,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}

@@ -836,7 +836,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}

@@ -1459,7 +1459,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
}

#ifndef NDEBUG
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
#endif

@@ -1548,32 +1548,32 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
for (int i = 0; i < graph->n_nodes; i++) {
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
sched->splits[cur_split].n_inputs);
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
}
-fprintf(stderr, "\n");
+GGML_LOG_DEBUG("\n");
cur_split++;
}
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
-fprintf(stderr, "\n");
+GGML_LOG_DEBUG("\n");
}
}

@@ -2087,11 +2087,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
return false;
}
}
@@ -2485,7 +2485,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
struct ggml_context * ctx_unallocated = ggml_init(params);

if (ctx_allocated == NULL || ctx_unallocated == NULL) {
fprintf(stderr, "failed to allocate context for graph copy\n");
GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
@@ -2508,7 +2508,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
// allocate nodes
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) {
fprintf(stderr, "failed to allocate buffer for graph copy\n");
GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
8 changes: 4 additions & 4 deletions ggml/src/ggml-blas.cpp
@@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
/* .context = */ ctx,
};

#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
#endif

#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#endif

return backend;
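The ggml-blas.cpp hunk above implements the second part of the commit message: the OpenBLAS/BLIS warnings were previously compiled only when NDEBUG was not defined, and are now compiled into every build but emitted at debug level, so whether they appear depends on the installed log callback rather than on the build type. A minimal caller-side sketch, reusing the hypothetical my_log_set / my_log_level names from the earlier sketch (not a real ggml API), of how an application could choose to surface those debug-level messages in a release build:

#include <stdio.h>

// level-aware sink: surface debug-level messages (such as the OpenBLAS/OpenMP
// warning) even in a release build, and pass everything else through unchanged
static void app_log(enum my_log_level level, const char * text, void * user_data) {
    FILE * out = (FILE *) user_data;
    if (level == MY_LOG_LEVEL_DEBUG) {
        fprintf(out, "[debug] %s", text);
    } else {
        fputs(text, out);
    }
}

int main(void) {
    my_log_set(app_log, stderr);
    // ... backend initialization would go here; the warning now reaches app_log
    // regardless of whether the library itself was built with NDEBUG.
    return 0;
}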
22 changes: 11 additions & 11 deletions ggml/src/ggml-cuda.cu
@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
return;
}
}
-GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr));
pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr;
}
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_

if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif
}

if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif
}

@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
// Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}

@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (!ptr) {
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
} else {
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
} else {
graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
// clear the error
cudaGetLastError();

GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false;
}