From ad21c9e1f14d82b8c15ae369a8839019e3d498b4 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu
Date: Wed, 20 Nov 2024 13:54:25 +0800
Subject: [PATCH 01/32] update rel to 4040 (#10395)

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
---
 docs/backend/SYCL.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index e431f51f184cc..8d8312e9158f1 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI.
 
 The following release is verified with good quality:
 
-|Commit ID|Tag|Release|Verified Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+|Commit ID|Tag|Release|Verified Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
 
 ## News
 

From 1bacb9f62514b520bdf74ed6feb46c80508dad38 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Wed, 20 Nov 2024 01:11:00 -0600
Subject: [PATCH 02/32] vulkan: further optimize mul_mat_vec using larger loads (#10387)

* vulkan: Use pipeline_robustness to disable robustness in mul_mat_vec.

Add some early returns for nonexistent rows in mul_mat_vec shaders. These can only be hit when dispatching a 2D grid of workgroups. Fix the logic for the 2D grid of workgroups to round up.

Enable the pipeline robustness extension if it's available, and use it to disable robustness for these pipelines. The instructions to do the bounds checking contend for the same ALU resources as the bit twiddling dequant instructions.

* vulkan: Add GLSL structure aliases for quant types to allow larger loads

In Vulkan it's not possible to cast pointer types, so instead you have to declare an aliased binding for the memory with a different type. This commit adds aliases for the quant formats using 16b ints, and in a few places where the struct size is a multiple of 4 also using 32b ints.

Currently only q4_k's aliases are used, but others will be used in subsequent commits.

* vulkan: use larger loads in q5_k and q6_k shaders.

Similar to the optimization I did in q4_k recently, this vectorizes some loads and reduces the number of bit twiddling instructions.

* vulkan: use larger K step per iteration in mul_mat_vec.

Add vec4 dequantization functions, and use them to do K=8 per iteration in mul_mat_vec. This uses 16b loads for the quant values and 128b loads for B which helps reduce the load on the memory system.

The K_PER_ITER==2 logic is still there, just for F16/F32, and really only because they support unaligned sizes.

Tweak the num_iters/unrolling logic to be simpler and catch a couple missed unrolling opportunities.
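
For illustration, a minimal sketch of the aliased-binding pattern described above, with hypothetical names (QsU8, QsU32, dst, idx); the real shaders instead declare per-format packed structs such as block_q4_K_packed16 behind the A_TYPE/A_TYPE_PACKED16/A_TYPE_PACKED32 macros. The point is that the same binding is declared twice with different element types, so the widest load that fits the access pattern can be used:

    #version 450
    #extension GL_EXT_shader_explicit_arithmetic_types : require

    layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

    // Two views of the same set/binding. Vulkan GLSL cannot cast buffer
    // pointers, but several buffer blocks may alias one binding with
    // different element types.
    layout (binding = 0) readonly  buffer QsU8  { uint8_t  qs_u8[];  };
    layout (binding = 0) readonly  buffer QsU32 { uint32_t qs_u32[]; };
    layout (binding = 1) writeonly buffer Dst   { uint     dst[];    };

    void main() {
        const uint idx = gl_GlobalInvocationID.x;

        // One 32-bit load replaces four 8-bit loads of quantized values;
        // the individual bytes are then unpacked with bit twiddling.
        const uint packed = qs_u32[idx];
        const uint q0 =  packed        & 0xFF;
        const uint q1 = (packed >>  8) & 0xFF;
        const uint q2 = (packed >> 16) & 0xFF;
        const uint q3 = (packed >> 24) & 0xFF;

        dst[idx] = q0 + q1 + q2 + q3;
    }
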
--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 101 +++++++----- .../vulkan-shaders/dequant_funcs.comp | 48 ++++++ .../vulkan-shaders/mul_mat_vec.comp | 82 ++++++++-- .../vulkan-shaders/mul_mat_vec_base.comp | 3 + .../vulkan-shaders/mul_mat_vec_q2_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q3_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q4_k.comp | 34 ++--- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 144 +++++++++++------- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 69 ++++++--- .../src/ggml-vulkan/vulkan-shaders/types.comp | 113 +++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +- 11 files changed, 459 insertions(+), 149 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 21fee2f3d7533..ca71da2f7b7f5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -158,6 +158,7 @@ struct vk_device_struct { std::string name; uint64_t max_memory_allocation_size; bool fp16; + bool pipeline_robustness; vk::Device device; uint32_t vendor_id; vk_queue compute_queue; @@ -654,7 +655,7 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align) { +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align, bool disable_robustness) { VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT @@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin vk::PipelineCreateFlags(), pipeline_shader_create_info, pipeline->layout); + + vk::PipelineRobustnessCreateInfoEXT rci; + + if (device->pipeline_robustness && disable_robustness) { + rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + compute_pipeline_create_info.setPNext(&rci); + } + pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; { @@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); std::vector> compiles; - auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align) { + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, 
uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false) { { // wait until fewer than N compiles are in progress uint32_t N = std::max(1u, std::thread::hardware_concurrency()); @@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } compile_count++; } - compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align)); + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; if (device->fp16) { @@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) { // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0. ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, 
{device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), 
{1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, 
mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, 
mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; + bool pipeline_robustness = false; for 
(const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; } } @@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; + pl_robustness_features.pNext = nullptr; + pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; + pl_robustness_features.pipelineRobustness = VK_FALSE; + + if (pipeline_robustness) { + vk12_features.pNext = &pl_robustness_features; + device_extensions.push_back("VK_EXT_pipeline_robustness"); + } + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute @@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index d5b989735bc0b..5fc1ba4ad364b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -2,6 +2,15 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #endif +#include "types.comp" + +#if defined(A_TYPE_PACKED16) +layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; +#endif +#if defined(A_TYPE_PACKED32) +layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];}; +#endif + #if defined(DATA_A_F32) vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); @@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d; +} #endif #if defined(DATA_A_Q4_1) @@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(vui & 0xF, vui >> 4) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(vui & 0xF, (vui >> 4) & 0xF, 
(vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m; +} #endif #if defined(DATA_A_Q5_0) @@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0]; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d; +} #endif #if defined(DATA_A_Q5_1) @@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint uint_qh = data_a_packed16[a_offset + ib].qh; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m; +} #endif #if defined(DATA_A_Q8_0) @@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const float d = float(data_a[a_offset + ib].d); return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2]; + uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1]; + return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d; +} #endif #if defined(DATA_A_IQ4_NL) @@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d; +} #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 970aac6ef72cc..00807a060c46b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -3,7 +3,7 @@ #ifdef FLOAT16 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types : require #include "mul_mat_vec_base.comp" @@ -12,16 +12,48 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout 
(constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; +#if !defined(DATA_A_F32) && !defined(DATA_A_F16) +#define K_PER_ITER 8 +#else +#define K_PER_ITER 2 +#endif + + uint a_offset, b_offset, d_offset, y_offset; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter) { - const uint col = i*BLOCK_SIZE + 2*tid; + const uint col = i*BLOCK_SIZE + K_PER_ITER*tid; const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index +#if K_PER_ITER == 8 +#if QUANT_R == 2 + B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x); + FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y); + FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y); + FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z); + FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z); + FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w); + FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w); +#else + B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y); + FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z); + FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w); + FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x); + FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y); + FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z); + FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w); +#endif +#else // Check if the second of the pair of elements is OOB, and don't fetch B or // accumulate it. We still fetch a pair of elements for A, which is fine for // quantized formats since they'll be within the same block. We should @@ -34,9 +66,24 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); } +#endif [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index +#if K_PER_ITER == 8 + const vec4 v = dequantize4(ib, iqs, a_offset); + const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset); + + // matrix multiplication + temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]); +#else const vec2 v = dequantize(ib, iqs, a_offset); // matrix multiplication @@ -44,6 +91,7 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); } +#endif } } @@ -61,22 +109,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { temp[i] = FLOAT_TYPE(0); } - const int unroll_count = 8; - - const uint num_iters = (p.ncols >= 2*tid) ? 
((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0; - const uint unrolled_iters = num_iters & ~(2*unroll_count - 1); + uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE); + if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) { + num_iters++; + } + int unroll_count = 4; + uint unrolled_iters = num_iters & ~(unroll_count - 1); uint i = 0; while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { - iter(temp, first_row, num_rows, tid, i, false); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; + } + } + unroll_count = 2; + unrolled_iters = num_iters & ~(unroll_count - 1); + while (i < unrolled_iters) { + // Manually partially unroll the loop + [[unroll]] for (uint k = 0; k < unroll_count; ++k) { + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; } } while (i < num_iters) { - iter(temp, first_row, num_rows, tid, i, true); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true); + i++; } // sum up partial sums and write back result @@ -106,6 +165,9 @@ void main() { if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); } else { + if (first_row >= p.stride_d) { + return; + } compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 5920bc93641c8..8d0a5791374d3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -12,6 +12,9 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];}; +layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; + layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index ec8eadcd5828a..e2625d32b902d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 3ca4ad85a5ca0..a28804533c320 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index b7c9b722d126b..5846f2e86f3e3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -8,30 +8,14 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; shared FLOAT_TYPE tmp[32]; -// 
Declare aliased versions of A and B bindings that can use 16b/32b loads for -// the quantized values, and vec4 loads for B. -struct block_q4_K_u32 -{ - f16vec2 d; - uint32_t scales[3*QUANT_K/64/4]; - uint32_t qs[QUANT_K/2/4]; -}; - -struct block_q4_K_u16 -{ - f16vec2 d; - uint16_t scales[3*QUANT_K/64/2]; - uint16_t qs[QUANT_K/2/2]; -}; - -layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];}; -layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];}; -layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; - // This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -64,9 +48,9 @@ void main() { const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4]; + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; uvec4 scale0 = uvec4(unpack8(scale0_u32)); uvec4 scale4 = uvec4(unpack8(scale4_u32)); uvec4 scale8 = uvec4(unpack8(scale8_u32)); @@ -80,8 +64,8 @@ void main() { const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4]; - uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16]; + uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; + uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index 2306785af4226..22a6bfae4fb61 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -31,70 +37,106 @@ void main() { const uint8_t hm1 = uint8_t(1 << (2*v_im)); const uint8_t hm2 = uint8_t(hm1 << 4); - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; - const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y); - - const uint8_t sc0 = uint8_t( data_a[ib0 + i].scales[v_im * 2 ] & 0x3f); - const uint8_t sc1 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 1] & 0x3f); - const uint8_t sc2 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 4] & 0x3f); - const uint8_t sc3 = uint8_t( data_a[ib0 + 
i].scales[v_im * 2 + 5] & 0x3f); - const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 ] & 0xc0) >> 2)); - const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2)); - const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2)); - const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2)); - - const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); - const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); - const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf); - const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf); - const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4); - const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4); - const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] >> 4); - const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] >> 4); - const uint8_t q4_8 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf); - const uint8_t q4_9 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf); - const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf); - const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf); - const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); - const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4); - const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4); + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + uvec4 scale0 = uvec4(unpack8(scale0_u32)); + uvec4 scale4 = uvec4(unpack8(scale4_u32)); + uvec4 scale8 = uvec4(unpack8(scale8_u32)); + + const uint32_t sc0 = ( scale0.x & 0x3f); + const uint32_t sc1 = ( scale0.y & 0x3f); + const uint32_t sc2 = ( scale4.x & 0x3f); + const uint32_t sc3 = ( scale4.y & 0x3f); + const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); + const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); + const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); + const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); + + uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); + + uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; + uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; + uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; + + uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); + uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); + uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); + uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); + + const uint32_t q4_0 = qs0_16_lo4.x; + const uint32_t q4_1 = qs0_16_lo4.y; + const uint32_t q4_2 = qs0_16_lo4.z; + const uint32_t q4_3 = 
qs0_16_lo4.w; + const uint32_t q4_4 = qs0_16_hi4.x; + const uint32_t q4_5 = qs0_16_hi4.y; + const uint32_t q4_6 = qs0_16_hi4.z; + const uint32_t q4_7 = qs0_16_hi4.w; + const uint32_t q4_8 = qs64_80_lo4.x; + const uint32_t q4_9 = qs64_80_lo4.y; + const uint32_t q4_10 = qs64_80_lo4.z; + const uint32_t q4_11 = qs64_80_lo4.w; + const uint32_t q4_12 = qs64_80_hi4.x; + const uint32_t q4_13 = qs64_80_hi4.y; + const uint32_t q4_14 = qs64_80_hi4.z; + const uint32_t q4_15 = qs64_80_hi4.w; + + B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2]; + B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8]; + B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16]; + B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24]; + B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2]; + B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8]; + B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16]; + B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24]; + + uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2]; + uint32_t qh1 = qh0 >> 8; + uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8]; + uint32_t qh17 = qh16 >> 8; const FLOAT_TYPE sx = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)), + FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0))))); const FLOAT_TYPE sy = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE sz = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by20.x), (q4_8 + (((qh0 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by20.y), (q4_9 + (((qh1 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)), + FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0))))); const FLOAT_TYPE sw = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 
16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE smin = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2, - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3, - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6, - (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7))); - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx])); + fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, + fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, + fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, + (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); + temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 95c286eeb17e1..0b392d68d04d0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -36,41 +42,66 @@ void main() { const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); -#if K_QUANTS_PER_ITERATION == 1 - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * 
FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx])))))))); -#else + FLOAT_TYPE scales[4]; + scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); + scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); + scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); + scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); + + uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + + uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; + uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; + uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + uvec4 q0 = uvec4(unpack8(q0_u32)); + uvec4 q1 = uvec4(unpack8(q1_u32)); + uvec4 q2 = uvec4(unpack8(q2_u32)); + uvec4 q3 = uvec4(unpack8(q3_u32)); + + B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; + B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; + B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; + B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { - sum = 
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum)))); + sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), + fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), + fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), + fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); } - tmp[16 * ix + tid] += sum; -#endif + temp += sum * d; } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; - } + } barrier(); } if (tid == 0) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 21dce72fc7dfb..7a34820bc2789 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -1,6 +1,8 @@ -#if !defined(DATA_A_F32) && !defined(DATA_A_F16) -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#endif + +#if !defined(GGML_TYPES_COMP) +#define GGML_TYPES_COMP + +#extension GL_EXT_shader_explicit_arithmetic_types : require #if defined(DATA_A_F32) #define QUANT_K 1 @@ -38,8 +40,14 @@ struct block_q4_0 float16_t d; uint8_t qs[16]; }; +struct block_q4_0_packed16 +{ + float16_t d; + uint16_t qs[16/2]; +}; #define A_TYPE block_q4_0 +#define A_TYPE_PACKED16 block_q4_0_packed16 #endif #if defined(DATA_A_Q4_1) @@ -54,7 +62,15 @@ struct block_q4_1 uint8_t qs[16]; }; +struct block_q4_1_packed16 +{ + float16_t d; + float16_t m; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q4_1 +#define A_TYPE_PACKED16 block_q4_1_packed16 #endif #if defined(DATA_A_Q5_0) @@ -70,7 +86,15 @@ struct block_q5_0 uint8_t qs[16]; }; +struct block_q5_0_packed16 +{ + float16_t d; + uint16_t qh[2]; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_0 +#define A_TYPE_PACKED16 block_q5_0_packed16 #endif #if defined(DATA_A_Q5_1) @@ -87,7 +111,16 @@ struct block_q5_1 uint8_t qs[16]; }; +struct block_q5_1_packed16 +{ + float16_t d; + float16_t m; + uint qh; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_1 +#define A_TYPE_PACKED16 block_q5_1_packed16 #endif #if defined(DATA_A_Q8_0) @@ -100,8 +133,14 @@ struct block_q8_0 float16_t d; int8_t qs[32]; }; +struct block_q8_0_packed16 +{ + float16_t d; + uint16_t qs[32/2]; +}; #define A_TYPE block_q8_0 +#define A_TYPE_PACKED16 block_q8_0_packed16 #endif // K-quants @@ -116,7 +155,23 @@ struct block_q2_K f16vec2 d; }; +struct block_q2_K_packed16 +{ + uint16_t scales[QUANT_K/16/2]; + uint16_t qs[QUANT_K/4/2]; + f16vec2 d; +}; + +struct block_q2_K_packed32 +{ + uint32_t scales[QUANT_K/16/4]; + uint32_t 
qs[QUANT_K/4/4]; + f16vec2 d; +}; + #define A_TYPE block_q2_K +#define A_TYPE_PACKED16 block_q2_K_packed16 +#define A_TYPE_PACKED32 block_q2_K_packed32 #endif #if defined(DATA_A_Q3_K) @@ -131,7 +186,16 @@ struct block_q3_K float16_t d; }; +struct block_q3_K_packed16 +{ + uint16_t hmask[QUANT_K/8/2]; + uint16_t qs[QUANT_K/4/2]; + uint16_t scales[12/2]; + float16_t d; +}; + #define A_TYPE block_q3_K +#define A_TYPE_PACKED16 block_q3_K_packed16 #endif #if defined(DATA_A_Q4_K) @@ -145,7 +209,23 @@ struct block_q4_K uint8_t qs[QUANT_K/2]; }; +struct block_q4_K_packed16 +{ + f16vec2 d; + uint16_t scales[3*QUANT_K/64/2]; + uint16_t qs[QUANT_K/2/2]; +}; + +struct block_q4_K_packed32 +{ + f16vec2 d; + uint32_t scales[3*QUANT_K/64/4]; + uint32_t qs[QUANT_K/2/4]; +}; + #define A_TYPE block_q4_K +#define A_TYPE_PACKED16 block_q4_K_packed16 +#define A_TYPE_PACKED32 block_q4_K_packed32 #endif #if defined(DATA_A_Q5_K) @@ -160,7 +240,16 @@ struct block_q5_K uint8_t qs[QUANT_K/2]; }; +struct block_q5_K_packed16 +{ + f16vec2 d; + uint16_t scales[12/2]; + uint16_t qh[QUANT_K/8/2]; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_q5_K +#define A_TYPE_PACKED16 block_q5_K_packed16 #endif #if defined(DATA_A_Q6_K) @@ -175,7 +264,16 @@ struct block_q6_K float16_t d; }; +struct block_q6_K_packed16 +{ + uint16_t ql[QUANT_K/2/2]; + uint16_t qh[QUANT_K/4/2]; + int8_t scales[QUANT_K/16]; + float16_t d; +}; + #define A_TYPE block_q6_K +#define A_TYPE_PACKED16 block_q6_K_packed16 #endif // IQuants @@ -191,10 +289,19 @@ struct block_iq4_nl uint8_t qs[QUANT_K/2]; }; +struct block_iq4_nl_packed16 +{ + float16_t d; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_iq4_nl +#define A_TYPE_PACKED16 block_iq4_nl_packed16 const int8_t kvalues_iq4nl[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; #endif + +#endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index fe3e4cb39ff47..f753109556f93 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -317,10 +317,10 @@ void process_shaders() { std::string data_a_key = "DATA_A_" + to_uppercase(tname); std::string shader = (string_ends_with(tname, "_k")) ? 
"mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp"; - string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); // Dequant shaders if (tname != "f16") { From 8fd4b7fa29c3061b2e02e897d818dfcbc593430a Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 01:40:18 -0600 Subject: [PATCH 03/32] vulkan: copy iq4_nl LUT into shared memory (#10409) --- .../ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp | 2 ++ .../ggml-vulkan/vulkan-shaders/get_rows_quant.comp | 4 ++++ .../src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 13 ++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +++--- 6 files changed, 29 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp index 34ef3da30b82c..8de14fc03f102 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp @@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; + init_iq4nl_shmem(); + const uint tid = gl_LocalInvocationID.x % 64; const uint il = tid/32; const uint ir = tid%32; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp index 8d30b63c165b2..7f608315b6844 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp @@ -12,6 +12,10 @@ void main() { const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + if (i00 >= p.ne00) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 00807a060c46b..2d5b8e4661312 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -161,6 +161,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { void main() { const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + // do NUM_ROWS at a time, unless there aren't 
enough remaining rows if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index fffdd18189d55..2ff5c430519bc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072]; #endif void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + #ifdef MUL_MAT_ID const uint expert_idx = gl_GlobalInvocationID.z; #else diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 7a34820bc2789..bc28e0ab857aa 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -298,10 +298,21 @@ struct block_iq4_nl_packed16 #define A_TYPE block_iq4_nl #define A_TYPE_PACKED16 block_iq4_nl_packed16 -const int8_t kvalues_iq4nl[16] = { +const int8_t kvalues_iq4nl_const[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; + +shared FLOAT_TYPE kvalues_iq4nl[16]; + +void init_iq4nl_shmem() +{ + // copy the table into shared memory and sync + if (gl_LocalInvocationIndex.x < 16) { + kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]); + } + barrier(); +} #endif #endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index f753109556f93..6bbe8e96edd9f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -331,11 +331,11 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16") ? 
"get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } } From fab5d30ff6729ff6ff615c41e8c0215d6bc30393 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 20 Nov 2024 12:57:53 +0100 Subject: [PATCH 04/32] llama : add .clang-format file (#10415) --- .clang-format | 161 ++++++ examples/llama-bench/llama-bench.cpp | 702 +++++++++++++++------------ 2 files changed, 551 insertions(+), 312 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000000..45232b80ed8cd --- /dev/null +++ b/.clang-format @@ -0,0 +1,161 @@ +--- +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: AcrossComments +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveDeclarations: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never 
+ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReferenceAlignment: Middle +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... 
+ diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8f4e0e206a27e..3dc84a75cbec7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -6,28 +6,28 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include -#include #include +#include +#include "common.h" #include "ggml.h" #include "llama.h" -#include "common.h" #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include #endif // utils @@ -36,8 +36,7 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } -template -static std::string join(const std::vector & values, const std::string & delim) { +template static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { str << values[i]; @@ -48,38 +47,35 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector transform_to_str(const std::vector & values, F f) { +template static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); return str_values; } -template -static T avg(const std::vector & v) { +template static T avg(const std::vector & v) { if (v.empty()) { return 0; } T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T)v.size(); + return sum / (T) v.size(); } -template -static T stdev(const std::vector & v) { +template static T stdev(const std::vector & v) { if (v.size() <= 1) { return 0; } - T mean = avg(v); + T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); return stdev; } static std::string get_cpu_info() { std::vector cpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { cpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -90,8 +86,8 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::vector gpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) { gpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -100,17 +96,24 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL}; +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; static const char * output_format_str(output_formats format) { switch (format) { - case NONE: return "none"; - case CSV: return "csv"; - case JSON: return "json"; - case JSONL: return "jsonl"; - case MARKDOWN: return "md"; - case SQL: return "sql"; - default: GGML_ABORT("invalid output format"); + 
case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); } } @@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ABORT("invalid split mode"); + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); } } @@ -149,59 +156,59 @@ static std::string pair_str(const std::pair & p) { } struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; + std::vector model; + std::vector n_prompt; + std::vector n_gen; std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector cpu_mask; - std::vector cpu_strict; - std::vector poll; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - ggml_sched_priority prio; - int delay; - bool verbose; - bool progress; - output_formats output_format; - output_formats output_format_stderr; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector rpc_servers; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* cpu_mask */ {"0x0"}, - /* cpu_strict */ {false}, - /* poll */ {50}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* rpc_servers */ { "" }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* use_mmap */ { true }, + 
/* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) { printf("options:\n"); printf(" -h, --help\n"); printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); if (llama_supports_rpc()) { - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + printf(" -rpc, --rpc (default: %s)\n", + join(cmd_params_defaults.rpc_servers, ",").c_str()); } - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu 
(default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -o, --output (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); printf("\n"); - printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + printf( + "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " + "multiple times.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) { return GGML_TYPE_COUNT; } - static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; - - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - params.prio = cmd_params_defaults.prio; - params.delay = cmd_params_defaults.delay; - params.progress = cmd_params_defaults.progress; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = 
string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } + /**/ if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { @@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / - const std::regex regex{R"([;/]+)"}; - std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; - std::vector split_arg{it, {}}; + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); @@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } // set defaults - if (params.model.empty()) { params.model = cmd_params_defaults.model; } - if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } - if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } - if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } - if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } - if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } - if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } - if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } - if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } - if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } - if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } - if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } - if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } - if (params.embeddings.empty()) { 
params.embeddings = cmd_params_defaults.embeddings; } - if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } - if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } - if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } - if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.rpc_servers.empty()) { + params.rpc_servers = cmd_params_defaults.rpc_servers; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } return params; } struct cmd_params_instance { - std::string model; - int n_prompt; - int n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - int n_gpu_layers; - std::string rpc_servers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; + std::string model; + int n_prompt; + int n_gen; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + std::string rpc_servers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; std::vector tensor_split; - bool use_mmap; - bool embeddings; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -624,35 +699,31 @@ struct cmd_params_instance { if (!rpc_servers.empty()) { mparams.rpc_servers = rpc_servers.c_str(); } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; + mparams.use_mmap = use_mmap; return mparams; } bool equal_mparams(const 
cmd_params_instance & other) const { - return model == other.model && - n_gpu_layers == other.n_gpu_layers && - rpc_servers == other.rpc_servers && - split_mode == other.split_mode && - main_gpu == other.main_gpu && - use_mmap == other.use_mmap && + return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers && + split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split; } llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; return cparams; } @@ -662,6 +733,7 @@ static std::vector get_cmd_params_instances(const cmd_param std::vector instances; // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) for (const auto & rpc : params.rpc_servers) @@ -767,100 +839,94 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } + // clang-format on return instances; } struct test { static const std::string build_commit; - static const int build_number; + static const int build_number; static const std::string cpu_info; static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector tensor_split; + bool use_mmap; + bool embeddings; + int n_prompt; + int n_gen; + std::string test_time; + std::vector samples_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); + model_type = buf; + model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - cpu_mask = inst.cpu_mask; - cpu_strict = inst.cpu_strict; - poll = inst.poll; - type_k = inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - 
use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; // RFC 3339 date-time format - time_t t = time(NULL); + time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); test_time = buf; (void) ctx; } - uint64_t avg_ns() const { - return ::avg(samples_ns); - } + uint64_t avg_ns() const { return ::avg(samples_ns); } - uint64_t stdev_ns() const { - return ::stdev(samples_ns); - } + uint64_t stdev_ns() const { return ::stdev(samples_ns); } std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + int n_tokens = n_prompt + n_gen; std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } - double avg_ts() const { - return ::avg(get_ts()); - } + double avg_ts() const { return ::avg(get_ts()); } - double stdev_ts() const { - return ::stdev(get_ts()); - } + double stdev_ts() const { return ::stdev(get_ts()); } static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - auto * reg = ggml_backend_reg_get(i); + auto * reg = ggml_backend_reg_get(i); std::string name = ggml_backend_reg_name(reg); if (name != "CPU") { backends.push_back(ggml_backend_reg_name(reg)); @@ -871,36 +937,27 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", - "cpu_info", "gpu_info", "backends", - "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_ubatch", - "n_threads", "cpu_mask", "cpu_strict", "poll", - "type_k", "type_v", - "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", "flash_attn", - "tensor_split", "use_mmap", "embeddings", - "n_prompt", "n_gen", "test_time", - "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts", }; return fields; } - enum field_type {STRING, BOOL, INT, FLOAT}; + enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || field == "poll" || - field == "model_size" || field == "model_n_params" || - field == "n_gpu_layers" || field == "main_gpu" || - field == "n_prompt" || field == "n_gen" || - field == "avg_ns" || field == "stddev_ns") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || 
+ field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || + field == "stddev_ns") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || - field == "cpu_strict" || - field == "flash_attn" || field == "use_mmap" || field == "embeddings") { + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -911,7 +968,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; - int max_nonzero = 0; + int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; @@ -925,29 +982,47 @@ struct test { tensor_split_str += "/"; } } - std::vector values = { - build_commit, std::to_string(build_number), - cpu_info, gpu_info, get_backend(), - model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), - ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), - tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), std::to_string(stdev_ns()), - std::to_string(avg_ts()), std::to_string(stdev_ts()) - }; + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + tensor_split_str, + std::to_string(use_mmap), + std::to_string(embeddings), + std::to_string(n_prompt), + std::to_string(n_gen), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; return values; } std::map get_map() const { std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), - std::inserter(map, map.end()), std::make_pair); + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); return map; } }; @@ -961,9 +1036,12 @@ struct printer { virtual ~printer() {} FILE * fout; + virtual void print_header(const cmd_params & params) { (void) params; } + virtual void print_test(const test & t) = 0; - virtual void print_footer() { } + + virtual void print_footer() {} }; struct csv_printer : public printer { @@ -979,7 +1057,7 @@ struct csv_printer : public printer { return escaped; } - void print_header(const cmd_params & params) override { + void print_header(const cmd_params & params) override { std::vector fields = test::get_fields(); fprintf(fout, "%s\n", join(fields, ",").c_str()); (void) params; @@ 
-992,7 +1070,6 @@ struct csv_printer : public printer { } }; - static std::string escape_json(const std::string & value) { std::string escaped; for (auto c : value) { @@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) { escaped += "\\\""; } else if (c == '\\') { escaped += "\\\\"; - } else if (c <= 0x1f) { + } else if (c <= 0x1f) { char buf[8]; snprintf(buf, sizeof(buf), "\\u%04x", c); escaped += buf; @@ -1033,7 +1110,8 @@ struct json_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); } } @@ -1051,12 +1129,9 @@ struct json_printer : public printer { fflush(fout); } - void print_footer() override { - fprintf(fout, "\n]\n"); - } + void print_footer() override { fprintf(fout, "\n]\n"); } }; - struct jsonl_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); @@ -1116,7 +1191,7 @@ struct markdown_printer : public printer { return 13; } - int width = std::max((int)field.length(), 10); + int width = std::max((int) field.length(), 10); if (test::get_field_type(field) == test::STRING) { return -width; @@ -1230,18 +1305,18 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; - char buf[128]; + char buf[128]; if (field == "model") { value = t.model_type; } else if (field == "size") { - if (t.model_size < 1024*1024*1024) { + if (t.model_size < 1024 * 1024 * 1024) { snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); } else { snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); } value = buf; } else if (field == "params") { - if (t.model_n_params < 1000*1000*1000) { + if (t.model_n_params < 1000 * 1000 * 1000) { snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); } else { snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); @@ -1303,7 +1378,8 @@ struct sql_printer : public printer { std::vector fields = test::get_fields(); fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); } fprintf(fout, ");\n"); fprintf(fout, "\n"); @@ -1324,8 +1400,8 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); std::vector tokens(n_batch); @@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? 
llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } @@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; @@ -1411,7 +1487,7 @@ int main(int argc, char ** argv) { set_process_priority(params.prio); // initialize printer - std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); if (p) { @@ -1426,13 +1502,13 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - int params_idx = 0; + int params_idx = 0; auto params_count = params_instances.size(); for (const auto & inst : params_instances) { - params_idx ++; + params_idx++; if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } @@ -1475,7 +1551,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } From f95caa79546271722ada703da20ffb1cfcd21fed Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:22:19 -0400 Subject: [PATCH 05/32] cmake: add link dependencies to cmake find pkg (#10433) * cmake pkg: find accelerate, openmp, memkind libs * cmake pkg: find BLAS libs * try BLAS_LIBRARIES instead * Add BLAS link opts * Add more link deps. 
and set GGML_ vars --- cmake/llama-config.cmake.in | 136 +++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 28a8c18b65176..5c55bc6b822a6 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,12 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(GGML_STATIC @GGML_STATIC@) +set(GGML_NATIVE @GGML_NATIVE@) +set(GGML_LTO @GGML_LTO@) +set(GGML_CCACHE @GGML_CCACHE@) +set(GGML_AVX @GGML_AVX@) +set(GGML_AVX2 @GGML_AVX2@) +set(GGML_AVX512 @GGML_AVX512@) +set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@) +set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@) +set(GGML_AVX512_BF16 @GGML_AVX512_BF16@) +set(GGML_AMX_TILE @GGML_AMX_TILE@) +set(GGML_AMX_INT8 @GGML_AMX_INT8@) +set(GGML_AMX_BF16 @GGML_AMX_BF16@) +set(GGML_FMA @GGML_FMA@) +set(GGML_LASX @GGML_LASX@) +set(GGML_LSX @GGML_LSX@) +set(GGML_RVV @GGML_RVV@) +set(GGML_SVE @GGML_SVE@) + set(GGML_ACCELERATE @GGML_ACCELERATE@) +set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_CPU_HBM @GGML_CPU_HBM@) +set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@) + +set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@) +set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@) +set(GGML_CUDA_F16 @GGML_CUDA_F16@) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@) +set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@) +set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@) +set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@) +set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@) + +set(GGML_HIP_UMA @GGML_HIP_UMA@) + set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) -set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) -set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) -set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) +set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) +set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@) +set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@) +set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) +set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@) + +set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@) +set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@) +set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@) +set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@) +set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@) +set(GGML_METAL_STD @GGML_METAL_STD@) + +set(GGML_SYCL_F16 @GGML_SYCL_F16@) +set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@) +set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@) + @PACKAGE_INIT@ @@ -20,6 +68,7 @@ find_package(Threads REQUIRED) set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") set(_llama_link_deps "") +set(_llama_link_opts "") foreach(_ggml_lib ggml ggml-base) string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") find_library(${_ggml_lib_var} ${_ggml_lib} @@ -49,41 +98,63 @@ foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) endif() endforeach() -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) -endif() +if (NOT LLAMA_SHARED_LIB) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK}) + endif() -if (GGML_BLAS) - find_package(BLAS REQUIRED) -endif() + if (GGML_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() -if (GGML_CUDA) 
- find_package(CUDAToolkit REQUIRED) -endif() + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + list(APPEND _llama_link_deps memkind) + endif() -if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) -endif() + if (GGML_BLAS) + find_package(BLAS REQUIRED) + list(APPEND _llama_link_deps ${BLAS_LIBRARIES}) + list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS}) + endif() -if (GGML_VULKAN) - find_package(Vulkan REQUIRED) -endif() + if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) + endif() -if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) -endif() + if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() -if (GGML_SYCL) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) -endif() + if (GGML_VULKAN) + find_package(Vulkan REQUIRED) + list(APPEND _llama_link_deps Vulkan::Vulkan) + endif() -if (GGML_OPENMP) - find_package(OpenMP REQUIRED) + if (GGML_HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas) + endif() + + if (GGML_SYCL) + find_package(DNNL) + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND _llama_link_deps DNNL::dnnl) + endif() + if (WIN32) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + endif() + endif() endif() find_library(llama_LIBRARY llama @@ -97,6 +168,7 @@ set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + INTERFACE_LINK_OPTIONS "${_llama_link_opts}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" From 9abe9eeae98b11fa93b82632b264126a010225ff Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 13:47:36 -0600 Subject: [PATCH 06/32] vulkan: predicate max operation in soft_max shaders/soft_max (#10437) Fixes #10434 --- ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index f9727679ec5f2..6e20b6411cadc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -73,7 +73,9 @@ void soft_max(uint num_iters) { FLOAT_TYPE v = a * p.scale + slope * b; - max_val = max(max_val, v); + if (col < p.KX) { + max_val = max(max_val, v); + } if (idx < DATA_CACHE_SIZE) { data_cache[idx] = v; From 02e4eaf22f229a114054b053a9eff61483653670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 20 Nov 2024 14:56:04 +0100 Subject: [PATCH 07/32] ggml-opt: fix data corruption (ggml/1022) --- ggml/src/ggml-backend.cpp | 2 + ggml/src/ggml-impl.h | 3 + ggml/src/ggml-opt.cpp | 147 +++++++++++++++++-------------------- ggml/src/ggml.c | 94 ++++++++++++++---------- tests/test-backend-ops.cpp | 1 - 5 files changed, 129 insertions(+), 118 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 
9dcde8d11952a..3433d082e775b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { @@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 92a64fe5a5930..3965be78755a1 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -295,6 +295,9 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; }; +// returns a slice of cgraph with nodes [i0, i1) +// the slice does not have leafs or gradients +// if you need the gradients, get them from the original graph struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); // Memory allocation diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index 040205a31bab0..7c3e24103a250 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -14,51 +14,51 @@ #include struct ggml_opt_dataset { - struct ggml_context * ctx; - ggml_backend_buffer_t buf; - struct ggml_tensor * data; - struct ggml_tensor * labels; + struct ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + struct ggml_tensor * data = nullptr; + struct ggml_tensor * labels = nullptr; - int64_t ndata; - int64_t ndata_shard; - size_t nbs_data; - size_t nbs_labels; + int64_t ndata = -1; + int64_t ndata_shard = -1; + size_t nbs_data = -1; + size_t nbs_labels = -1; std::vector permutation; }; struct ggml_opt_context { - ggml_backend_sched_t backend_sched; - ggml_cgraph * allocated_graph; - ggml_cgraph * allocated_graph_copy; - struct ggml_context * ctx_static; - struct ggml_context * ctx_static_cpu; - struct ggml_context * ctx_compute; - struct ggml_context * ctx_copy; - ggml_backend_buffer_t buf_static; - ggml_backend_buffer_t buf_static_cpu; + ggml_backend_sched_t backend_sched = nullptr; + ggml_cgraph * allocated_graph = nullptr; + ggml_cgraph * allocated_graph_copy = nullptr; + struct ggml_context * ctx_static = nullptr; + struct ggml_context * ctx_static_cpu = nullptr; + struct ggml_context * ctx_compute = nullptr; + struct ggml_context * ctx_copy = nullptr; + ggml_backend_buffer_t buf_static = nullptr; + ggml_backend_buffer_t buf_static_cpu = nullptr; std::mt19937 rng; - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; - struct ggml_tensor * labels; + struct ggml_tensor * inputs = nullptr; + struct ggml_tensor * outputs = nullptr; + struct ggml_tensor * labels = nullptr; - struct ggml_tensor * loss; - struct ggml_tensor * pred; - struct ggml_tensor * ncorrect; + struct ggml_tensor * loss = nullptr; + struct ggml_tensor * pred = nullptr; + struct ggml_tensor * ncorrect = nullptr; - struct ggml_cgraph * gf; - struct ggml_cgraph * gb_grad; - struct ggml_cgraph * gb_opt; + struct ggml_cgraph * gf = nullptr; + struct ggml_cgraph * gb_grad = nullptr; + struct ggml_cgraph * gb_opt = nullptr; - int64_t iter; - int32_t opt_period; - int32_t opt_i; - bool loss_per_datapoint; + int64_t iter = 1; + int32_t opt_period = 1; + int32_t opt_i = 
0; + bool loss_per_datapoint = false; - ggml_opt_get_optimizer_params get_opt_pars; - void * get_opt_pars_ud; - struct ggml_tensor * adamw_params; + ggml_opt_get_optimizer_params get_opt_pars = nullptr; + void * get_opt_pars_ud = nullptr; + struct ggml_tensor * adamw_params = nullptr; }; struct ggml_opt_result { @@ -67,8 +67,8 @@ struct ggml_opt_result { std::vector pred; int64_t ncorrect = 0; - bool loss_per_datapoint = false; - int64_t opt_period = -1; + int64_t opt_period = -1; + bool loss_per_datapoint = false; }; // ====== Dataset ====== @@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us } struct ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type) { + ggml_backend_sched_t backend_sched, + struct ggml_context * ctx_compute, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs, + enum ggml_opt_loss_type loss_type) { return { /*backend_sched =*/ backend_sched, /*ctx_compute =*/ ctx_compute, @@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map & tensor_ return new_tensor; } -static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) { +static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { std::map tensor_map; - ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); + ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true); - for (int i = 0; i < graph->n_leafs; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i])); + for (int i = 0; i < src->n_leafs; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i])); } - for (int i = 0; i < graph->n_nodes; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i])); + GGML_ASSERT(dst->n_leafs == src->n_leafs); + for (int i = 0; i < src->n_nodes; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i])); } - for (int i = 0; i < graph->n_nodes; ++i) { - const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]); - const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]); - graph->grads[igrad_dst] = new_graph->grads[igrad_src]; - graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src]; + GGML_ASSERT(dst->n_nodes == src->n_nodes); + for (int i = 0; i < src->n_nodes; ++i) { + const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); + const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + + dst->grads[igrad_dst] = src->grads[igrad_src]; + dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } - return new_graph; + return dst; } static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) { @@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_opt_context_t result = new struct ggml_opt_context; - result->backend_sched = params.backend_sched; - result->allocated_graph = nullptr; - result->allocated_graph_copy = nullptr; - 
result->ctx_compute = params.ctx_compute; - result->ctx_copy = nullptr; - result->inputs = params.inputs; - result->outputs = params.outputs; - result->iter = 1; - result->opt_period = params.opt_period; - result->opt_i = 0; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; + result->backend_sched = params.backend_sched; + result->ctx_compute = params.ctx_compute; + result->inputs = params.inputs; + result->outputs = params.outputs; + result->opt_period = params.opt_period; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically"); GGML_ASSERT(result->opt_period >= 1); @@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { switch (params.loss_type) { case GGML_OPT_LOSS_TYPE_MEAN: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); @@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { break; } case GGML_OPT_LOSS_TYPE_SUM: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); result->loss_per_datapoint = false; @@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { } if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { - result->gb_grad = nullptr; - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gf); - return result; } @@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gb_grad); ggml_graph_reset(result->gb_grad); - return result; } @@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); - ggml_opt_alloc_graph(result, result->gb_opt); ggml_graph_reset(result->gb_opt); return result; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ee72a173e07fd..719d75c70c75d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) { } // utility functions to change gradients -// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator -// else if a is in zero_table, replace a +// isrc is the index of tensor in cgraph->visited_has_set.keys +// the corresponding gradient (accumulators) are also at position isrc +// if tensor has a gradient accumulator, modify that accumulator in-place +// else if there is no gradient for tensor, set the corresponding value // else, just add/subtract/etc. 
the gradients static void ggml_add_or_set( @@ -5028,11 +5030,14 @@ static void ggml_add_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { - cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); + cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = tensor; } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5040,18 +5045,20 @@ static void ggml_acc_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor, const size_t nb1, const size_t nb2, const size_t nb3, const size_t offset) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]); } else { struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5059,13 +5066,15 @@ static void ggml_add1_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5074,11 +5083,14 @@ static void ggml_sub_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_neg(ctx, tensor); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5095,12 +5107,12 @@ static void ggml_compute_backward( struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; struct ggml_hash_set * hash_set = &cgraph->visited_hash_set; - const size_t isrc0 = ggml_hash_find(hash_set, src0); - const size_t isrc1 = ggml_hash_find(hash_set, src1); - const size_t isrc2 = ggml_hash_find(hash_set, src2); - const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; - const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; - const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; + const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1; + const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1; + const size_t isrc2 = src2 ? 
ggml_hash_find(hash_set, src2) : (size_t) -1; + const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; + const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; + const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; switch (tensor->op) { case GGML_OP_DUP: { @@ -5200,7 +5212,7 @@ static void ggml_compute_backward( } break; case GGML_OP_SUM: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad); + ggml_add1_or_set(ctx, cgraph, isrc0, grad); } } break; case GGML_OP_SUM_ROWS: { @@ -5210,7 +5222,7 @@ static void ggml_compute_backward( } break; case GGML_OP_MEAN: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); + ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); } } break; case GGML_OP_REPEAT: { @@ -5363,7 +5375,7 @@ static void ggml_compute_backward( nb3 = (nb3 / n0) * ng; } - ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset); + ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset); } } break; case GGML_OP_PERMUTE: { @@ -5597,10 +5609,9 @@ void ggml_build_backward_expand( const int n_nodes_f = cgraph->n_nodes; - const size_t hash_size = ggml_hash_size(2*cgraph->size); - memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *)); - memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *)); - bool * grads_needed = calloc(hash_size, sizeof(bool)); + memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool)); { bool any_params = false; @@ -5621,7 +5632,7 @@ void ggml_build_backward_expand( continue; } - bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; + bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS); bool ignore_src[GGML_MAX_SRC] = {false}; switch (node->op) { // gradients in node->src[0] for one reason or another have no effect on output gradients @@ -5638,7 +5649,7 @@ void ggml_build_backward_expand( } break; // gradients in node->src[1] for one reason or another have no effect on output gradients - case GGML_OP_CPY: // gradients in CPY target are irrelevant + case GGML_OP_CPY: // gradients in CPY target are irrelevant case GGML_OP_GET_ROWS: // row indices not differentiable case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS case GGML_OP_ROPE: // positions not differentiable @@ -5665,9 +5676,12 @@ void ggml_build_backward_expand( node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(igrad != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad)); if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { - cgraph->grads[igrad] = ggml_dup_tensor(ctx_static, node); - cgraph->grad_accs[igrad] = cgraph->grads[igrad]; + cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node); + cgraph->grads[igrad] = cgraph->grad_accs[igrad]; + ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name); } grads_needed[igrad] = 
true; } @@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) { struct ggml_cgraph cgraph = { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.grad_accs =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL, NULL }, - /*.order =*/ cgraph0->order, + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ NULL, // gradients would need visited_hash_set + /*.grad_accs =*/ NULL, + /*.leafs =*/ NULL, + /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.order =*/ cgraph0->order, }; return cgraph; @@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } + if (dst->grads) { + memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + } if (src->grads) { GGML_ASSERT(dst->grads != NULL); GGML_ASSERT(dst->grad_accs != NULL); for (int i = 0; i < src->n_nodes; ++i) { const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + dst->grads[igrad_dst] = src->grads[igrad_src]; dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } @@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { if (node->op == GGML_OP_OPT_STEP_ADAMW) { // clear momenta - if (node->src[2]->data) { - ggml_set_zero(node->src[2]); - } - if (node->src[3]->data) { - ggml_set_zero(node->src[3]); - } + ggml_set_zero(node->src[2]); + ggml_set_zero(node->src[3]); } // initial gradients of loss should be 1, 0 otherwise diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 01ac7166e0e22..37342c156d7c1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -819,7 +819,6 @@ struct test_case { } } - // TODO: refactor so that this check is only needed once for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { printf("not supported [%s] ", ggml_backend_name(backend)); From 59b917282236eadfb82bf1f46a31eb119941da08 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Nov 2024 13:25:08 +0100 Subject: [PATCH 08/32] ggml/sched : do not skip views in pre-assignments --- ggml/src/ggml-backend.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 3433d082e775b..45da0c27daffa 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -886,9 +886,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (ggml_is_view_op(node->op)) { - continue; - } // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); From 87a533be57e602f8ca469d14ad15ee851265b655 Mon Sep 17 00:00:00 
2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 09:22:11 +0200 Subject: [PATCH 09/32] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index e9bd2dbb05e2c..d101d2b57f8fa 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -2884dd72fea8922910fe53387c3d17ab928d3a8e +6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58 From 1bb30bf28cb5a7adf111bc41c935bdaf128397e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 10:22:47 +0200 Subject: [PATCH 10/32] llama : handle KV shift for recurrent models (#10402) ggml-ci --- src/llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c51b36e66042e..001711037d5d1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18211,13 +18211,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { bool need_reserve = false; - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if (lctx.kv_self.has_shift) { if (!llama_kv_cache_can_shift(&lctx)) { - GGML_ABORT("Deepseek2 does not support K-shift"); + GGML_ABORT("The current context does not support K-shift"); } - { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); @@ -20463,7 +20463,7 @@ void llama_kv_cache_update(struct llama_context * ctx) { } bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA } // deprecated From a5e47592b6171ae21f3eaa1aba6fb2b707875063 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 21 Nov 2024 18:18:50 +0100 Subject: [PATCH 11/32] cuda : optimize argmax (#10441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cuda : optimize argmax * remove unused parameter ggml-ci * fixup : use full warps ggml-ci * Apply suggestions from code review Co-authored-by: Johannes Gäßler * fix ub * ggml : check ne00 <= INT32_MAX in argmax and argsort --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/argmax.cu | 96 +++++++++++++++++++--------------- ggml/src/ggml-cuda/common.cuh | 30 +++++------ ggml/src/ggml-cuda/quantize.cu | 8 +-- ggml/src/ggml.c | 2 + tests/test-backend-ops.cpp | 29 ++++++++++ 5 files changed, 104 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index aab04eca7a385..5340eedc08916 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -1,57 +1,69 @@ -#include "common.cuh" +#include +#include + #include "argmax.cuh" +#include "common.cuh" #include "sum.cuh" -#include +static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) { + const int64_t row = blockIdx.x; -static __global__ void argmax_f32( - const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { + float maxval = -FLT_MAX; + int argmax = -1; + const float * rowx = x + row * ncols; - int argmax_thread = 0; - const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; + for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) { + const float val = 
rowx[col]; + if (val > maxval) { + maxval = val; + argmax = col; + } + } #pragma unroll - for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { - const int64_t row = row0 + row1; - - if (row >= nrows) { - break; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; } + } - float maxval = -FLT_MAX; - int argmax = -1; - - for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { - const float val = x[row*ncols + col]; - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + const int n_warps = blockDim.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + if (n_warps > 1) { + constexpr int max_warps = 1024 / WARP_SIZE; + __shared__ float shared_maxval[max_warps]; + __shared__ int shared_argmax[max_warps]; + if (lane_id == 0) { + shared_maxval[warp_id] = maxval; + shared_argmax[warp_id] = argmax; } + __syncthreads(); + + if (warp_id == 0) { + if (lane_id < n_warps) { + maxval = shared_maxval[lane_id]; + argmax = shared_argmax[lane_id]; + } #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); - const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; + } + } } - - const int store = row1 == threadIdx.x; - argmax_thread += store*argmax; } - const int row = row0 + threadIdx.x; - - if (row >= nrows) { - return; + if (warp_id == 0 && lane_id == 0) { + dst[row] = argmax; } - - dst[row] = argmax_thread; } void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { cudaStream_t stream = ctx.stream(); - const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; - - const dim3 blocks_dim(WARP_SIZE, 1, 1); + const int64_t num_blocks = nrows; + const int64_t num_threads = std::min(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE); + const dim3 blocks_dim(num_threads, 1, 1); const dim3 blocks_num(num_blocks, 1, 1); - argmax_f32<<>>(src0_d, dst_d, ne00, nrows); + argmax_f32<<>>(src0_d, dst_d, ne00); } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e146c691c6f87..b0dd16066b4ba 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { return __reduce_add_sync(0xffffffff, x); #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE @@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int 
x) { static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; } static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); - a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32); } return a; } @@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32); reinterpret_cast(a.x) += __low2half(a_other); reinterpret_cast(a.y) += __high2half(a_other); } return a; #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32)); } return a; #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) @@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; } @@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; #else diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 45408ce8684e4..1702e4ce2feba 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange max. abs. value between vals_per_scale/4 threads. #pragma unroll - for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE)); + for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE)); } float sum; @@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange calculate sum across vals_per_sum/4 threads. 
#pragma unroll - for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) { - sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE); + for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { + sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 719d75c70c75d..78e7874dee04d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2255,6 +2255,7 @@ struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); @@ -4138,6 +4139,7 @@ struct ggml_tensor * ggml_argsort( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_sort_order order) { + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 37342c156d7c1..b2b5705243ea2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1154,6 +1154,26 @@ struct test_argmax : public test_case { return out; } + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + init_tensor_uniform(t); + } + } + } + double max_nmse_err() override { return 0.0; } @@ -3440,6 +3460,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); test_cases.emplace_back(new test_argmax()); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1})); + test_cases.emplace_back(new test_count_equal()); for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 @@ -3830,6 +3855,10 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); + for (int bs : {1, 512}) { for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { From c18610b4ee29ca056bb4f2d375a4ad1b16f44ef7 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Fri, 22 Nov 2024 14:07:20 +0800 Subject: [PATCH 12/32] CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216) * CANN Support Ascend310P to accelerate F32 and F16 Model * Add compile option soc type macro ASCEND_310P to ggml-cann lib * Remove unused code * Remove the ascend soc_type hard code compile option in CMakelist.txt --- 
ggml/src/ggml-cann/CMakeLists.txt | 29 ++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.cpp | 18 ++++++++++ ggml/src/ggml-cann/kernels/CMakeLists.txt | 7 ++-- ggml/src/ggml-cann/kernels/dup.cpp | 32 +++++++++++++----- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 37 +++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 36 ++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 5 ++- 7 files changed, 123 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index c8e15c6d40ce9..756200b893d02 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") endif() +# Auto-detech Soc type and Soc version, if detect failed, will abort build +set(SOC_VERSION "") +function(detect_ascend_soc_type SOC_VERSION) + execute_process( + COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'" + OUTPUT_VARIABLE npu_info + RESULT_VARIABLE npu_result + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if("${npu_info}" STREQUAL "" OR ${npu_result}) + message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.") + endif() + set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE) +endfunction() + +if(NOT SOC_TYPE) + detect_ascend_soc_type(SOC_VERSION) + set(SOC_TYPE "${SOC_VERSION}") + message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") +else() + string(TOLOWER ${SOC_TYPE} SOC_VERSION) +endif() + +# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. +string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") +set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") + if (CANN_INSTALL_DIR) # Only Support Linux. if (NOT UNIX) @@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR) target_include_directories(ggml-cann PRIVATE . .. 
${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) + target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") else() diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e2ab3..1f4ee986ceb89 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (src0->type) { case GGML_TYPE_F32: + { +#ifdef ASCEND_310P + // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 8) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f32( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_F16: + { +#ifdef ASCEND_310P + // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 16) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f16( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_Q4_0: aclrtlaunch_ascendc_get_row_q4_0( 24, ctx.stream(), src0->data, src1->data, dst->data, diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 5b4fef91b5877..6a4e17cce54c9 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -1,7 +1,3 @@ -if (NOT SOC_TYPE) - set (SOC_TYPE "Ascend910B3") -endif() - file(GLOB SRC_FILES get_row_f32.cpp get_row_f16.cpp @@ -13,7 +9,6 @@ file(GLOB SRC_FILES dup.cpp ) -string(TOLOWER ${SOC_TYPE} SOC_VERSION) set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") @@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) +message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") +ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index e2c651152f486..99f03e05883aa 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -5,6 +5,7 @@ using namespace AscendC; #define BUFFER_NUM 2 +const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template template class DupByRows { @@ -19,6 +20,7 @@ class DupByRows { // Input has four dims. 
int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); + assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); // param num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; @@ -51,24 +53,36 @@ class DupByRows { __aicore__ inline void copy_in() { LocalTensor src_local = src_queue.AllocTensor(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(SRC_T); - DataCopyPadExtParams padParams; - DataCopyPad(src_local, src_gm, dataCopyParams, padParams); - + const size_t elem_per_block = 32 / sizeof(SRC_T); + size_t tail = num_elem % elem_per_block; + size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; + DataCopy(src_local, src_gm, cpy_elements_len); src_queue.EnQue(src_local); } __aicore__ inline void copy_out() { LocalTensor dst_local = dst_queue.DeQue(); - +#ifdef ASCEND_310P + const size_t elem_per_block = 32 / sizeof(DST_T); + size_t tail = num_elem % elem_per_block; + size_t len = num_elem & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(dst_gm, dst_local, len); + } + if(tail != 0) { + for (size_t i = tail; i < elem_per_block; i++) { + dst_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(dst_gm[len], dst_local[len], elem_per_block); + SetAtomicNone(); + } +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = num_elem * sizeof(DST_T); DataCopyPad(dst_gm, dst_local, dataCopyParams); - +#endif dst_queue.FreeTensor(dst_local); } diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp index c704b5b2ec0f3..416b45104de5b 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -14,7 +14,7 @@ class GET_ROW_F16 { int64_t *output_ne_ub, size_t *output_nb_ub) { // TODO, use template for F16/f32 int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -59,32 +59,42 @@ class GET_ROW_F16 { } __aicore__ inline void copy_in(uint32_t offset, size_t len) { + size_t origin_len = len; LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(half); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(half); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else 
DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -150,6 +160,7 @@ class GET_ROW_F16 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp index 9db080af36998..02116905b18e4 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -13,7 +13,7 @@ class GET_ROW_F32 { int64_t *indices_ne_ub, size_t *indices_nb_ub, int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -55,31 +55,40 @@ class GET_ROW_F32 { __aicore__ inline void copy_in(uint32_t offset, size_t len) { LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -144,6 +153,7 @@ class GET_ROW_F32 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp index a80bfeec2417d..377211096e1f5 100644 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -110,9 +110,12 @@ class GET_ROW_Q4_0 { LocalTensor output_local = output_queue.AllocTensor(); // TODO: cast more data to speed up. +#ifdef ASCEND_310P + // TODO: 310P support quantification +#else Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - +#endif // Only mul need compile by group. 
half scale = scale_gm.GetValue(scale_offset); From 599b3e0cd40432cd1975a8906f3db70bbe53b627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 22 Nov 2024 08:32:40 +0100 Subject: [PATCH 13/32] GitHub: ask for more info in issue templates (#10426) * GitHub: ask for more info in issues [no ci] * refactor issue templates to be component-specific * more understandable issue description * add dropdown for llama.cpp module --- .github/ISSUE_TEMPLATE/01-bug-low.yml | 50 ---------- .../ISSUE_TEMPLATE/010-bug-compilation.yml | 73 ++++++++++++++ .github/ISSUE_TEMPLATE/011-bug-results.yml | 98 +++++++++++++++++++ .github/ISSUE_TEMPLATE/019-bug-misc.yml | 78 +++++++++++++++ .github/ISSUE_TEMPLATE/02-bug-medium.yml | 50 ---------- ...05-enhancement.yml => 020-enhancement.yml} | 2 +- .github/ISSUE_TEMPLATE/03-bug-high.yml | 50 ---------- .../{06-research.yml => 030-research.yml} | 2 +- .github/ISSUE_TEMPLATE/04-bug-critical.yml | 50 ---------- .../{07-refactor.yml => 040-refactor.yml} | 2 +- 10 files changed, 252 insertions(+), 203 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/01-bug-low.yml create mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml create mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml create mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml delete mode 100644 .github/ISSUE_TEMPLATE/02-bug-medium.yml rename .github/ISSUE_TEMPLATE/{05-enhancement.yml => 020-enhancement.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/03-bug-high.yml rename .github/ISSUE_TEMPLATE/{06-research.yml => 030-research.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/04-bug-critical.yml rename .github/ISSUE_TEMPLATE/{07-refactor.yml => 040-refactor.yml} (95%) diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml deleted file mode 100644 index 54785854f776e..0000000000000 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Low Severity Bugs -description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) -title: "Bug: " -labels: ["bug-unconfirmed", "low severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
- render: shell diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml new file mode 100644 index 0000000000000..550ee1b498ab9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -0,0 +1,73 @@ +name: Bug (compilation) +description: Something goes wrong when trying to compile llama.cpp. +title: "Compile bug: " +labels: ["bug-unconfirmed", "compilation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the compilation of llama.cpp fails. + Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. + If the compilation succeeds with ccache disabled you should be able to permanently fix the issue + by clearing `~/.cache/ccache` (on Linux). + - type: textarea + id: commit + attributes: + label: Git commit + description: Which commit are you trying to compile? + placeholder: | + $git rev-parse HEAD + 84a07a17b1b08cf2b9747c633a2372782848a27f + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. + placeholder: > + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml new file mode 100644 index 0000000000000..1adb162b79a77 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -0,0 +1,98 @@ +name: Bug (model use) +description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). +title: "Eval bug: " +labels: ["bug-unconfirmed", "model evaluation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the model evaluation results + (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. 
+ The `llama-cli` binary can be used for simple and reproducible model inference. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: hardware + attributes: + label: Hardware + description: Which CPUs/GPUs are you using? + placeholder: > + e.g. Ryzen 5950X + 2x RTX 4090 + validations: + required: true + - type: textarea + id: model + attributes: + label: Model + description: > + Which model at which quantization were you using when encountering the bug? + If you downloaded a GGUF file off of Huggingface, please provide a link. + placeholder: > + e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M + validations: + required: false + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific hardware, compile flags, or command line arguments, + that information would be very much appreciated by us. + placeholder: > + e.g. when I run llama-cli with -ngl 99 I get garbled outputs. + When I use -ngl 0 it works correctly. + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml new file mode 100644 index 0000000000000..124cdee919089 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -0,0 +1,78 @@ +name: Bug (misc.) +description: Something is not working the way it should (and it's not covered by any of the above cases). +title: "Misc. bug: " +labels: ["bug-unconfirmed"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for miscellaneous bugs that don't fit into any other category. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? 
(use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: module + attributes: + label: Which llama.cpp modules do you know to be affected? + multiple: true + options: + - libllama (core library) + - llama-cli + - llama-server + - llama-bench + - llama-quantize + - Python/Bash scripts + - Other (Please specify in the next section) + validations: + required: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml deleted file mode 100644 index a6285c6f05bac..0000000000000 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Medium Severity Bug -description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) -title: "Bug: " -labels: ["bug-unconfirmed", "medium severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
- render: shell diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/05-enhancement.yml rename to .github/ISSUE_TEMPLATE/020-enhancement.yml index 58fca73183d41..02dd4f575a686 100644 --- a/.github/ISSUE_TEMPLATE/05-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml @@ -1,5 +1,5 @@ name: Enhancement -description: Used to request enhancements for llama.cpp +description: Used to request enhancements for llama.cpp. title: "Feature Request: " labels: ["enhancement"] body: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml deleted file mode 100644 index ff816b93769c3..0000000000000 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: High Severity Bug -description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) -title: "Bug: " -labels: ["bug-unconfirmed", "high severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/06-research.yml rename to .github/ISSUE_TEMPLATE/030-research.yml index 3ae4e9f8caaa4..18975dbbfd0fe 100644 --- a/.github/ISSUE_TEMPLATE/06-research.yml +++ b/.github/ISSUE_TEMPLATE/030-research.yml @@ -1,5 +1,5 @@ name: Research -description: Track new technical research area +description: Track new technical research area. title: "Research: " labels: ["research 🔬"] body: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml deleted file mode 100644 index 7af42a80b3b93..0000000000000 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Critical Severity Bug -description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) -title: "Bug: " -labels: ["bug-unconfirmed", "critical severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! 
- Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml similarity index 95% rename from .github/ISSUE_TEMPLATE/07-refactor.yml rename to .github/ISSUE_TEMPLATE/040-refactor.yml index 3a68d3d5355d6..b6e6ab36defd6 100644 --- a/.github/ISSUE_TEMPLATE/07-refactor.yml +++ b/.github/ISSUE_TEMPLATE/040-refactor.yml @@ -1,5 +1,5 @@ name: Refactor (Maintainers) -description: Used to track refactoring opportunities +description: Used to track refactoring opportunities. 
title: "Refactor: " labels: ["refactor"] body: From 6dfcfef0787e9902df29f510b63621f60a09a50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?= <45505768+shou692199@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:44:08 +0800 Subject: [PATCH 14/32] ci: Update oneAPI runtime dll packaging (#10428) This is the minimum runtime dll dependencies for oneAPI 2025.0 --- .github/workflows/build.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6ef0770f3fa2f..572f91643fbba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -986,13 +986,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin From 55ed008b2de01592659b9eba068ea01bb2f72160 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sat, 23 Nov 2024 14:41:12 +0100 Subject: [PATCH 15/32] ggml : do not use ARM features not included in the build (#10457) --- ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0d23669c25bea..4b58254e7d108 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13896,7 +13896,7 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_neon(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_NEON) return ggml_arm_arch_features.has_neon; #else return 0; @@ -13904,7 +13904,7 @@ int ggml_cpu_has_neon(void) { } int ggml_cpu_has_sve(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.has_sve; #else return 0; @@ -13912,7 +13912,7 @@ int ggml_cpu_has_sve(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8) return ggml_arm_arch_features.has_i8mm; #else return 0; @@ -13920,7 +13920,7 @@ int ggml_cpu_has_matmul_int8(void) { } int ggml_cpu_get_sve_cnt(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.sve_cnt; #else return 0; From 96fa2c5e2d6cb5319f34c3a7fb0cec05694b22f1 Mon Sep 17 00:00:00 2001 From: momonga <146910567+mmngays@users.noreply.github.com> Date: Sun, 24 Nov 2024 09:09:22 +0900 
Subject: [PATCH 16/32] fix gguf-py: Conversion error when multiple licenses are configured (#9807) * fix general.license list to str * fix join license list --------- Co-authored-by: momonga <115213907+mmnga@users.noreply.github.com> --- gguf-py/gguf/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index 321cbcd4c5507..962c27b204464 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -545,7 +545,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_size_label(self.size_label) if self.license is not None: - gguf_writer.add_license(self.license) + if isinstance(self.license, list): + gguf_writer.add_license(",".join(self.license)) + else: + gguf_writer.add_license(self.license) if self.license_name is not None: gguf_writer.add_license_name(self.license_name) if self.license_link is not None: From 9336db462c0c34bbe2055413fe4e16442626c38b Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Sun, 24 Nov 2024 02:02:34 -0700 Subject: [PATCH 17/32] convert : XLMRoberta Type Vocab Size (#10458) This matches the key in common bert-based embedding models and may have a value other than 1 in it. Branch: XLMRobertaTypeVocabSize Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f4b8154b88a8..80a179b86af7e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2707,7 +2707,7 @@ def set_vocab(self): self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) if precompiled_charsmap: self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) From dc39012cbaf8752fabecaeb60af78ccdd1dfb73b Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 24 Nov 2024 16:10:26 +0100 Subject: [PATCH 18/32] llama : fix op mul check with command-r-plus (#10476) --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 001711037d5d1..20df09b133bfb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7181,12 +7181,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_ADD: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_add(ctx, a, w); } break; case GGML_OP_MUL: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_mul(ctx, a, w); } break; case GGML_OP_DIV: From cce5a9007572c6e9fa522296b77571d2e5071357 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 24 Nov 2024 18:03:25 +0200 Subject: [PATCH 19/32] flake.lock: Update (#10470) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/5e4fbfb6b3de1aa2872b76d49fafc942626e2add?narHash=sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg%3D' (2024-11-15) → 
'github:NixOS/nixpkgs/23e89b7da85c3640bbc2173fe04f4bd114342367?narHash=sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w%3D' (2024-11-19) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index ee8cf07e37220..d114f4422a36a 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1731676054, - "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=", + "lastModified": 1732014248, + "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add", + "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367", "type": "github" }, "original": { From d9d54e498d38ec99bbc0031022f9c92711e97bbc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 09:58:41 +0200 Subject: [PATCH 20/32] speculative : refactor and add a simpler example (#10362) * speculative : refactor and add a simpler example ggml-ci * speculative : clean-up and add comments and TODOs [no ci] * speculative : manage context in common_speculative ggml-ci * speculative : simplify ggml-ci * speculative : simplify (cont) ggml-ci * speculative : add --draft-min CLI arg * speculative : minor fixup * make : build fixes * speculative : do not redraft previous drafts ggml-ci * speculative : fix the draft sampling ggml-ci * speculative : fix compile warning * common : refactor args ggml-ci * common : change defaults [no ci] * common : final touches ggml-ci --- Makefile | 1 + common/CMakeLists.txt | 2 + common/arg.cpp | 438 +++++++++--------- common/common.cpp | 76 ++- common/common.h | 41 +- common/sampling.cpp | 45 +- common/sampling.h | 23 +- common/speculative.cpp | 269 +++++++++++ common/speculative.h | 28 ++ examples/CMakeLists.txt | 1 + examples/batched/batched.cpp | 8 +- examples/infill/infill.cpp | 2 +- examples/llava/llava-cli.cpp | 2 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookahead/lookahead.cpp | 2 +- examples/lookup/lookup-stats.cpp | 3 +- examples/lookup/lookup.cpp | 4 +- examples/main/main.cpp | 2 +- examples/parallel/parallel.cpp | 2 +- examples/retrieval/retrieval.cpp | 4 +- examples/save-load-state/save-load-state.cpp | 8 +- examples/server/server.cpp | 10 +- examples/server/utils.hpp | 57 --- examples/speculative-simple/CMakeLists.txt | 5 + examples/speculative-simple/README.md | 12 + .../speculative-simple/speculative-simple.cpp | 273 +++++++++++ examples/speculative/speculative.cpp | 30 +- tests/test-arg-parser.cpp | 4 +- 28 files changed, 1028 insertions(+), 326 deletions(-) create mode 100644 common/speculative.cpp create mode 100644 common/speculative.h create mode 100644 examples/speculative-simple/CMakeLists.txt create mode 100644 examples/speculative-simple/README.md create mode 100644 examples/speculative-simple/speculative-simple.cpp diff --git a/Makefile b/Makefile index 5c899438515e1..dd6d864ad513a 100644 --- a/Makefile +++ b/Makefile @@ -966,6 +966,7 @@ OBJ_COMMON = \ $(DIR_COMMON)/console.o \ $(DIR_COMMON)/ngram-cache.o \ $(DIR_COMMON)/sampling.o \ + $(DIR_COMMON)/speculative.o \ $(DIR_COMMON)/build-info.o \ $(DIR_COMMON)/json-schema-to-grammar.o diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 5ab1ffa1922aa..62a8a7db5652f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -66,6 +66,8 @@ add_library(${TARGET} STATIC ngram-cache.h sampling.cpp sampling.h + speculative.cpp + speculative.h ) if (BUILD_SHARED_LIBS) diff --git 
a/common/arg.cpp b/common/arg.cpp index 4115b2f7511d3..32240f21f2469 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } } - postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch); if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); @@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & antiprompt : params.antiprompt) { string_process_escapes(antiprompt); } - for (auto & seq_breaker : params.sparams.dry_sequence_breakers) { + for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } } @@ -329,7 +330,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string sampler_type_chars; std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { + for (const auto & sampler : params.sampling.samplers) { sampler_type_chars += common_sampler_type_to_chr(sampler); sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } @@ -407,26 +408,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } )); - add_opt(common_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", @@ -515,108 +496,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cpuparams_batch.poll = value; } )); - add_opt(common_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-batch-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--draft"}, "N", - string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](common_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-ps", "--p-split"}, "N", - string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](common_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", @@ -701,7 +580,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), [](common_params & params) { params.no_perf = true; - params.sparams.no_perf = true; + params.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( @@ -883,155 +762,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); + params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), [](common_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); + params.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { - params.sparams.samplers = common_sampler_types_from_chars(value); + params.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", [](common_params & params) { - params.sparams.ignore_eos = true; + params.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--penalize-nl"}, - string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? 
"true" : "false"), [](common_params & params) { - params.sparams.penalize_nl = true; + params.sampling.penalize_nl = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sparams.temp), + string_format("temperature (default: %.1f)", (double)params.sampling.temp), [](common_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); + params.sampling.temp = std::stof(value); + params.sampling.temp = std::max(params.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", - string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), [](common_params & params, int value) { - params.sparams.top_k = value; + params.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), [](common_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); + params.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), [](common_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); + params.sampling.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), [](common_params & params, const std::string & value) { - params.sparams.xtc_probability = std::stof(value); + params.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), [](common_params & params, const std::string & value) { - params.sparams.xtc_threshold = std::stof(value); + params.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); + params.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", - string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), [](common_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + 
params.sampling.penalty_last_n = value; + params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); } ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), [](common_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); + params.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); + params.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); + params.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier), + string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { - params.sparams.dry_multiplier = std::stof(value); + params.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", - string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base), + string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), [](common_params & params, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sparams.dry_base = potential_base; + params.sampling.dry_base = potential_base; } } ).set_sparam()); add_opt(common_arg( {"--dry-allowed-length"}, "N", - string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length), + string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), [](common_params & params, int value) { - params.sparams.dry_allowed_length = value; + params.sampling.dry_allowed_length = value; } ).set_sparam()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", - string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n), + string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), [](common_params & params, int value) { - params.sparams.dry_penalty_last_n = value; + params.sampling.dry_penalty_last_n = value; } ).set_sparam()); add_opt(common_arg( {"--dry-sequence-breaker"}, "STRING", string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) 
in the process; use \"none\" to not use any sequence breakers\n", - params.sparams.dry_sequence_breakers.empty() ? "none" : - std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()), - params.sparams.dry_sequence_breakers.end(), - std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'", + params.sampling.dry_sequence_breakers.empty() ? "none" : + std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()), + params.sampling.dry_sequence_breakers.end(), + std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'", [](const std::string& a, const std::string& b) { std::string formatted_b = (b == "\n") ? "\\n" : b; return a + ", '" + formatted_b + "'"; @@ -1040,51 +919,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex static bool defaults_cleared = false; if (!defaults_cleared) { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); defaults_cleared = true; } if (value == "none") { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); } else { - params.sparams.dry_sequence_breakers.emplace_back(value); + params.sampling.dry_sequence_breakers.emplace_back(value); } } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); + params.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); + params.sampling.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), [](common_params & params, int value) { - params.sparams.mirostat = value; + params.sampling.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), [](common_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); + params.sampling.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), [](common_params & params, const std::string & value) { - params.sparams.mirostat_tau = 
std::stof(value); + params.sampling.mirostat_tau = std::stof(value); } ).set_sparam()); add_opt(common_arg( @@ -1100,7 +979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); + params.sampling.logit_bias.push_back({key, bias}); } else { throw std::invalid_argument("invalid input format"); } @@ -1111,9 +990,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--grammar"}, "GRAMMAR", - string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), [](common_params & params, const std::string & value) { - params.sparams.grammar = value; + params.sampling.grammar = value; } ).set_sparam()); add_opt(common_arg( @@ -1127,7 +1006,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.sparams.grammar) + std::back_inserter(params.sampling.grammar) ); } ).set_sparam()); @@ -1135,7 +1014,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", [](common_params & params, const std::string & value) { - params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + params.sampling.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( @@ -1444,17 +1323,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(common_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1593,13 +1461,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(common_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", @@ -2037,5 +1898,168 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_LOG_TIMESTAMPS")); + // speculative 
parameters + add_opt(common_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](common_params & params, int value) { + params.speculative.cpuparams.n_threads = value; + if (params.speculative.cpuparams.n_threads <= 0) { + params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.n_threads = value; + if (params.speculative.cpuparams_batch.n_threads <= 0) { + params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](common_params & params, int value) { + params.speculative.cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](common_params & params, int value) { + params.speculative.cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-batch-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-max", "--draft", "--draft-n"}, "N", + string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), + [](common_params & params, int value) { + params.speculative.n_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-min", "--draft-n-min"}, "N", + string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), + [](common_params & params, int value) { + params.speculative.n_min = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-p-split"}, "P", + string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + [](common_params & params, const std::string & value) { + params.speculative.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-p-min"}, "P", + string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + [](common_params & params, const std::string & value) { + params.speculative.p_min = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-cd", "--ctx-size-draft"}, "N", + string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), + [](common_params & params, int value) { + params.speculative.n_ctx = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](common_params & params, int value) { + params.speculative.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS 
support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.model = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index d314523db4c62..c398329d05bf5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat [](const unsigned char c) { return !std::isprint(c); }), detokenized.end()); - buf << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); + buf << "\n" << std::to_string(i) + << ", token '" << detokenized << "'" + << ", pos " << std::to_string(batch.pos[i]) + << ", n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ", seq_id " << std::to_string(batch.seq_id[i][0]) + << ", logits " << std::to_string(batch.logits[i]); } buf << " ]"; @@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sparams.ignore_eos = false; + params.sampling.ignore_eos = false; } if (params.warmup) { @@ -1490,6 +1490,66 @@ void common_batch_add( batch.n_tokens++; } +// +// Token utils +// + +size_t common_lcp(const llama_tokens & a, const llama_tokens & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + +size_t common_lcs(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } + + // get the lengths of the input sequences + size_t a_len = a.size(); + size_t b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + size_t max_length = 0; + + // use two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (size_t i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (size_t j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; +} + // // Vocab utils // diff --git a/common/common.h b/common/common.h index 7977cc7a99a78..5c579b5abfe03 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,8 @@ struct 
common_lora_adapter_container : common_lora_adapter_info { struct llama_lora_adapter * adapter; }; +using llama_tokens = std::vector; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -101,8 +103,8 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; -// sampler parameters -struct common_sampler_params { +// sampling parameters +struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -153,19 +155,30 @@ struct common_sampler_params { std::string print() const; }; +struct common_params_speculative { + int32_t n_ctx = 0; // draft context size + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding + int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.9f; // minimum speculative decoding probability (greedy) + + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + + std::string model = ""; // draft model for speculative decoding // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor @@ -182,8 +195,6 @@ struct common_params { struct cpu_params cpuparams; struct cpu_params cpuparams_batch; - struct cpu_params draft_cpuparams; - struct cpu_params draft_cpuparams_batch; ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -195,10 +206,10 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_sampler_params sparams; + struct common_params_sampling sampling; + struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_draft = ""; // draft model for speculative decoding // NOLINT std::string model_alias = "unknown"; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT @@ -461,7 +472,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f // clear LoRA adapters from context, then apply new list of adapters void 
common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +// // Batch utils +// void common_batch_clear(struct llama_batch & batch); @@ -472,6 +485,16 @@ void common_batch_add( const std::vector & seq_ids, bool logits); +// +// Token utils +// + +// longest common prefix +size_t common_lcp(const llama_tokens & a, const llama_tokens & b); + +// longet common subsequence +size_t common_lcs(const llama_tokens & a, const llama_tokens & b); + // // Vocab utils // diff --git a/common/sampling.cpp b/common/sampling.cpp index 7922fde47d369..0c4699a89c8b2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -99,7 +99,7 @@ struct ring_buffer { }; struct common_sampler { - common_sampler_params params; + common_params_sampling params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,7 +125,7 @@ struct common_sampler { } }; -std::string common_sampler_params::print() const { +std::string common_params_sampling::print() const { char result[1024]; snprintf(result, sizeof(result), @@ -141,7 +141,7 @@ std::string common_sampler_params::print() const { return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; @@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return cur_p.data[cur_p.selected].id; } +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { + GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); + + std::vector result; + result.reserve(idxs.size()); + + size_t i = 0; + for (; i < draft.size(); i++) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + + if (draft[i] != id) { + break; + } + } + + if (i == draft.size()) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + } + + return result; +} + +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { + std::vector idxs(draft.size() + 1); + for (size_t i = 0; i < idxs.size(); ++i) { + idxs[i] = i; + } + + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); +} + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } diff --git a/common/sampling.h b/common/sampling.h index d37f25ad37c4a..348911b18888b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -36,7 +36,7 @@ struct common_sampler; // llama_sampler API overloads -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params); void common_sampler_free(struct common_sampler * gsmpl); @@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // llama_token 
common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +// generalized version of common_sampler_sample +// +// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match +// if the sampler disagrees at some point, we stop and return the accepted tokens up to now +// +// common_sampler_sample_n(gsmpl, ctx, { idx }, {}); +// +// is equivalent to +// +// common_sampler_sample(gsmpl, ctx, idx); +// common_sampler_accept(gsmpl, token, true); +// +// requires: idxs.size() == draft.size() + 1 +// +// returns at least 1 token, up to idxs.size() +// +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); + +// assume idxs == [ 0, 1, 2, ..., draft.size() ] +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers diff --git a/common/speculative.cpp b/common/speculative.cpp new file mode 100644 index 0000000000000..fe315a2703e9c --- /dev/null +++ b/common/speculative.cpp @@ -0,0 +1,269 @@ +#include "speculative.h" + +#include "log.h" +#include "common.h" +#include "sampling.h" + +#include + +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + +struct common_speculative { + struct llama_context * ctx; + struct common_sampler * smpl; + + llama_batch batch; + llama_tokens prompt; +}; + +struct common_speculative * common_speculative_init( + struct llama_context * ctx_dft) { + auto * result = new common_speculative { + /* .ctx = */ ctx_dft, + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt = */ {}, + }; + + // TODO: optimize or pass from outside? 
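+    // note: the active branch below (#else) configures a plain top-k=10 draft sampler,
+    //       which keeps drafting cheap and biased toward high-probability tokens;
+    //       the disabled branch (#if 0) shows a richer top-k/top-p/infill setup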
+#if 0 + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 40; + params.top_p = 0.9; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_INFILL, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#else + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 10; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#endif + + return result; +} + +void common_speculative_free(struct common_speculative * spec) { + common_sampler_free(spec->smpl); + + llama_batch_free(spec->batch); + + delete spec; +} + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { + const struct llama_model * model_tgt = llama_get_model(ctx_tgt); + const struct llama_model * model_dft = llama_get_model(ctx_dft); + + const bool vocab_type_tgt = llama_vocab_type(model_tgt); + LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); + + const bool vocab_type_dft = llama_vocab_type(model_dft); + LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); + + if (vocab_type_tgt != vocab_type_dft) { + LOG_ERR("%s: draft model vocab type must match target model to use speculation but " + "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + return false; + } + + if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || + llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || + llama_token_bos(model_tgt) != llama_token_bos(model_dft) || + llama_token_eos(model_tgt) != llama_token_eos(model_dft) + ) { + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + return false; + } + + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + + const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " + "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return false; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + LOG_ERR("%s: draft model vocab must match target model to use speculation but " + "token %d content differs - target '%s', draft '%s'\n", __func__, i, + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); + return false; + } + } + } + + return true; +} + +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt_tgt, + llama_token id_last) { + auto & batch = spec->batch; + auto & ctx = spec->ctx; + auto & smpl = spec->smpl; + auto & prompt = spec->prompt; + + int reuse_i = 0; + int reuse_n = 0; + + const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + + const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); + + // reuse as much as possible from the old draft context + // ideally, the draft context 
should be as big as the target context and we will always reuse the entire prompt + for (int i = 0; i < (int) prompt.size(); ++i) { + int cur = 0; + while (i_start + cur < (int) prompt_tgt.size() && + i + cur < (int) prompt.size() && + prompt_tgt[i_start + cur] == prompt[i + cur]) { + cur++; + } + + if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) { + reuse_i = i; + reuse_n = cur; + } + } + + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + + llama_tokens result; + result.reserve(params.n_draft); + + if (reuse_n == 0) { + llama_kv_cache_clear(ctx); + + prompt.clear(); + } else { + // this happens when a previous draft has been discarded (for example, due to being too small), but the + // target model agreed with it. in this case, we simply pass back the previous results to save compute + if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { + result.push_back(prompt[i]); + + if (params.n_draft <= (int) result.size()) { + break; + } + } + + return result; + } + + if (reuse_i > 0) { + llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + + prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + } + + if (reuse_n < (int) prompt.size()) { + llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + + prompt.erase(prompt.begin() + reuse_n, prompt.end()); + } + } + + // prepare a batch to evaluate any new tokens in the prompt + common_batch_clear(batch); + + for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { + //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); + common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); + + prompt.push_back(prompt_tgt[i]); + } + + // we should rarely end-up here during normal decoding + if (batch.n_tokens > 0) { + //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); + + llama_decode(ctx, batch); + } + + const llama_pos n_past = prompt.size(); + + LOG_DBG("%s: n_past = %d\n", __func__, n_past); + + common_batch_clear(batch); + common_batch_add (batch, id_last, n_past, { 0 }, true); + + prompt.push_back(id_last); + + //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + + llama_decode(ctx, batch); + + common_sampler_reset(smpl); + + // sample n_draft tokens from the draft model + for (int i = 0; i < params.n_draft; ++i) { + common_batch_clear(batch); + + common_sampler_sample(smpl, ctx, 0, true); + + const auto * cur_p = common_sampler_get_candidates(smpl); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); + } + + // add drafted token for each sequence + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + break; + } + + common_sampler_accept(smpl, id, true); + + result.push_back(id); + + if (params.n_draft <= (int) result.size()) { + break; + } + + common_batch_add(batch, id, n_past + i + 1, { 0 }, true); + + // evaluate the drafted tokens on the draft model + llama_decode(ctx, batch); + + prompt.push_back(id); + } + + return result; +} diff --git a/common/speculative.h b/common/speculative.h new file mode 100644 
index 0000000000000..50ec0344618aa --- /dev/null +++ b/common/speculative.h @@ -0,0 +1,28 @@ +#pragma once + +#include "llama.h" +#include "common.h" + +struct common_speculative; + +struct common_speculative_params { + int n_draft = 16; // max drafted tokens + int n_reuse = 256; + + float p_min = 0.9f; // min probabiliy required to accept a token in the draft +}; + +struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); + +void common_speculative_free(struct common_speculative * spec); + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft); + +// sample up to n_draft tokens and add them to the batch using the draft model +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt, + llama_token id_last); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d63a96c1c2547..9bd099d4ef8a5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,5 +50,6 @@ else() add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) + add_subdirectory(speculative-simple) add_subdirectory(tokenize) endif() diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 3b554033e7ee4..ba219cd4b32ae 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -68,10 +68,10 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); if (ctx == NULL) { LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 15b358dc4e854..ef700895720ff 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -73,7 +73,7 @@ int main(int argc, char ** argv) { common_init(); - auto & sparams = params.sparams; + auto & sparams = params.sampling; console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 1610985858fc9..2691c6e6b2dd2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index cbecec343c640..e9cbb51ed90ab 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -237,7 +237,7 @@ static struct 
common_sampler * llama_init(struct llava_context * ctx_llava, comm LOG_INF("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); return smpl; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 3c0ccfea2ccd7..8d0ef8b3d75e6 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct common_sampler * smpl = common_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sampling); // verification n-grams std::vector ngrams_cur(G); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 7faebe7ba11fc..dff07c075c47f 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -21,7 +21,7 @@ int main(int argc, char ** argv){ common_init(); - const int n_draft = params.n_draft; + const int n_draft = params.speculative.n_max; // init llama.cpp llama_backend_init(); @@ -40,6 +40,7 @@ int main(int argc, char ** argv){ common_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_dynamic; common_ngram_cache ngram_cache_static; + int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index a04728b1834cc..4d92bb2385358 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -22,7 +22,7 @@ int main(int argc, char ** argv){ common_init(); // max. number of additional tokens to draft if match is found - const int n_draft = params.n_draft; + const int n_draft = params.speculative.n_max; const bool dump_kv_cache = params.dump_kv_cache; @@ -102,7 +102,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct common_sampler * smpl = common_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sampling); std::vector draft; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7c4ce4be2abae..957451af7ce0a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -100,7 +100,7 @@ int main(int argc, char ** argv) { common_init(); - auto & sparams = params.sparams; + auto & sparams = params.sampling; // save choice to use color for later // (note for later: this is a slightly awkward choice) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 43c8f3ed56ba9..fd2b1c0112838 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = common_sampler_init(model, params.sparams); + client.smpl = common_sampler_init(model, params.sampling); } std::vector tokens_system; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 1768aae510067..e78a8596d8cfe 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -282,8 +282,8 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - LOG("Top %d similar chunks:\n", params.sparams.top_k); - for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { + LOG("Top %d similar chunks:\n", params.sampling.top_k); + for (int i = 0; i < 
std::min(params.sampling.top_k, (int) chunks.size()); i++) { LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); LOG("similarity: %f\n", similarities[i].second); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 8c49a52a66124..2f0cf9baa32b7 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -9,7 +9,7 @@ int main(int argc, char ** argv) { common_params params; params.prompt = "The quick brown fox"; - params.sparams.seed = 1234; + params.sampling.seed = 1234; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; @@ -42,7 +42,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); // tokenize prompt auto tokens = common_tokenize(ctx, params.prompt, true); @@ -106,7 +106,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); printf("\nsecond run: %s", params.prompt.c_str()); @@ -169,7 +169,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); printf("\nsingle seq run: %s", params.prompt.c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b8e003be9730e..6c55d65c01330 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -175,7 +175,7 @@ struct server_slot { // sampling json json_schema; - struct common_sampler_params sparams; + struct common_params_sampling sparams; struct common_sampler * smpl = nullptr; llama_token sampled; @@ -687,7 +687,7 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - slot.sparams = params.sparams; + slot.sparams = params.sampling; slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -743,7 +743,7 @@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); @@ -788,7 +788,7 @@ struct server_context { bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - auto default_sparams = params.sparams; + auto default_sparams = params.sampling; const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -1960,7 +1960,7 @@ struct server_context { if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); + slot.n_past = 
common_lcp(slot.cache_tokens, prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position if (params.n_cache_reuse > 0) { diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c47ed3e47a76d..1665e9dc37db6 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -24,7 +24,6 @@ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::ordered_json; -using llama_tokens = std::vector; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) @@ -439,62 +438,6 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { - // check for empty sequences - if (a.empty() || b.empty()) { - return 0; - } - - // get the lengths of the input sequences - size_t a_len = a.size(); - size_t b_len = b.size(); - - // initialize the maximum length of the longest common subsequence (LCS) - size_t max_length = 0; - - // use two rows instead of a 2D matrix to optimize space - std::vector prev_row(b_len + 1, 0); - std::vector curr_row(b_len + 1, 0); - - // iterate through the elements of a - for (size_t i = 1; i <= a_len; i++) { - // iterate through the elements of b - for (size_t j = 1; j <= b_len; j++) { - // if elements at the current positions match - if (a[i - 1] == b[j - 1]) { - // if it's the first element of either sequences, set LCS length to 1 - if (i == 1 || j == 1) { - curr_row[j] = 1; - } else { - // increment LCS length by 1 compared to the previous element - curr_row[j] = prev_row[j - 1] + 1; - } - - // update max_length if necessary - if (curr_row[j] > max_length) { - max_length = curr_row[j]; - } - } else { - // reset LCS length if elements don't match - curr_row[j] = 0; - } - } - - // update the previous row for the next iteration - prev_row = curr_row; - } - - // return the maximum length of the LCS - return max_length; -} - static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt new file mode 100644 index 0000000000000..7a3a141c27994 --- /dev/null +++ b/examples/speculative-simple/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-speculative-simple) +add_executable(${TARGET} speculative-simple.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/speculative-simple/README.md b/examples/speculative-simple/README.md new file mode 100644 index 0000000000000..e3a6c6b4aa0bf --- /dev/null +++ b/examples/speculative-simple/README.md @@ -0,0 +1,12 @@ +# llama.cpp/examples/speculative-simple + +Demonstration of basic greedy speculative decoding + +```bash +./bin/llama-speculative-simple \ + -m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \ + -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \ + -f test.txt -c 0 -ngl 99 --color \ + 
--sampling-seq k --top-k 1 -fa --temp 0.0 \ + -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9 +``` diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp new file mode 100644 index 0000000000000..1bc7f428cf3b5 --- /dev/null +++ b/examples/speculative-simple/speculative-simple.cpp @@ -0,0 +1,273 @@ +#include "arg.h" +#include "common.h" +#include "sampling.h" +#include "speculative.h" +#include "log.h" +#include "llama.h" + +#include +#include +#include +#include + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { + return 1; + } + + if (params.n_predict < -1) { + LOG_ERR("%s: --n-predict must be >= -1\n", __func__); + return 1; + } + + common_init(); + + if (params.speculative.model.empty()) { + LOG_ERR("%s: --model-draft is required\n", __func__); + return 1; + } + + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model_tgt = NULL; + llama_model * model_dft = NULL; + + llama_context * ctx_tgt = NULL; + llama_context * ctx_dft = NULL; + + // load the target model + common_init_result llama_init_tgt = common_init_from_params(params); + + model_tgt = llama_init_tgt.model; + ctx_tgt = llama_init_tgt.context; + + // load the draft model + params.model = params.speculative.model; + params.n_ctx = params.speculative.n_ctx; + params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch; + params.n_gpu_layers = params.speculative.n_gpu_layers; + + if (params.speculative.cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.speculative.cpuparams.n_threads; + } + + params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; + common_init_result llama_init_dft = common_init_from_params(params); + + model_dft = llama_init_dft.model; + ctx_dft = llama_init_dft.context; + + if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { + return 1; + } + + // Tokenize the prompt + std::vector inp; + inp = common_tokenize(ctx_tgt, params.prompt, true, true); + + if (llama_n_ctx(ctx_tgt) < (int) inp.size()) { + LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt)); + + return 1; + } + + if (llama_n_batch(ctx_tgt) < (int) inp.size()) { + LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt)); + + return 1; + } + + LOG("\n\n"); + + for (auto id : inp) { + LOG("%s", common_token_to_piece(ctx_tgt, id).c_str()); + } + + // how many tokens to draft each time + int n_draft = params.speculative.n_max; + int n_draft_min = params.speculative.n_min; + + float p_min = params.speculative.p_min; + + int n_predict = 0; + int n_drafted = 0; + int n_accept = 0; + + // used to determine end of generation + bool has_eos = false; + + // ================================================ + // everything until here is standard initialization + // the relevant stuff for speculative decoding starts here + + const auto t_enc_start = ggml_time_us(); + + // target model sampling context + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); + + // eval the prompt + llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1)); + + // note: keep the last token separate! 
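+    // it is not decoded here: on every speculative iteration it is placed at the
+    // front of the target batch together with the draft tokens, so its logits are
+    // recomputed right before the draft is verified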
+ llama_token id_last = inp.back(); + + // all tokens currently in the target context + auto prompt_tgt = std::vector(inp.begin(), inp.end() - 1); + + int n_past = inp.size() - 1; + + // init the speculator + struct common_speculative_params params_spec; + params_spec.n_draft = n_draft; + params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; + params_spec.p_min = p_min; + + struct common_speculative * spec = common_speculative_init(ctx_dft); + + llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); + + const auto t_enc_end = ggml_time_us(); + + const auto t_dec_start = ggml_time_us(); + + while (true) { + // optionally, generate draft tokens that can be appended to the target batch + // + // this is the most important part of the speculation. the more probable tokens that are provided here + // the better the performance will be. in theory, this computation can be performed asynchronously and even + // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens + // from a cache or lookup tables. + // + llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last); + + //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str()); + + // always have a token to evaluate from before - id_last + common_batch_clear(batch_tgt); + common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true); + + // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1] + { + // do not waste time on small drafts + if (draft.size() < n_draft_min) { + draft.clear(); + } + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + } + + //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str()); + + llama_decode(ctx_tgt, batch_tgt); + } + + // sample from the full target batch and return the accepted tokens based on the target sampler + // + // for each token to be accepted, the sampler would have to sample that same token + // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the + // available logits from the batch and sample the next token until we run out of logits or the sampler + // disagrees with the draft + // + const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft); + + //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str()); + + GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token + + n_past += ids.size() - 1; + n_drafted += batch_tgt.n_tokens - 1; + n_accept += ids.size() - 1; + + // process the accepted tokens and update contexts + // + // this is the standard token post-processing that we normally do + // in this case, we do it for a group of accepted tokens at once + // + { + llama_token id; + std::string token_str; + + for (size_t i = 0; i < ids.size(); ++i) { + id = ids[i]; + + ++n_predict; + + if (llama_token_is_eog(model_tgt, id)) { + has_eos = true; + break; + } + + token_str = common_token_to_piece(ctx_tgt, id); + + if (params.use_color && i + 1 < ids.size()) { + LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str()); + } else { + LOG("%s", token_str.c_str()); + } + } + + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { + break; + } + + LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str()); + + { + LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); + + llama_kv_cache_seq_rm(ctx_tgt, 0, 
n_past, -1); + } + + prompt_tgt.push_back(id_last); + prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1); + + // remember the last accepted token for the next iteration + id_last = id; + } + } + + auto t_dec_end = ggml_time_us(); + + const int n_input = inp.size(); + + LOG("\n\n"); + + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + LOG_INF("\n"); + LOG_INF("draft:\n\n"); + + llama_perf_context_print(ctx_dft); + + LOG_INF("\n"); + LOG_INF("target:\n\n"); + common_perf_print(ctx_tgt, smpl); + + common_sampler_free(smpl); + common_speculative_free(spec); + + llama_free(ctx_tgt); + llama_free_model(model_tgt); + + llama_free(ctx_dft); + llama_free_model(model_dft); + + llama_backend_free(); + + LOG("\n\n"); + + return 0; +} diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 6cafd8a837992..eb8bb2de54a26 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -12,7 +12,7 @@ #include #include -#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 struct seq_draft { @@ -33,7 +33,7 @@ int main(int argc, char ** argv) { common_params params; // needed to get candidate probs even for temp <= 0.0 - params.sparams.n_probs = 128; + params.sampling.n_probs = 128; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; @@ -46,7 +46,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.model_draft.empty()) { + if (params.speculative.model.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -55,9 +55,9 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // probability threshold for splitting a draft branch (only for n_seq_dft > 1) - const float p_split = params.p_split; + const float p_draft_split = params.speculative.p_split; - std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed); + std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? 
std::random_device()() : params.sampling.seed); std::uniform_real_distribution<> u_dist; // init llama.cpp @@ -76,13 +76,13 @@ int main(int argc, char ** argv) { ctx_tgt = llama_init_tgt.context; // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.draft_cpuparams.n_threads > 0) { - params.cpuparams.n_threads = params.draft_cpuparams.n_threads; + params.model = params.speculative.model; + params.n_gpu_layers = params.speculative.n_gpu_layers; + if (params.speculative.cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.speculative.cpuparams.n_threads; } - params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; + params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; common_init_result llama_init_dft = common_init_from_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; @@ -170,7 +170,7 @@ int main(int argc, char ** argv) { //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); // how many tokens to draft each time - int n_draft = params.n_draft; + int n_draft = params.speculative.n_max; int n_predict = 0; int n_drafted = 0; @@ -183,14 +183,14 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context (reuse the llama_context's sampling instance) - struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); // draft sequence data std::vector drafts(n_seq_dft); for (int s = 0; s < n_seq_dft; ++s) { // allocate llama_sampler for each draft sequence - drafts[s].smpl = common_sampler_init(model_dft, params.sparams); + drafts[s].smpl = common_sampler_init(model_dft, params.sampling); } llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); @@ -230,7 +230,7 @@ int main(int argc, char ** argv) { // for stochastic sampling, attempt to match the token with the drafted tokens { bool accept = false; - if (params.sparams.temp > 0) { + if (params.sampling.temp > 0) { // stochastic verification common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); @@ -494,7 +494,7 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { + if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 3665238b5a2d8..69604b87ceec4 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -70,7 +70,7 @@ int main(void) { // non-existence arg in specific example (--draft cannot be used outside llama-speculative) argv = {"binary_name", "--draft", "123"}; - assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); printf("test-arg-parser: test valid usage\n\n"); @@ -96,7 +96,7 @@ int main(void) { // --draft cannot be used outside llama-speculative argv = {"binary_name", "--draft", "123"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); - assert(params.n_draft == 123); + assert(params.speculative.n_max == 123); // skip this part on 
windows, because setenv is not supported #ifdef _WIN32 From 5a8987793f3e7c1fbfa6806bfcd17d578071b6c9 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Mon, 25 Nov 2024 17:31:10 +0800 Subject: [PATCH 21/32] [SYCL] Fix building Win package for oneAPI 2025.0 update (#10483) * fix build package for 2025.0 * debug * debug * fix * rm debug --------- Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> --- .github/workflows/build.yml | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 572f91643fbba..abaf2c504c8f4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -952,7 +952,7 @@ jobs: env: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" steps: - name: Clone @@ -962,7 +962,8 @@ jobs: fetch-depth: 0 - name: Install - run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - name: Build id: cmake_build @@ -981,27 +982,34 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi - - name: Pack artifacts + - name: Build the release package id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + echo "cp oneAPI running time dll files to ./build/bin done" 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + - name: Upload the release package + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip From b756441104ca8384640d6df22ba4ea6dab7ad799 Mon 
Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 15:08:04 +0200 Subject: [PATCH 22/32] metal : minor code formatting --- ggml/src/ggml-metal/ggml-metal.m | 598 +++++++++++++-------------- ggml/src/ggml-metal/ggml-metal.metal | 43 +- 2 files changed, 323 insertions(+), 318 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d1abb3cef0ec4..3a533d7f9c9af 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1951,316 +1951,316 @@ static void ggml_metal_encode_node( } #endif - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if ([device supportsFamily:MTLGPUFamilyApple7] && - !ggml_is_transposed(src0) && - !ggml_is_transposed(src1) && - src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { - //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - // some Metal matrix data types require aligned pointers - // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; - case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; - default: break; - } + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && ne00 >= 64 && + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; - case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL MAT-MAT not implemented"); - } + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; + } - ggml_metal_kargs_mul_mm args = { - /*.ne00 =*/ ne00, - /*.ne02 =*/ ne02, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - /*.r2 =*/ r2, - /*.r3 =*/ r3, - }; + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline 
= ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; + default: GGML_ABORT("MUL MAT-MAT not implemented"); + } + + ggml_metal_kargs_mul_mm args = { + /*.ne00 =*/ ne00, + /*.ne02 =*/ ne02, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - id pipeline = nil; + id pipeline = nil; - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + nrows = 4; + } break; + case GGML_TYPE_F16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; nrows = 4; - } break; - case GGML_TYPE_F16: - { - nth0 = 32; - nth1 = 1; - if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; - nrows = ne11; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; - nrows = 4; - } - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; - nrows = 4; - } - } break; - case GGML_TYPE_BF16: - { - nth0 = 32; - 
nth1 = 1; - if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; - nrows = ne11; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; - nrows = 4; - } - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; - nrows = 4; - } - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; - } break; - case GGML_TYPE_IQ2_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_M: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; - } break; - case GGML_TYPE_IQ4_NL: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; - } break; - case GGML_TYPE_IQ4_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline; - } break; - default: - { - GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ABORT("not implemented"); } - }; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; + nrows = 4; + } + } break; + case GGML_TYPE_BF16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; + nrows = 4; + } + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; + nrows = 4; + } + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; + } break; + case GGML_TYPE_IQ2_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; + } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; + } break; + case GGML_TYPE_IQ4_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline; + } break; + default: + { + GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_ABORT("not implemented"); + } + }; - ggml_metal_kargs_mul_mv args = { - /*.ne00 =*/ ne00, - /*.ne01 =*/ ne01, - /*.ne02 =*/ ne02, - /*.nb00 =*/ nb00, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.ne0 
=*/ ne0, - /*.ne1 =*/ ne1, - /*.r2 =*/ r2, - /*.r3 =*/ r3, - }; + ggml_metal_kargs_mul_mv args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { - const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { - const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 
256*4+128 : 512*4; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { - const int mem_size = 32*sizeof(float); - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q3_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 
256*4+128 : 512*4; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (ne11 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } } break; case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 971f5054bce2a..eaca38864bd65 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm( const int im = tgpig.z; // if this block is of 64x32 shape or smaller - short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; simdgroup_T8x8 ma[4]; simdgroup_float8x8 mb[2]; @@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm( const int i12 = im%args.ne12; const int i13 = im/args.ne12; - uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; - short offset1 = il/nl; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1; device const float * y = (device const float *)(src1 + args.nb13*i13 + args.nb12*i12 - + args.nb11*(r1 * BLOCK_SIZE_N + thread_col) + + args.nb11*(r1*BLOCK_SIZE_N + thread_col) + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll(16) @@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm( + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); + *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); #pragma unroll(4) - for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) { device float * C = (device float *) dst + - (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \ - (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; for (short i = 0; i < 8; i++) { simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); @@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm( // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; for (short i = 0; i < 8; 
i++) { simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } From f6d12e7df8fe64384f1939976871252e6422a01e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 15:17:32 +0200 Subject: [PATCH 23/32] tests : fix compile warning --- tests/test-quantize-fns.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d0bf0470fe4e..c77c8ed1388d7 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { } // Total dot product error -static float dot_product_error( - const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2 -) { +static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) { + GGML_UNUSED(qfns); + std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); From 5931c1f233c616083d64e41a228249d58e039aa5 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 25 Nov 2024 15:13:39 +0100 Subject: [PATCH 24/32] ggml : add support for dynamic loading of backends (#10469) * ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov --- Makefile | 3 +- Package.swift | 3 +- common/common.cpp | 3 + examples/CMakeLists.txt | 27 +-- examples/eval-callback/CMakeLists.txt | 3 +- examples/llama-bench/llama-bench.cpp | 15 +- examples/main/main.cpp | 12 +- examples/simple-chat/simple-chat.cpp | 3 + examples/simple/simple.cpp | 4 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-backend.h | 15 ++ ggml/include/ggml-cpu.h | 38 +--- ggml/include/ggml.h | 31 +++ ggml/src/CMakeLists.txt | 41 +++- ggml/src/ggml-amx/CMakeLists.txt | 10 +- ggml/src/ggml-amx/ggml-amx.cpp | 7 +- ggml/src/ggml-backend-impl.h | 44 +++-- ggml/src/ggml-backend-reg.cpp | 252 +++++++++++++++++++++++-- ggml/src/ggml-blas/CMakeLists.txt | 9 +- ggml/src/ggml-blas/ggml-blas.cpp | 7 +- ggml/src/ggml-cann/CMakeLists.txt | 6 +- ggml/src/ggml-cann/ggml-cann.cpp | 13 +- ggml/src/ggml-cpu/CMakeLists.txt | 19 +- ggml/src/ggml-cpu/ggml-cpu.c | 23 --- ggml/src/ggml-cpu/ggml-cpu.cpp | 50 +++-- ggml/src/ggml-cuda/CMakeLists.txt | 11 +- ggml/src/ggml-cuda/ggml-cuda.cu | 71 ++++++- ggml/src/ggml-hip/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/ggml-kompute.cpp | 7 +- ggml/src/ggml-metal/CMakeLists.txt | 9 +- ggml/src/ggml-metal/ggml-metal.m | 34 +++- ggml/src/ggml-musa/CMakeLists.txt | 10 +- ggml/src/ggml-rpc/CMakeLists.txt | 8 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 7 +- ggml/src/ggml-sycl/CMakeLists.txt | 10 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +- ggml/src/ggml.c | 23 +++ pocs/CMakeLists.txt | 4 +- src/llama.cpp | 77 ++++---- tests/CMakeLists.txt | 13 +- tests/test-backend-ops.cpp | 26 +-- 44 files changed, 728 insertions(+), 272 deletions(-) diff --git a/Makefile b/Makefile index dd6d864ad513a..14c05e93e7535 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 @@ -290,6 +290,7 @@ endif 
# some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, diff --git a/Package.swift b/Package.swift index 6b68aecdebec1..d9e8a4e2d21d7 100644 --- a/Package.swift +++ b/Package.swift @@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), + .define("GGML_USE_CPU") ] ) #endif diff --git a/common/common.cpp b/common/common.cpp index c398329d05bf5..98524f7467ab4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -377,6 +377,9 @@ void common_init() { #endif LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + + // load dynamic backends + ggml_backend_load_all(); } std::string common_params_get_system_info(const common_params & params) { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9bd099d4ef8a5..632409d5591b9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -27,24 +24,16 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) add_subdirectory(simple) @@ -52,4 +41,18 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..5d1048aad74b6 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST 
${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75cbec7..bac606f471639 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 957451af7ce0a..d0c28f317b8c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 5f9973163732d..7f4da666b08ec 100644 --- a/examples/simple-chat/simple-chat.cpp +++ 
b/examples/simple-chat/simple-chat.cpp @@ -62,6 +62,9 @@ int main(int argc, char ** argv) { } }, nullptr); + // load dynamic backends + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 59760fe95db22..3288c0250a001 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -74,6 +74,10 @@ int main(int argc, char ** argv) { } } + // load dynamic backends + + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 2d32da1b6d879..70b5cfdf7fbb4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -33,6 +33,7 @@ else() endif() option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) +option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) # # option list diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index cef164764bb1a..19881a5059f17 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -190,6 +190,14 @@ extern "C" { typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); // Get additional buffer types provided by the device (returns a NULL-terminated array) typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); + // Set the abort callback for the backend + typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); + // Get a list of feature flags supported by the backend (returns a NULL-terminated array) + struct ggml_backend_feature { + const char * name; + const char * value; + }; + typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); // // Backend registry @@ -214,6 +222,13 @@ extern "C" { // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); + // Load a backend from a dynamic library and register it + GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Unload a backend if loaded dynamically and unregister it + GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); + // Load all known backends from dynamic libraries + GGML_API void ggml_backend_load_all(void); + // // Backend scheduler // diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..a5358d047a08e 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -7,29 +7,6 @@ extern "C" { #endif - // Scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // Threadpool params - // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct 
ggml_threadpool * ggml_threadpool_t; - // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -75,14 +52,11 @@ extern "C" { GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -104,10 +78,10 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_fma (void); - GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 69e6a24344b97..9843b09fbe83e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2215,6 +2215,37 @@ extern "C" { GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); + // ggml threadpool + // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend + // the goal should be to create an API that other backends can use move everything to the ggml base + + // scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool 
paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8df0e85c0d092..071508ddae021 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -202,6 +202,10 @@ endif() # ggml +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") +endif() + add_library(ggml-base ../include/ggml.h ../include/ggml-alloc.h @@ -226,6 +230,31 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) + + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + function(ggml_add_backend backend) string(TOUPPER "GGML_${backend}" backend_id) if (${backend_id}) @@ -236,14 +265,10 @@ function(ggml_add_backend backend) # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp if (${backend_id}) message(STATUS "Including ${backend} backend") - if (${BUILD_SHARED_LIBS}) - target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) - target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() - install(TARGETS ${backend_target} LIBRARY) - target_link_libraries(ggml PUBLIC ${backend_target}) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() @@ -256,10 +281,10 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) -ggml_add_backend(MUSA) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt index d6676f3f67b20..cf3ade6f020ed 100644 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ b/ggml/src/ggml-amx/CMakeLists.txt @@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA file(GLOB GGML_SOURCES_AMX "*.cpp") - add_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX}) - - target_link_libraries(ggml-amx PRIVATE ggml-base) - target_include_directories(ggml-amx PRIVATE . ..) 
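As an aside on the threadpool declarations moved into ggml.h above: a rough sketch (not part of this patch) of the intended calling pattern once the CPU backend may itself be a dynamically loaded library. The parameter helpers come from the base API, while ggml_threadpool_new is resolved through the registry, mirroring the llama-bench and main changes in this series; make_threadpool is only an illustrative helper name.

    /*
     * Rough usage sketch: create a threadpool without linking the CPU backend directly.
     * ggml_threadpool_params_default() comes from ggml.h (base API); the creation
     * function is looked up through the backend registry at runtime.
     */
    #include "ggml.h"
    #include "ggml-backend.h"
    #include <stddef.h>

    static struct ggml_threadpool * make_threadpool(int n_threads) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == NULL) {
            return NULL; // CPU backend not loaded
        }
        ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        // resolve the CPU backend entry point instead of calling it directly
        typedef struct ggml_threadpool * (*threadpool_new_fn)(struct ggml_threadpool_params *);
        threadpool_new_fn tp_new =
            (threadpool_new_fn) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
        if (tp_new == NULL) {
            return NULL;
        }

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
        return tp_new(&tpp);
    }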
+ ggml_add_backend_library(ggml-amx + ${GGML_HEADERS_AMX} + ${GGML_SOURCES_AMX} + ) # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # TODO: integrate AMX backend into the CPU backend diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp index 8568e7965fd2e..6bfb3da274c39 100644 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ b/ggml/src/ggml-amx/ggml-amx.cpp @@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { ggml_backend_reg_t ggml_backend_amx_reg(void) { static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_amx_reg_i, + /* .context = */ NULL, }; return &ggml_backend_amx_reg; @@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) { } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index fa8d5b7fb68c9..dff7749b416dc 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -8,6 +8,8 @@ extern "C" { #endif + #define GGML_BACKEND_API_VERSION 1 + // // Backend buffer type // @@ -63,20 +65,20 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, void * context, size_t size); // do not use directly, use ggml_backend_tensor_copy instead - bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // multi-buffer // buffer that contains a collection of buffers - ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // // Backend (stream) @@ -199,17 +201,37 @@ extern "C" { }; struct ggml_backend_reg { - // int api_version; // TODO: for dynamic loading + int api_version; // initialize to GGML_BACKEND_API_VERSION struct ggml_backend_reg_i iface; void * context; }; - // Internal backend registry API - void ggml_backend_register(ggml_backend_reg_t reg); - void ggml_backend_device_register(ggml_backend_dev_t device); - // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function - // typedef ggml_backend_register_t * (*ggml_backend_init)(void); + GGML_API void ggml_backend_register(ggml_backend_reg_t reg); + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); + + // Add backend dynamic loading support to the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + + #ifdef GGML_BACKEND_DL + #ifdef __cplusplus + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return 
reg_fn(); \ + } + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } + #endif + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) + #endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..43d03d7fa7385 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,11 +1,29 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml-impl.h" +#include #include +#include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#elif defined(__APPLE__) +# include +# include +#else +# include +# include +#endif + // Backend registry +#ifdef GGML_USE_CPU +#include "ggml-cpu.h" +#endif #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -43,8 +61,13 @@ #include "ggml-kompute.h" #endif +struct ggml_backend_reg_entry { + ggml_backend_reg_t reg; + void * handle; +}; + struct ggml_backend_registry { - std::vector backends; + std::vector backends; std::vector devices; ggml_backend_registry() { @@ -75,11 +98,19 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif - +#ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); +#endif } - void register_backend(ggml_backend_reg_t reg) { + ~ggml_backend_registry() { + while (!backends.empty()) { + // use silent since the log system may have been destroyed at this point + unload_backend(backends.back().reg, true); + } + } + + void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { if (!reg) { return; } @@ -88,7 +119,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back(reg); + backends.push_back({ reg, handle }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -100,6 +131,111 @@ struct ggml_backend_registry { #endif devices.push_back(device); } + + ggml_backend_reg_t load_backend(const char * path, bool silent) { +#ifdef _WIN32 + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryA(path); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + } + SetErrorMode(old_mode); + return nullptr; + } + + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + + SetErrorMode(old_mode); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + } + FreeLibrary(handle); + return nullptr; + } +#else + void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + } + return nullptr; + } + + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + } + dlclose(handle); + return nullptr; + } +#endif + ggml_backend_reg_t reg = backend_init(); + + if (!reg || reg->api_version != 
GGML_BACKEND_API_VERSION) { + if (!silent) { + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); + } else { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + } + } +#ifdef _WIN32 + FreeLibrary(handle); +#else + dlclose(handle); +#endif + return nullptr; + } + + GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + register_backend(reg, handle); + return reg; + } + + void unload_backend(ggml_backend_reg_t reg, bool silent) { + auto it = std::find_if(backends.begin(), backends.end(), + [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + + if (it == backends.end()) { + if (!silent) { + GGML_LOG_ERROR("%s: backend not found\n", __func__); + } + return; + } + + if (!silent) { + GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + } + + // remove devices + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), + devices.end()); + + // unload library + if (it->handle) { +#ifdef _WIN32 + FreeLibrary((HMODULE) it->handle); +#else + dlclose(it->handle); +#endif + } + + // remove backend + backends.erase(it); + } }; static ggml_backend_registry & get_reg() { @@ -123,7 +259,7 @@ size_t ggml_backend_reg_count() { ggml_backend_reg_t ggml_backend_reg_get(size_t index) { GGML_ASSERT(index < ggml_backend_reg_count()); - return get_reg().backends[index]; + return get_reg().backends[index].reg; } ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { @@ -133,7 +269,7 @@ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { return reg; } } - return NULL; + return nullptr; } // Device enumeration @@ -153,7 +289,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { return dev; } } - return NULL; + return nullptr; } ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { @@ -163,14 +299,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { return dev; } } - return NULL; + return nullptr; } // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -178,7 +314,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -189,7 +325,97 @@ ggml_backend_t ggml_backend_init_best(void) { dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } if (!dev) { - return NULL; + return nullptr; } - return ggml_backend_dev_init(dev, NULL); + return ggml_backend_dev_init(dev, nullptr); +} + +// Dynamic loading +ggml_backend_reg_t ggml_backend_load(const char * path) { + return get_reg().load_backend(path, false); +} + +void ggml_backend_unload(ggml_backend_reg_t reg) { + get_reg().unload_backend(reg, true); +} + +void ggml_backend_load_all() { + std::vector search_prefix; + + // add the executable directory to the search path + // FIXME: this is convenient for development, but 
it should probably be disabled in production + +#if defined(__APPLE__) + // get executable path + std::vector path; + uint32_t size; + while (true) { + size = path.size(); + if (_NSGetExecutablePath(path.data(), &size) == 0) { + break; + } + path.resize(size); + } + std::string base_path(path.data(), size); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + search_prefix.push_back(base_path + "/"); +#elif defined(__linux__) + std::string base_path = "."; + std::vector path(1024); + while (true) { + // get executable path + ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); + if (len == -1) { + break; + } + if (len < (ssize_t) path.size()) { + base_path = std::string(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + break; + } + path.resize(path.size() * 2); + } + + search_prefix.push_back(base_path + "/"); +#endif + + auto & reg = get_reg(); + + auto try_load = [&](const std::string & name) { + std::string os_name; +#ifdef _WIN32 + os_name = "ggml-" + name + ".dll"; +#else + os_name = "libggml-" + name + ".so"; +#endif + if (reg.load_backend(os_name.c_str(), true)) { + return; + } + for (const auto & prefix : search_prefix) { + if (reg.load_backend((prefix + os_name).c_str(), true)) { + return; + } + } + }; + + try_load("amx"); + try_load("blas"); + try_load("cann"); + try_load("cuda"); + try_load("hip"); + try_load("kompute"); + try_load("metal"); + try_load("rpc"); + try_load("sycl"); + try_load("vulkan"); + try_load("musa"); + try_load("cpu"); } diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index e2cbabf0dae74..0bf3c05d93a89 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -11,12 +11,9 @@ find_package(BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - add_library(ggml-blas - ggml-blas.cpp - ) - - target_link_libraries(ggml-blas PRIVATE ggml-base) - target_include_directories(ggml-blas PRIVATE . ..) + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) if (${GGML_BLAS_VENDOR} MATCHES "Apple") add_compile_definitions(ACCELERATE_NEW_LAPACK) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 648c9d875e346..ec158dfac6e3e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { ggml_backend_reg_t ggml_backend_blas_reg(void) { static struct ggml_backend_reg ggml_backend_blas_reg = { - /* .iface = */ ggml_backend_blas_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, }; return &ggml_backend_blas_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 756200b893d02..901327185fb75 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR) file(GLOB GGML_SOURCES_CANN "*.cpp") - add_library(ggml-cann ${GGML_SOURCES_CANN}) - target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) - target_include_directories(ggml-cann PRIVATE . .. 
${CANN_INCLUDE_DIRS}) + ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN}) + target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES}) + target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..d96f65936136d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2064,16 +2064,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() { dev_ctx->name = GGML_CANN_NAME + std::to_string(i); ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cann_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cann_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx }; } @@ -2126,3 +2127,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 2880523331dbd..c2905d1fbf4e8 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,14 +1,13 @@ -add_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + ) -target_link_libraries(ggml-cpu PRIVATE ggml-base) -target_include_directories(ggml-cpu PRIVATE . ..) +target_include_directories(ggml-cpu PRIVATE .) 
if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4b58254e7d108..c6ede19d9d1c0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int #endif // GGML_USE_OPENMP -void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..febed433ada2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg return &ggml_backend_cpu_device; } -struct ggml_backend_feature { - const char * name; - const char * value; -}; - -// Not used yet // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, -// and additionally to allow other backends to expose their own list of features that applications can query using the same API. 
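To show how an application is expected to consume the feature list assembled below, a minimal sketch using only the registry API added in this patch; print_backend_features is an illustrative helper, and the loop relies on the NULL-name sentinel pushed at the end of the list.

    /*
     * Rough usage sketch: enumerate the feature flags a backend exposes through
     * the optional ggml_backend_get_features hook. The returned array is
     * terminated by an entry whose name is NULL.
     */
    #include "ggml-backend.h"
    #include <stdio.h>

    static void print_backend_features(const char * backend_name) {
        ggml_backend_reg_t reg = ggml_backend_reg_by_name(backend_name);
        if (reg == NULL) {
            return;
        }
        ggml_backend_get_features_t get_features =
            (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features == NULL) {
            return; // backend does not implement the feature query
        }
        for (struct ggml_backend_feature * f = get_features(reg); f->name != NULL; f++) {
            printf("%s: %s = %s\n", backend_name, f->name, f->value);
        }
    }

For example, print_backend_features("CPU") would list the SSE3/AVX/AVX_VNNI/... flags built up in ggml_backend_cpu_get_features above.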
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static std::vector features = []() { + ggml_cpu_init(); + std::vector features; if (ggml_cpu_has_sse3()) { features.push_back({ "SSE3", "1" }); @@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_avx()) { features.push_back({ "AVX", "1" }); } + if (ggml_cpu_has_avx_vnni()) { + features.push_back({ "AVX_VNNI", "1" }); + } if (ggml_cpu_has_avx2()) { features.push_back({ "AVX2", "1" }); } @@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } - if (ggml_cpu_has_avx_vnni()) { - features.push_back({ "AVX_VNNI", "1" }); - } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } @@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } + // TODO: rename this + #ifdef GGML_USE_CPU_AARCH64 + features.push_back({ "AARCH64_REPACK", "1" }); + #endif features.push_back({ nullptr, nullptr }); @@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { return (void *)ggml_backend_cpu_get_extra_bufts; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cpu_get_features; + } + if (strcmp(name, "ggml_backend_set_abort_callback") == 0) { + return (void *)ggml_backend_cpu_set_abort_callback; + } + if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) { + return (void *)ggml_numa_init; + } + if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { + return (void *)ggml_is_numa; + } + + // threadpool - TODO: move to ggml-base + if (strcmp(name, "ggml_threadpool_new") == 0) { + return (void *)ggml_threadpool_new; + } + if (strcmp(name, "ggml_threadpool_free") == 0) { + return (void *)ggml_threadpool_free; + } + if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) { + return (void *)ggml_backend_cpu_set_threadpool; + } return NULL; @@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { ggml_cpu_init(); static struct ggml_backend_reg ggml_backend_cpu_reg = { - /* .iface = */ ggml_backend_cpu_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, }; return &ggml_backend_cpu_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index e1482a269d698..b0cb93e070fd3 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) endif() - add_library(ggml-cuda - ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_CUDA} - ) - - target_link_libraries(ggml-cuda PRIVATE ggml-base) - target_include_directories(ggml-cuda PRIVATE . ..) 
+ ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dd94ab03d5b6c..2a78a4393d0f7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } +static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) { + static std::vector features = []() { + std::vector features; + #define _STRINGIFY(...) #__VA_ARGS__ + #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__) + + #ifdef __CUDA_ARCH_LIST__ + features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) }); + #endif + + #ifdef GGML_CUDA_FORCE_MMQ + features.push_back({ "FORCE_MMQ", "1" }); + #endif + + #ifdef GGML_CUDA_FORCE_CUBLAS + features.push_back({ "FORCE_CUBLAS", "1" }); + #endif + + #ifdef GGML_CUDA_NO_VMM + features.push_back({ "NO_VMM", "1" }); + #endif + + #ifdef GGML_CUDA_NO_PEER_COPY + features.push_back({ "NO_PEER_COPY", "1" }); + #endif + + #ifdef GGML_CUDA_F16 + features.push_back({ "F16", "1" }); + #endif + + #ifdef GGML_CUDA_USE_GRAPHS + features.push_back({ "USE_GRAPHS", "1" }); + #endif + + #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) }); + #endif + + #ifdef GGML_CUDA_FA_ALL_QUANTS + features.push_back({ "FA_ALL_QUANTS", "1" }); + #endif + + #undef _STRINGIFY + #undef STRINGIFY + + features.push_back({ nullptr, nullptr }); + + return features; + }(); + + return features.data(); + + GGML_UNUSED(reg); +} + static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { @@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { return (void *)ggml_backend_cuda_unregister_host_buffer; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cuda_get_features; + } return nullptr; } @@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->description = prop.name; ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cuda_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cuda_reg_interface, + /* .context = */ ctx }; } @@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg) diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index fccf8eb8440b8..b15fbd24d6b36 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -64,12 +64,10 @@ else() list(APPEND GGML_SOURCES_ROCM ${SRCS}) endif() -add_library(ggml-hip - ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_ROCM}) - -target_link_libraries(ggml-hip PRIVATE ggml-base) -target_include_directories(ggml-hip PRIVATE . ..) 
+ggml_add_backend_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM} + ) # TODO: do not use CUDA definitions for HIP target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt index 0bd027c7f537e..dc623926c7685 100644 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ b/ggml/src/ggml-kompute/CMakeLists.txt @@ -6,13 +6,13 @@ if (NOT glslc_executable) message(FATAL_ERROR "glslc not found") endif() -add_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) +ggml_add_backend_library(ggml-kompute + ggml-kompute.cpp + ../../include/ggml-kompute.h + ) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 2fea9e4cc8d38..24566404ded0f 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { ggml_backend_reg_t ggml_backend_kompute_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_kompute_reg_i, + /* .context = */ nullptr, }; return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index b237d79f47ddb..1bad272068244 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") -add_library(ggml-metal - ggml-metal.m - ) +ggml_add_backend_library(ggml-metal + ggml-metal.m + ) target_link_libraries(ggml-metal PRIVATE - ggml-base ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} ) -target_include_directories(ggml-metal PRIVATE . ..) 
- if (GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG) endif() diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3a533d7f9c9af..63baaf163581c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r GGML_UNUSED(index); } +static struct ggml_backend_feature g_ggml_backend_metal_features[] = { +#if defined(GGML_METAL_EMBED_LIBRARY) + { "EMBED_LIBRARY", "1" }, +#endif +#if defined(GGML_METAL_USE_BF16) + { "BF16", "1" }, +#endif + { nil, nil }, +}; + +static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) { + return g_ggml_backend_metal_features; + + GGML_UNUSED(reg); +} + +static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_metal_get_features; + } + + return NULL; + + GGML_UNUSED(reg); +} static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { /* .get_name = */ ggml_backend_metal_reg_get_name, /* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_get = */ ggml_backend_metal_reg_device_get, - /* .get_proc_address = */ NULL, + /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; ggml_backend_reg_t ggml_backend_metal_reg(void) { // TODO: make this thread-safe somehow? { g_ggml_backend_metal_reg = (struct ggml_backend_reg) { - /* .iface = */ ggml_backend_metal_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ NULL, }; g_ggml_backend_metal_device = (struct ggml_backend_device) { @@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) { return &g_ggml_backend_metal_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg) diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index f3c0136920540..e1a69186e669f 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND) set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") endforeach() - add_library(ggml-musa - ${GGML_HEADERS_MUSA} - ${GGML_SOURCES_MUSA}) - - target_link_libraries(ggml-musa PRIVATE ggml-base) - target_include_directories(ggml-musa PRIVATE . ..) + ggml_add_backend_library(ggml-musa + ${GGML_HEADERS_MUSA} + ${GGML_SOURCES_MUSA} + ) # TODO: do not use CUDA definitions for MUSA target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt index a2d6770eb053f..f5acb8ec2cb28 100644 --- a/ggml/src/ggml-rpc/CMakeLists.txt +++ b/ggml/src/ggml-rpc/CMakeLists.txt @@ -1,10 +1,8 @@ message(STATUS "Using RPC backend") -add_library(ggml-rpc - ggml-rpc.cpp) - -target_link_libraries(ggml-rpc PRIVATE ggml-base) -target_include_directories(ggml-rpc PRIVATE . ..) 
+ggml_add_backend_library(ggml-rpc + ggml-rpc.cpp + ) if (WIN32) target_link_libraries(ggml-rpc PRIVATE ws2_32) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 47357daabdf54..43108242639a3 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { ggml_backend_reg_t ggml_backend_rpc_reg(void) { static struct ggml_backend_reg ggml_backend_rpc_reg = { - /* .iface = */ ggml_backend_rpc_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, }; return &ggml_backend_rpc_reg; @@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { return dev; } + +GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index d1d0ff83d636c..83f223fd7b6fc 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -16,12 +16,10 @@ endif() message(STATUS "SYCL found") #todo: AOT -add_library(ggml-sycl - ggml-sycl.cpp - ../../include/ggml-sycl.h) - -target_link_libraries(ggml-sycl PRIVATE ggml-base) -target_include_directories(ggml-sycl PRIVATE . ..) +ggml_add_backend_library(ggml-sycl + ggml-sycl.cpp + ../../include/ggml-sycl.h + ) if (GGML_SYCL_F16) if (GGML_SYCL_TARGET STREQUAL "AMD") diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 255bc64c6badd..b6392ed8dcc6a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4637,16 +4637,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() { dev_ctx->description = prop.get_name(); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_sycl_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_sycl_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_sycl_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_sycl_reg_interface, + /* .context = */ ctx }; } @@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) { return sycl_backend; } +GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 1e85dd15b7ab1..ae0485e04255d 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED) if (Vulkan_FOUND) message(STATUS "Vulkan found") - add_library(ggml-vulkan - ggml-vulkan.cpp - ../../include/ggml-vulkan.h - ) + ggml_add_backend_library(ggml-vulkan + ggml-vulkan.cpp + ../../include/ggml-vulkan.h + ) - target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) - target_include_directories(ggml-vulkan PRIVATE . .. 
${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) + target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ca71da2f7b7f5..49527fdf40e94 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { ggml_backend_reg_t ggml_backend_vk_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_vk_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, }; return ® @@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 78e7874dee04d..1a2318cb188c4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt index 03e1d2c04be65..d49d14dee4351 100644 --- a/pocs/CMakeLists.txt +++ b/pocs/CMakeLists.txt @@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(vdot) + if (NOT GGML_BACKEND_DL) + add_subdirectory(vdot) + endif() endif() diff --git a/src/llama.cpp b/src/llama.cpp index 20df09b133bfb..83bbc10a57a43 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4866,7 +4866,9 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? 
-1 : 0, is_numa_fn())); mmaps_used.emplace_back(mapping->size, 0); if (mlock_mmaps) { std::unique_ptr mlock_mmap(new llama_mlock()); @@ -9190,7 +9192,7 @@ static bool llm_load_tensors( ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device - dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute( int n_threads, ggml_threadpool * threadpool) { if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); - ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(lctx.backend_cpu, threadpool); } // set the number of threads for all the backends @@ -19478,7 +19481,11 @@ void llama_backend_init(void) { void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { - ggml_numa_init(numa); + auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + GGML_ASSERT(dev && "CPU backend is not loaded"); + auto * reg = ggml_backend_dev_backend_reg(dev); + auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); + numa_init_fn(numa); } } @@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - ctx->abort_callback = params.abort_callback; - ctx->abort_callback_data = params.abort_callback_data; - ctx->logits_all = params.logits_all; // build worst-case graph for encoder if a model contains encoder @@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model( } // add CPU backend - ctx->backend_cpu = ggml_backend_cpu_init(); + ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); llama_free(ctx); @@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model( } } + llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); + if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); @@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model( std::vector backend_ptrs; for (auto & backend : ctx->backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model->devices[0]; auto * host_buft = ggml_backend_dev_host_buffer_type(dev); @@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model( // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { for 
(auto & backend : ctx->backends) { - if (ggml_backend_is_cpu(backend.get())) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend continue; } @@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; + + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + } + } } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { - ggml_cpu_init(); // some ARM features are detected at runtime - static std::string s; - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; - s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature * features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } return s.c_str(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b06f122e89873..82373ff4e1862 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU # llama_target_and_test(test-double-float.cpp) # SLOW 
llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) -llama_target_and_test(test-quantize-fns.cpp) -llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) llama_target_and_test(test-chat-template.cpp) llama_target_and_test(test-grammar-parser.cpp) llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-llama-grammar.cpp) -llama_target_and_test(test-barrier.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) -llama_target_and_test(test-rope.cpp) - llama_target_and_test(test-model-load-cancel.cpp LABEL "model") llama_target_and_test(test-autorelease.cpp LABEL "model") +if (NOT GGML_BACKEND_DL) + # these tests use the backends directly and cannot be built with dynamic loading + llama_target_and_test(test-barrier.cpp) + llama_target_and_test(test-quantize-fns.cpp) + llama_target_and_test(test-quantize-perf.cpp) + llama_target_and_test(test-rope.cpp) +endif() + # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b2b5705243ea2..6376b0e4c66cf 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -16,7 +16,6 @@ #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -639,19 +637,20 @@ struct test_case { // determine number of runs int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; if (op_flops(out) > 0) { // based on flops const uint64_t GFLOP = 1000 * 1000 * 1000; const uint64_t target_flops_cpu = 8ULL * GFLOP; const uint64_t target_flops_gpu = 100ULL * GFLOP; - uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu; + uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; } else { // based on memory size const size_t GB = 1ULL << 30; const size_t target_size_cpu = 8 * GB; const size_t target_size_gpu = 32 * GB; - size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu; + size_t target_size = is_cpu ? 
target_size_cpu : target_size_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; } @@ -3873,7 +3872,11 @@ static std::vector> make_test_cases_perf() { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { if (mode == MODE_TEST) { auto test_cases = make_test_cases_eval(); - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (backend_cpu == NULL) { + printf(" Failed to initialize CPU backend\n"); + return false; + } size_t n_ok = 0; for (auto & test : test_cases) { @@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) { } } - // enumerate backends + // load and enumerate backends + ggml_backend_load_all(); + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); size_t n_ok = 0; @@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) { continue; } - ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); - GGML_ASSERT(backend != NULL); - - if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { + if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) { printf(" Skipping CPU backend\n"); - ggml_backend_free(backend); n_ok++; continue; } + ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); + GGML_ASSERT(backend != NULL); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); if (ggml_backend_set_n_threads_fn) { From 9ca2e677626fce759d5d95c407c03677b9c87a26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 16:31:38 +0200 Subject: [PATCH 25/32] server : add speculative decoding support (#10455) * server : add speculative decoding support ggml-ci * server : add helper function slot.can_speculate() ggml-ci --- examples/server/server.cpp | 441 +++++++++++++++++++++++++------------ 1 file changed, 300 insertions(+), 141 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6c55d65c01330..f9d20fee5a6f6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,10 +2,11 @@ #include "arg.h" #include "common.h" -#include "log.h" -#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" +#include "log.h" +#include "sampling.h" +#include "speculative.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -121,12 +122,21 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; }; struct server_slot { int id; int id_task = -1; + llama_batch batch_spec; + + llama_context * ctx_dft = nullptr; + + common_speculative * spec = nullptr; + // the index relative to completion multi-task request size_t index = 0; @@ -175,7 +185,6 @@ struct server_slot { // sampling json json_schema; - struct common_params_sampling sparams; struct common_sampler * smpl = nullptr; llama_token sampled; @@ -212,7 +221,7 @@ struct server_slot { generated_token_probs.clear(); } - bool has_budget(common_params &global_params) { + bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -232,6 +241,10 @@ struct server_slot { 
return state != SLOT_STATE_IDLE; } + bool can_speculate() const { + return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt; + } + void add_token(const completion_token_output & token) { if (!is_processing()) { SLT_WRN(*this, "%s", "slot is not processing\n"); @@ -591,11 +604,14 @@ struct server_response { }; struct server_context { + common_params params_base; + llama_model * model = nullptr; llama_context * ctx = nullptr; std::vector loras; - common_params params; + llama_model * model_dft = nullptr; + llama_context_params cparams_dft; llama_batch batch = {}; @@ -628,27 +644,41 @@ struct server_context { model = nullptr; } + if (model_dft) { + llama_free_model(model_dft); + model_dft = nullptr; + } + // Clear any sampling context for (server_slot & slot : slots) { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } + common_sampler_free(slot.smpl); + slot.smpl = nullptr; + + llama_free(slot.ctx_dft); + slot.ctx_dft = nullptr; + + common_speculative_free(slot.spec); + slot.spec = nullptr; + + llama_batch_free(slot.batch_spec); } llama_batch_free(batch); } - bool load_model(const common_params & params_) { - params = params_; + bool load_model(const common_params & params) { + SRV_INF("loading model '%s'\n", params.model.c_str()); - common_init_result llama_init = common_init_from_params(params); + params_base = params; + + common_init_result llama_init = common_init_from_params(params_base); model = llama_init.model; ctx = llama_init.context; loras = llama_init.lora_adapters; if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); + SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); return false; } @@ -657,6 +687,40 @@ struct server_context { add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model); + if (!params_base.speculative.model.empty()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); + + auto params_dft = params_base; + + params_dft.model = params_base.speculative.model; + params_dft.n_ctx = params_base.speculative.n_ctx; + params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; + + common_init_result llama_init_dft = common_init_from_params(params_dft); + + model_dft = llama_init_dft.model; + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); + return false; + } + + if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { + SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); + + llama_free (llama_init_dft.context); + llama_free_model(llama_init_dft.model); + + return false; + } + + cparams_dft = common_context_params_to_llama(params_base); + cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); + + // the context is not needed - we will create one for each slot + llama_free(llama_init_dft.context); + } + return true; } @@ -674,20 +738,36 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); + SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); - for (int i = 0; i < params.n_parallel; i++) { + for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; slot.id = i; slot.n_ctx = n_ctx_slot; - slot.n_predict = params.n_predict; + slot.n_predict = 
params_base.n_predict; + + if (model_dft) { + slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + + slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); + if (slot.ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return; + } + + slot.spec = common_speculative_init(slot.ctx_dft); + if (slot.spec == nullptr) { + SRV_ERR("%s", "failed to create speculator\n"); + return; + } + } SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - slot.sparams = params.sampling; + slot.params.sampling = params_base.sampling; slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -707,7 +787,7 @@ struct server_context { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); @@ -786,9 +866,11 @@ struct server_context { } bool launch_slot_with_task(server_slot & slot, const server_task & task) { - slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - auto default_sparams = params.sampling; + slot_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -799,42 +881,48 @@ struct server_context { slot.oaicompat_model = ""; } - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); - slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); - slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); - slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); - slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); - slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); - slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); - slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier); - slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base); - slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length); - slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n); - slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - 
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep); - slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); - slot.sparams.seed = json_value(data, "seed", default_sparams.seed); - slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); - slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); - //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement - slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); - - if (slot.sparams.dry_base < 1.0f) - { - slot.sparams.dry_base = default_sparams.dry_base; + slot.params.stream = json_value(data, "stream", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent); + slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep); + slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + + slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + slot.params.sampling.mirostat_tau = 
json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); + slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + + slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); + + if (slot.params.sampling.dry_base < 1.0f) { + slot.params.sampling.dry_base = defaults.sampling.dry_base; } // sequence breakers for DRY @@ -843,8 +931,8 @@ struct server_context { // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 if (data.contains("dry_sequence_breakers")) { - slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (slot.sparams.dry_sequence_breakers.empty()) { + slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (slot.params.sampling.dry_sequence_breakers.empty()) { send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); return false; } @@ -858,14 +946,14 @@ struct server_context { } if (data.contains("json_schema") && !data.contains("grammar")) { try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); + auto schema = json_value(data, "json_schema", json::object()); + slot.params.sampling.grammar = json_schema_to_grammar(schema); } catch (const std::exception & e) { send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); return false; } } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -875,10 +963,10 @@ struct server_context { } { - slot.sparams.logit_bias.clear(); + slot.params.sampling.logit_bias.clear(); if (json_value(data, "ignore_eos", false) && has_eos_token) { - slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); } const auto & logit_bias = data.find("logit_bias"); @@ -899,12 +987,12 @@ struct server_context { if (el[0].is_number_integer()) { llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot.sparams.logit_bias.push_back({tok, bias}); + slot.params.sampling.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { auto toks = common_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot.sparams.logit_bias.push_back({tok, bias}); + slot.params.sampling.logit_bias.push_back({tok, bias}); } } } @@ -935,16 +1023,16 @@ struct server_context { sampler_names.emplace_back(name); } } - slot.sparams.samplers = 
common_sampler_types_from_names(sampler_names, false); + slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); } else if (samplers->is_string()){ std::string sampler_string; for (const auto & name : *samplers) { sampler_string += name; } - slot.sparams.samplers = common_sampler_types_from_chars(sampler_string); + slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string); } } else { - slot.sparams.samplers = default_sparams.samplers; + slot.params.sampling.samplers = defaults.sampling.samplers; } } @@ -953,7 +1041,7 @@ struct server_context { common_sampler_free(slot.smpl); } - slot.smpl = common_sampler_init(model, slot.sparams); + slot.smpl = common_sampler_init(model, slot.params.sampling); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -961,6 +1049,12 @@ struct server_context { } } + if (slot.ctx_dft) { + llama_batch_free(slot.batch_spec); + + slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1); + } + slot.state = SLOT_STATE_STARTED; SLT_INF(slot, "%s", "processing task\n"); @@ -978,7 +1072,7 @@ struct server_context { bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); + const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special); slot.sampled = result.tok; // search stop word and delete it @@ -1043,7 +1137,7 @@ struct server_context { } // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { slot.stopped_limit = true; slot.has_next_token = false; @@ -1136,50 +1230,54 @@ struct server_context { json get_formated_generation(const server_slot & slot) const { std::vector samplers; - samplers.reserve(slot.sparams.samplers.size()); - for (const auto & sampler : slot.sparams.samplers) { + samplers.reserve(slot.params.sampling.samplers.size()); + for (const auto & sampler : slot.params.sampling.samplers) { samplers.emplace_back(common_sampler_type_to_str(sampler)); } return json { {"n_ctx", slot.n_ctx}, {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params.model_alias}, - {"seed", slot.sparams.seed}, + {"model", params_base.model_alias}, + {"seed", slot.params.sampling.seed}, {"seed_cur", slot.smpl ? 
common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.sparams.temp}, - {"dynatemp_range", slot.sparams.dynatemp_range}, - {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"xtc_probability", slot.sparams.xtc_probability}, - {"xtc_threshold", slot.sparams.xtc_threshold}, - {"typical_p", slot.sparams.typ_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"dry_multiplier", slot.sparams.dry_multiplier}, - {"dry_base", slot.sparams.dry_base}, - {"dry_allowed_length", slot.sparams.dry_allowed_length}, - {"dry_penalty_last_n", slot.sparams.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.sparams.dry_sequence_breakers}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, + {"temperature", slot.params.sampling.temp}, + {"dynatemp_range", slot.params.sampling.dynatemp_range}, + {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, + {"top_k", slot.params.sampling.top_k}, + {"top_p", slot.params.sampling.top_p}, + {"min_p", slot.params.sampling.min_p}, + {"xtc_probability", slot.params.sampling.xtc_probability}, + {"xtc_threshold", slot.params.sampling.xtc_threshold}, + {"typical_p", slot.params.sampling.typ_p}, + {"repeat_last_n", slot.params.sampling.penalty_last_n}, + {"repeat_penalty", slot.params.sampling.penalty_repeat}, + {"presence_penalty", slot.params.sampling.penalty_present}, + {"frequency_penalty", slot.params.sampling.penalty_freq}, + {"dry_multiplier", slot.params.sampling.dry_multiplier}, + {"dry_base", slot.params.sampling.dry_base}, + {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, + {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, + {"mirostat", slot.params.sampling.mirostat}, + {"mirostat_tau", slot.params.sampling.mirostat_tau}, + {"mirostat_eta", slot.params.sampling.mirostat_eta}, + {"penalize_nl", slot.params.sampling.penalize_nl}, {"stop", slot.params.antiprompt}, {"max_tokens", slot.params.n_predict}, // User configured n_predict {"n_keep", slot.params.n_keep}, {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.sparams.ignore_eos}, + {"ignore_eos", slot.params.sampling.ignore_eos}, {"stream", slot.params.stream}, - //{"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"min_keep", slot.sparams.min_keep}, - {"grammar", slot.sparams.grammar}, + //{"logit_bias", slot.params.sampling.logit_bias}, + {"n_probs", slot.params.sampling.n_probs}, + {"min_keep", slot.params.sampling.min_keep}, + {"grammar", slot.params.sampling.grammar}, {"samplers", samplers}, + {"speculative", slot.can_speculate()}, + {"speculative.n_max", slot.params.speculative.n_max}, + {"speculative.n_min", slot.params.speculative.n_min}, + {"speculative.p_min", slot.params.speculative.p_min}, }; } @@ -1216,7 +1314,7 @@ struct server_context { {"index", slot.index}, }; - if (slot.sparams.n_probs > 0) { + if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = 
std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1249,7 +1347,7 @@ struct server_context { {"content", !slot.params.stream ? slot.generated_text : ""}, {"id_slot", slot.id}, {"stop", true}, - {"model", params.model_alias}, + {"model", params_base.model_alias}, {"tokens_predicted", slot.n_decoded}, {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, @@ -1265,7 +1363,7 @@ struct server_context { {"index", slot.index}, }; - if (slot.sparams.n_probs > 0) { + if (slot.params.sampling.n_probs > 0) { std::vector probs; if (!slot.params.stream && slot.stopped_word) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); @@ -1422,10 +1520,10 @@ struct server_context { data.at("input_prefix"), data.at("input_suffix"), data.at("input_extra"), - params.n_batch, - params.n_predict, + params_base.n_batch, + params_base.n_predict, slots[0].n_ctx, // TODO: there should be a better way - params.spm_infill, + params_base.spm_infill, tokenized_prompts[i] ); create_task(data, tokens); @@ -1798,7 +1896,7 @@ struct server_context { // TODO: simplify and improve for (server_slot & slot : slots) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { // this check is redundant (for good) // we should never get here, because generation should already stopped in process_token() slot.release(); @@ -1864,7 +1962,7 @@ struct server_context { int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; // next, batch any pending prompts without exceeding n_batch - if (params.cont_batching || batch.n_tokens == 0) { + if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // this slot still has a prompt to be processed if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { @@ -1917,7 +2015,7 @@ struct server_context { continue; } } else { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size // TODO: there should be a separate parameter that control prompt truncation // context shift should be applied only during the generation phase @@ -1963,11 +2061,11 @@ struct server_context { slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params.n_cache_reuse > 0) { + if (params_base.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache size_t head_p = slot.n_past; // current prompt - SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past); while (head_c < slot.cache_tokens.size() && head_p < prompt_tokens.size()) { @@ -1980,7 +2078,7 @@ struct server_context { n_match++; } - if (n_match >= (size_t) params.n_cache_reuse) { + if (n_match >= (size_t) params_base.n_cache_reuse) { SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); @@ -2168,38 +2266,99 @@ struct server_context { continue; // continue loop of slots } - completion_token_output result; - const 
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + llama_token id; + + { + completion_token_output result; + + id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + + slot.i_batch = -1; + + common_sampler_accept(slot.smpl, id, true); + + slot.n_decoded += 1; + if (slot.n_decoded == 1) { + slot.t_start_generation = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + result.tok = id; + + const auto * cur_p = common_sampler_get_candidates(slot.smpl); - common_sampler_accept(slot.smpl, id, true); + for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + continue; + } + } - slot.n_decoded += 1; - if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); + // check if the slot supports speculative decoding + if (!slot.can_speculate()) { + continue; } - result.tok = id; + struct common_speculative_params params_spec; + params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; + params_spec.p_min = slot.params.speculative.p_min; - const auto * cur_p = common_sampler_get_candidates(slot.smpl); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); - for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { - result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 
0.0f : cur_p->data[i].p, - }); + // ignore small drafts + if (slot.params.speculative.n_min > (int) draft.size()) { + continue; } - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); + // construct the speculation batch + common_batch_clear(slot.batch_spec); + common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true); + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); + } + + llama_decode(ctx, slot.batch_spec); + + // the accepted tokens from the speculation + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + + slot.n_past += ids.size(); + slot.n_decoded += ids.size(); + + slot.cache_tokens.push_back(id); + slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); + + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; + + result.tok = ids[i]; + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + break; + } } - slot.i_batch = -1; + SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); } } @@ -2697,7 +2856,7 @@ int main(int argc, char ** argv) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params.n_parallel }, + { "total_slots", ctx_server.params_base.n_parallel }, { "chat_template", llama_get_chat_template(ctx_server.model) }, }; @@ -2705,7 +2864,7 @@ int main(int argc, char ** argv) { }; const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params.endpoint_props) { + if (!ctx_server.params_base.endpoint_props) { res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2718,7 +2877,7 @@ int main(int argc, char ** argv) { }; const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { - if (ctx_server.params.embedding) { + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2824,7 +2983,7 @@ int main(int argc, char ** argv) { // TODO: maybe merge this function with "handle_completions_generic" const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3001,7 +3160,7 @@ int main(int argc, char ** argv) { }; const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params.reranking || ctx_server.params.embedding) { + if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); return; } From a9a678a6b2264e5895533217b0cf8f250534cc58 Mon Sep 17 00:00:00 2001 From: brucepro Date: Mon, 25 Nov 2024 08:11:55 -0800 Subject: [PATCH 26/32] Add download chat feature to server chat (#10481) * Add download chat feature to server chat Add a download feature next to the delete chat feature in the server vue chat interface. * code style --------- Co-authored-by: Xuan Son Nguyen --- examples/server/public/index.html | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 6216c08410a28..c54260867d2c8 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -81,7 +81,13 @@

Conversations

- +
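The hunk above adds a download control next to the existing delete action in the conversation sidebar. A minimal, self-contained sketch of the client-side download pattern such a feature typically relies on — the function name and wiring here are illustrative assumptions, not the exact code introduced by the patch:

    // Illustrative sketch only: serialize a conversation object and hand it
    // to the browser as a JSON file, entirely client-side.
    function downloadConversation(conversation, filename = 'conversation.json') {
      // pretty-print the conversation so the exported file is human-readable
      const blob = new Blob([JSON.stringify(conversation, null, 2)], { type: 'application/json' });
      const url = URL.createObjectURL(blob);

      // create a temporary anchor element and simulate a click to start the download
      const a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);

      // release the temporary object URL once the download has been triggered
      URL.revokeObjectURL(url);
    }

A click handler on the new sidebar button would look up the selected conversation from the page's existing conversation store and pass it to a helper like this; the Blob/object-URL approach needs no new server endpoint, keeping the export purely in the browser.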