From 3f6f2b7fe5bd49b9b91d844aaf3b5c870a97bafa Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 13 Jun 2024 09:39:26 +0800 Subject: [PATCH 01/18] Upgrade to latest ggml --- CMakeLists.txt | 28 +- Dockerfile | 2 +- README.md | 148 +---- chatglm.cpp | 1175 ++++++++++++++----------------------- chatglm.h | 658 +++++++++------------ chatglm_cpp/_C.pyi | 26 +- chatglm_cpp/__init__.py | 6 +- chatglm_cpp/convert.py | 146 ----- chatglm_cpp/openai_api.py | 2 - chatglm_pybind.cpp | 36 +- chatglm_test.cpp | 1031 ++++++++++++-------------------- examples/chatglm3_demo.py | 1 - examples/web_demo.py | 2 - main.cpp | 8 +- tests/data/linear.data | Bin 2496 -> 2688 bytes tests/perf.sh | 18 +- tests/perplexity.cpp | 25 +- tests/ppl.sh | 8 - tests/test_chatglm_cpp.py | 57 +- tests/test_convert.py | 215 ++----- third_party/ggml | 2 +- 21 files changed, 1181 insertions(+), 2413 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bff1b222..11aaa3db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE STRING "") set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-expansion-to-defined") # suppress ggml warnings if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) @@ -24,17 +25,8 @@ if (CHATGLM_ENABLE_PYBIND) endif () # third-party libraries -add_compile_definitions(GGML_CUDA_MMV_Y=4) # for large vocab -include_directories(third_party/ggml/include/ggml third_party/ggml/src) -add_subdirectory(third_party/ggml) - -set(SPM_ENABLE_SHARED OFF CACHE BOOL "chatglm: disable sentencepiece shared libraries by default") -set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "chatglm: disable tcmalloc by default") -include_directories(third_party/sentencepiece/src) -add_subdirectory(third_party/sentencepiece) - -if (GGML_CUBLAS) - add_compile_definitions(GGML_USE_CUBLAS) +if (GGML_CUDA) + add_compile_definitions(GGML_USE_CUDA) enable_language(CUDA) # ref: https://stackoverflow.com/questions/28932864/which-compute-capability-is-supported-by-which-cuda-versions set(CUDA_ARCH_LIST "52;61;70;75") @@ -47,10 +39,17 @@ if (GGML_CUBLAS) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") set(CUDA_ARCH_LIST "${CUDA_ARCH_LIST};89;90") endif () - set(CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "chatglm: cuda architectures to compile") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) + set(GGML_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "chatglm: cuda architectures to compile") endif () +include_directories(third_party/ggml/include/ggml third_party/ggml/src) +add_subdirectory(third_party/ggml) + +set(SPM_ENABLE_SHARED OFF CACHE BOOL "chatglm: disable sentencepiece shared libraries by default") +set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "chatglm: disable tcmalloc by default") +include_directories(third_party/sentencepiece/src) +add_subdirectory(third_party/sentencepiece) + include_directories(third_party/sentencepiece/third_party/protobuf-lite) set(ABSL_ENABLE_INSTALL ON CACHE BOOL "" FORCE) @@ -72,7 +71,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) file(GLOB CPP_SOURCES ${PROJECT_SOURCE_DIR}/*.h - ${PROJECT_SOURCE_DIR}/*.cpp) + ${PROJECT_SOURCE_DIR}/*.cpp + ${PROJECT_SOURCE_DIR}/tests/*.cpp) set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-errors") diff --git a/Dockerfile b/Dockerfile index 3b23d1f9..c9bd9500 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG BASE_IMAGE=ubuntu:20.04 FROM 
${BASE_IMAGE} AS build -ARG CMAKE_ARGS="-DGGML_CUBLAS=OFF" +ARG CMAKE_ARGS="-DGGML_CUDA=OFF" WORKDIR /chatglm.cpp diff --git a/README.md b/README.md index c0f53b5f..89ce8aea 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,7 @@ Highlights: Support Matrix: * Hardwares: x86/arm CPU, NVIDIA GPU, Apple Silicon GPU * Platforms: Linux, MacOS, Windows -* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2), [InternLM](https://github.com/InternLM/InternLM) - -**NOTE**: Baichuan & InternLM model series are deprecated in favor of [llama.cpp](https://github.com/ggerganov/llama.cpp). +* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2) ## Getting Started @@ -59,7 +57,6 @@ The original model (`-i `) can be a Hugging Face model name * ChatGLM3-6B: `THUDM/chatglm3-6b` * ChatGLM4-9B: `THUDM/glm-4-9b-chat` * CodeGeeX2: `THUDM/codegeex2-6b`, `THUDM/codegeex2-6b-int4` -* Baichuan & Baichuan2: `baichuan-inc/Baichuan-13B-Chat`, `baichuan-inc/Baichuan2-7B-Chat`, `baichuan-inc/Baichuan2-13B-Chat` You are free to try any of the below quantization types by specifying `-t `: * `q4_0`: 4-bit integer quantization with fp16 scales. @@ -212,56 +209,6 @@ print(bubble_sort([5, 4, 3, 2, 1])) ``` -
-<details open>
-<summary>Baichuan-13B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o models/baichuan-13b-chat-ggml.bin
-./build/bin/main -m models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1
-# 你好!有什么我可以帮助你的吗?
-```
-</details>
-
-<details open>
-<summary>Baichuan2-7B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o models/baichuan2-7b-chat-ggml.bin
-./build/bin/main -m models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05
-# 你好!很高兴为您提供帮助。请问有什么问题我可以帮您解答?
-```
-</details>
-
-<details open>
-<summary>Baichuan2-13B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o models/baichuan2-13b-chat-ggml.bin
-./build/bin/main -m models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05
-# 你好!今天我能为您提供什么帮助?
-```
-</details>
-
-<details open>
-<summary>InternLM-Chat-7B</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b -t q4_0 -o models/internlm-chat-7b-ggml.bin
-./build/bin/main -m models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
-# 你好,我是书生·浦语,有什么可以帮助你的吗?
-```
-</details>
-
-<details open>
-<summary>InternLM-Chat-20B</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o models/internlm-chat-20b-ggml.bin
-./build/bin/main -m models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
-# 你好!有什么我可以帮到你的吗?
-```
-</details>
- ## Using BLAS BLAS library can be integrated to further accelerate matrix multiplication. However, in some cases, using BLAS may cause performance degradation. Whether to turn on BLAS should depend on the benchmarking result. @@ -279,15 +226,15 @@ cmake -B build -DGGML_OPENBLAS=ON && cmake --build build -j **cuBLAS** -cuBLAS uses NVIDIA GPU to accelerate BLAS. Add the CMake flag `-DGGML_CUBLAS=ON` to enable it. +cuBLAS uses NVIDIA GPU to accelerate BLAS. Add the CMake flag `-DGGML_CUDA=ON` to enable it. ```sh -cmake -B build -DGGML_CUBLAS=ON && cmake --build build -j +cmake -B build -DGGML_CUDA=ON && cmake --build build -j ``` -By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example: +By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `GGML_CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example: ```sh -cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="80" # for A100 -cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4 +cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES="80" # for A100 +cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4 ``` To find out the CUDA architecture of your GPU device, see [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus). @@ -312,7 +259,7 @@ pip install -U chatglm-cpp To enable cuBLAS acceleration on NVIDIA GPU: ```sh -CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U chatglm-cpp +CMAKE_ARGS="-DGGML_CUDA=ON" pip install -U chatglm-cpp ``` To enable Metal on Apple silicon devices: @@ -426,51 +373,6 @@ python3 web_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --max_length 512 -- ``` -
-<details open>
-<summary>Baichuan-13B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo
-python3 web_demo.py -m ../models/baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo
-```
-</details>
-
-<details open>
-<summary>Baichuan2-7B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo
-python3 web_demo.py -m ../models/baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo
-```
-</details>
-
-<details open>
-<summary>Baichuan2-13B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo
-python3 web_demo.py -m ../models/baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo
-```
-</details>
-
-<details open>
-<summary>InternLM-Chat-7B</summary>
-
-```sh
-python3 cli_demo.py -m ../models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo
-python3 web_demo.py -m ../models/internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo
-```
-</details>
-
-<details open>
-<summary>InternLM-Chat-20B</summary>
-
-```sh
-python3 cli_demo.py -m ../models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo
-python3 web_demo.py -m ../models/internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo
-```
-</details>
- **Converting Hugging Face LLMs at Runtime** Sometimes it might be inconvenient to convert and save the intermediate GGML models beforehand. Here is an option to directly load from the original Hugging Face model, quantize it into GGML models in a minute, and start serving. All you need is to replace the GGML model path with the Hugging Face model name or path. @@ -579,7 +481,7 @@ For CUDA support, make sure [nvidia-docker](https://github.com/NVIDIA/nvidia-doc ```sh docker build . --network=host -t chatglm.cpp-cuda \ --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \ - --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES=80" + --build-arg CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES=80" docker run -it --rm --gpus all -v $PWD/models:/chatglm.cpp/models chatglm.cpp-cuda \ ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` @@ -637,40 +539,6 @@ ChatGLM4-9B: | ms/token (CUDA @ V100 SXM2) | 12.1 | 12.5 | 13.8 | 13.9 | 17.7 | 27.7 | | file size | 5.0G | 5.5G | 6.1G | 6.6G | 9.4G | 18G | -Baichuan-7B / Baichuan2-7B: - -| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | -|--------------------------------|-------|-------|-------|-------|-------|-------| -| ms/token (CPU @ Platinum 8260) | 85.3 | 94.8 | 103.4 | 109.6 | 136.8 | 248.5 | -| ms/token (CUDA @ V100 SXM2) | 8.7 | 9.2 | 10.2 | 10.3 | 13.2 | 21.0 | -| ms/token (MPS @ M2 Ultra) | 11.3 | 12.0 | N/A | N/A | 16.4 | 25.6 | -| file size | 4.0G | 4.4G | 4.9G | 5.3G | 7.5G | 14G | -| mem usage | 4.5G | 4.9G | 5.3G | 5.7G | 7.8G | 14G | - -Baichuan-13B / Baichuan2-13B: - -| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | -|--------------------------------|-------|-------|-------|-------|-------|-------| -| ms/token (CPU @ Platinum 8260) | 161.7 | 175.8 | 189.9 | 192.3 | 255.6 | 459.6 | -| ms/token (CUDA @ V100 SXM2) | 13.7 | 15.1 | 16.3 | 16.9 | 21.9 | 36.8 | -| ms/token (MPS @ M2 Ultra) | 18.2 | 18.8 | N/A | N/A | 27.2 | 44.4 | -| file size | 7.0G | 7.8G | 8.5G | 9.3G | 14G | 25G | -| mem usage | 7.8G | 8.8G | 9.5G | 10G | 14G | 25G | - -InternLM-7B: - -| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | -|--------------------------------|-------|-------|-------|-------|-------|-------| -| ms/token (CPU @ Platinum 8260) | 85.3 | 90.1 | 103.5 | 112.5 | 137.3 | 232.2 | -| ms/token (CUDA @ V100 SXM2) | 9.1 | 9.4 | 10.5 | 10.5 | 13.3 | 21.1 | - -InternLM-20B: - -| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | -|--------------------------------|-------|-------|-------|-------|-------|-------| -| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A | -| ms/token (CUDA @ V100 SXM2) | 21.6 | 23.2 | 25.0 | 25.9 | 33.4 | N/A | - ## Model Quality We measure model quality by evaluating the perplexity over the WikiText-2 test dataset, following the strided sliding window strategy in https://huggingface.co/docs/transformers/perplexity. Lower perplexity usually indicates a better model. diff --git a/chatglm.cpp b/chatglm.cpp index 8c94b85a..3c20bd80 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -38,7 +38,7 @@ #include #endif -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA #include #endif @@ -47,7 +47,7 @@ namespace chatglm { static std::string shape_to_string(ggml_tensor *tensor) { std::ostringstream oss; oss << '['; - for (int i = tensor->n_dims - 1; i >= 0; i--) { + for (int i = ggml_n_dims(tensor) - 1; i >= 0; i--) { oss << tensor->ne[i] << (i > 0 ? 
", " : ""); } oss << ']'; @@ -57,7 +57,7 @@ static std::string shape_to_string(ggml_tensor *tensor) { static std::string strides_to_string(ggml_tensor *tensor) { std::ostringstream oss; oss << '['; - for (int i = tensor->n_dims - 1; i >= 0; i--) { + for (int i = ggml_n_dims(tensor) - 1; i >= 0; i--) { oss << tensor->nb[i] << (i > 0 ? ", " : ""); } oss << ']'; @@ -65,23 +65,26 @@ static std::string strides_to_string(ggml_tensor *tensor) { } std::string to_string(ggml_tensor *tensor, bool with_data) { + std::vector> buf(ggml_nbytes(tensor)); + ggml_backend_tensor_get(tensor, buf.data(), 0, buf.size()); + std::ostringstream oss; oss << "ggml_tensor("; if (with_data) { - if (tensor->n_dims > 3) + if (ggml_n_dims(tensor) > 3) oss << "["; for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - if (tensor->n_dims > 2) + if (ggml_n_dims(tensor) > 2) oss << (i3 > 0 ? ",\n\n[" : "["); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - if (tensor->n_dims > 1) + if (ggml_n_dims(tensor) > 1) oss << (i2 > 0 ? ",\n\n[" : "["); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { oss << (i1 > 0 ? ",\n[" : "["); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - auto ptr = (char *)tensor->data + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + - i0 * tensor->nb[0]; + char *ptr = (char *)buf.data() + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + + i0 * tensor->nb[0]; oss << (i0 > 0 ? ", " : ""); if (tensor->type == GGML_TYPE_I32) { oss << *(int *)ptr; @@ -99,13 +102,13 @@ std::string to_string(ggml_tensor *tensor, bool with_data) { } oss << "]"; } - if (tensor->n_dims > 1) + if (ggml_n_dims(tensor) > 1) oss << "]"; } - if (tensor->n_dims > 2) + if (ggml_n_dims(tensor) > 2) oss << "]"; } - if (tensor->n_dims > 3) + if (ggml_n_dims(tensor) > 3) oss << "]"; oss << ", "; } @@ -114,33 +117,6 @@ std::string to_string(ggml_tensor *tensor, bool with_data) { return oss.str(); } -ggml_tensor *tensor_assign_buffers(ggml_tensor *tensor) { -#ifdef GGML_USE_CUBLAS - ggml_cuda_assign_buffers(tensor); -#endif - return tensor; -} - -ggml_tensor *tensor_to_device(ggml_tensor *tensor) { -#ifdef GGML_USE_CUBLAS - if (tensor->backend == GGML_BACKEND_CPU) { - tensor->backend = GGML_BACKEND_GPU; - ggml_cuda_transform_tensor(tensor->data, tensor); - } -#endif - return tensor; -} - -ggml_tensor *tensor_to_cpu(ggml_tensor *tensor) { -#ifdef GGML_USE_CUBLAS - if (tensor->backend != GGML_BACKEND_CPU) { - ggml_cuda_free_data(tensor); - tensor->backend = GGML_BACKEND_CPU; - } -#endif - return tensor; -} - const std::string ToolCallMessage::TYPE_FUNCTION = "function"; const std::string ToolCallMessage::TYPE_CODE = "code"; @@ -174,47 +150,34 @@ std::vector BaseTokenizer::filter_user_assistant_messages(const std return user_assistant_messages; } -// Adapted from https://github.com/ggerganov/llama.cpp/blob/master/llama.cpp -void ggml_graph_compute_helper(std::vector &buf, ggml_cgraph *graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = (uint8_t *)buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - // for debugging purpose [[maybe_unused]] static inline ggml_tensor *add_zero(ggml_context *ctx, ggml_tensor *tensor) { - ggml_tensor *zeros = ggml_new_tensor(ctx, GGML_TYPE_F32, tensor->n_dims, tensor->ne); + ggml_tensor *zeros = ggml_new_tensor(ctx, GGML_TYPE_F32, ggml_n_dims(tensor), tensor->ne); ggml_set_f32(zeros, 0); - tensor_to_device(zeros); - ggml_tensor *out = tensor_assign_buffers(ggml_add(ctx, tensor, zeros)); + 
ggml_tensor *out = ggml_add(ctx, tensor, zeros); return out; } -void ModelContext::init_device_context() { -#ifdef GGML_USE_METAL - ctx_metal = make_unique_ggml_metal_context(1); +// void ModelContext::init_device_context() { +// #ifdef GGML_USE_METAL +// ctx_metal = make_unique_ggml_metal_context(1); - const size_t max_size = ggml_get_max_tensor_size(ctx_w.get()); +// const size_t max_size = ggml_get_max_tensor_size(ctx_w.get()); - void *weight_data = weight_buffer.empty() ? ggml_get_mem_buffer(ctx_w.get()) : (void *)weight_buffer.data(); - size_t weight_size = weight_buffer.empty() ? ggml_get_mem_size(ctx_w.get()) : weight_buffer.size(); - CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "weights", weight_data, weight_size, max_size)); +// void *weight_data = weight_buffer.empty() ? ggml_get_mem_buffer(ctx_w.get()) : (void *)weight_buffer.data(); +// size_t weight_size = weight_buffer.empty() ? ggml_get_mem_size(ctx_w.get()) : weight_buffer.size(); +// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "weights", weight_data, weight_size, max_size)); - CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "kv", ggml_get_mem_buffer(ctx_kv.get()), - ggml_get_mem_size(ctx_kv.get()), 0)); +// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "kv", ggml_get_mem_buffer(ctx_kv.get()), +// ggml_get_mem_size(ctx_kv.get()), 0)); - void *compute_data = ctx_b ? ggml_get_mem_buffer(ctx_b.get()) : compute_buffer.data(); - size_t compute_size = ctx_b ? ggml_get_mem_size(ctx_b.get()) : compute_buffer.size(); - CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "compute", compute_data, compute_size, 0)); +// void *compute_data = ctx_b ? ggml_get_mem_buffer(ctx_b.get()) : compute_meta.data(); +// size_t compute_size = ctx_b ? ggml_get_mem_size(ctx_b.get()) : compute_meta.size(); +// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "compute", compute_data, compute_size, 0)); - CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "scratch", scratch.data, scratch.size, 0)); -#endif -} +// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "scratch", scratch.data, scratch.size, 0)); +// #endif +// } // ===== streamer ===== @@ -387,125 +350,154 @@ std::string ModelLoader::read_string(size_t length) { return s; } -void ModelLoader::checked_read_tensor_meta(const std::string &name, int target_ndim, int64_t *target_ne, - ggml_type target_dtype) { - // read and check tensor name - { +StateDict ModelLoader::read_state_dict() { + StateDict sd; + sd.ctx = make_unique_ggml_context(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead(), nullptr, true); + sd.buf = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr(data, size)); + + // assume state dict is stored at the back of file + while (tell() < (int64_t)size) { + // tensor name int name_size = read_basic(); - CHATGLM_CHECK(name_size == (int)name.size()) - << "tensor " << name << " name size mismatch: expect " << name.size() << " but got " << name_size; std::string weight_name = read_string(name_size); - CHATGLM_CHECK(weight_name == name) << "tensor name mismatch: expect " << name << " but got " << weight_name; - } - // read and check tensor shape - { + // tensor shape + int64_t ne[4]{1, 1, 1, 1}; int ndim = read_basic(); - CHATGLM_CHECK(ndim == target_ndim) - << "tensor " << name << " ndim mismatch: expect " << target_ndim << " but got " << ndim; + CHATGLM_CHECK(0 < ndim && ndim <= 4); for (int i = ndim - 1; i >= 0; i--) { - int dim_size = read_basic(); - CHATGLM_CHECK(dim_size == target_ne[i]) << "tensor " << name << " shape mismatch at dim " 
<< i - << ": expect " << target_ne[i] << " but got " << dim_size; + ne[i] = read_basic(); } - } - // read and check tensor dtype - { + // tensor dtype ggml_type dtype = (ggml_type)read_basic(); - CHATGLM_CHECK(dtype == target_dtype) - << "tensor " << name << " dtype mismatch: expect " << target_dtype << " but got " << dtype; + + // tensor data + ggml_tensor *tensor = ggml_new_tensor(sd.ctx.get(), dtype, ndim, ne); + constexpr int64_t MEM_ALIGNED = 16; + const int64_t data_offset = (tell() + (MEM_ALIGNED - 1)) & ~(MEM_ALIGNED - 1); + ggml_backend_tensor_alloc(sd.buf.get(), tensor, data + data_offset); + // tensor->data = data + data_offset; + seek(data_offset + ggml_nbytes(tensor), SEEK_SET); + + // add to state dict + sd.kv.emplace(weight_name, tensor); } + return sd; } -void *ModelLoader::read_tensor_data(size_t nbytes) { - constexpr int64_t MEM_ALIGNED = 16; - const int64_t data_offset = (tell() + (MEM_ALIGNED - 1)) & ~(MEM_ALIGNED - 1); - void *tensor_data = data + data_offset; - seek(data_offset + nbytes, SEEK_SET); - return tensor_data; -} +ModelContext::ModelContext(ggml_type dtype) + : dtype(dtype), compute_meta(ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()), + ctx_w(make_unique_ggml_context(ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE, nullptr, true)), + ctx_kv(make_unique_ggml_context(ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE, nullptr, true)), + ctx_b(make_unique_ggml_context(compute_meta.size(), compute_meta.data(), true)), gf(ggml_new_graph(ctx_b.get())) { -void ModelLoader::read_tensor(const std::string &name, ggml_tensor *tensor) { - checked_read_tensor_meta(name, tensor->n_dims, tensor->ne, tensor->type); - tensor->data = read_tensor_data(ggml_nbytes(tensor)); +#if defined(GGML_USE_CUDA) + backend = unique_ggml_backend_t(ggml_backend_cuda_init(0)); +#elif defined(GGML_USE_METAL) + backend = unique_ggml_backend_t(ggml_backend_metal_init()); +#else + backend = unique_ggml_backend_t(ggml_backend_cpu_init()); +#endif + CHATGLM_CHECK(backend) << "failed to initialize ggml backend"; + + allocr = unique_ggml_gallocr_t(ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get()))); } // ===== modules ===== -ggml_tensor *Embedding::forward(ModelContext *ctx, ggml_tensor *input) const { - ggml_tensor *output = ggml_get_rows(ctx->ctx_b.get(), weight, input); +ggml_tensor *Embedding::forward(ModelContext *mctx, ggml_tensor *input) const { + ggml_tensor *output = ggml_get_rows(mctx->ctx_b.get(), weight, input); return output; } -ggml_tensor *Linear::forward(ModelContext *ctx, ggml_tensor *input) const { +ggml_tensor *Linear::forward(ModelContext *mctx, ggml_tensor *input) const { // input: [seqlen, in_features] - ggml_context *gctx = ctx->ctx_b.get(); - ggml_tensor *output = tensor_assign_buffers(ggml_mul_mat(gctx, weight, input)); // [seqlen, out_features] + ggml_context *ctx = mctx->ctx_b.get(); + ggml_tensor *output = ggml_mul_mat(ctx, weight, input); // [seqlen, out_features] if (bias) { - output = tensor_assign_buffers(ggml_add_inplace(gctx, output, bias)); + output = ggml_add_inplace(ctx, output, bias); } return output; } -ggml_tensor *LayerNorm::forward(ModelContext *ctx, ggml_tensor *input) const { +ggml_tensor *LayerNorm::forward(ModelContext *mctx, ggml_tensor *input) const { // input: [seqlen, normalized_shape] - ggml_context *gctx = ctx->ctx_b.get(); + ggml_context *ctx = mctx->ctx_b.get(); auto ggml_norm_fn = inplace ? 
ggml_norm_inplace : ggml_norm; - ggml_tensor *output = tensor_assign_buffers(ggml_norm_fn(gctx, input, eps)); - output = tensor_assign_buffers(ggml_mul_inplace(gctx, output, weight)); - output = tensor_assign_buffers(ggml_add_inplace(gctx, output, bias)); + ggml_tensor *output = ggml_norm_fn(ctx, input, eps); + output = ggml_mul_inplace(ctx, output, weight); + output = ggml_add_inplace(ctx, output, bias); return output; } -ggml_tensor *RMSNorm::forward(ModelContext *ctx, ggml_tensor *input) const { - ggml_context *gctx = ctx->ctx_b.get(); +ggml_tensor *RMSNorm::forward(ModelContext *mctx, ggml_tensor *input) const { + ggml_context *ctx = mctx->ctx_b.get(); auto ggml_rms_norm_fn = inplace ? ggml_rms_norm_inplace : ggml_rms_norm; - ggml_tensor *output = tensor_assign_buffers(ggml_rms_norm_fn(gctx, input, eps)); - output = tensor_assign_buffers(ggml_mul_inplace(gctx, output, weight)); + ggml_tensor *output = ggml_rms_norm_fn(ctx, input, eps); + output = ggml_mul_inplace(ctx, output, weight); return output; } static ggml_tensor *apply_activation_inplace(ggml_context *ctx, ggml_tensor *hidden_states, ActivationType hidden_act) { switch (hidden_act) { case ActivationType::GELU: - return tensor_assign_buffers(ggml_gelu_inplace(ctx, hidden_states)); + return ggml_gelu_inplace(ctx, hidden_states); case ActivationType::SILU: - return tensor_assign_buffers(ggml_silu_inplace(ctx, hidden_states)); + return ggml_silu_inplace(ctx, hidden_states); default: CHATGLM_THROW << "Unknown activation type " << (int)hidden_act; } } -ggml_tensor *BasicMLP::forward(ModelContext *ctx, ggml_tensor *hidden_states) const { - ggml_context *gctx = ctx->ctx_b.get(); - hidden_states = dense_h_to_4h.forward(ctx, hidden_states); - hidden_states = apply_activation_inplace(gctx, hidden_states, hidden_act); - hidden_states = dense_4h_to_h.forward(ctx, hidden_states); +ggml_tensor *BasicMLP::forward(ModelContext *mctx, ggml_tensor *hidden_states) const { + ggml_context *ctx = mctx->ctx_b.get(); + hidden_states = dense_h_to_4h.forward(mctx, hidden_states); + hidden_states = apply_activation_inplace(ctx, hidden_states, hidden_act); + hidden_states = dense_4h_to_h.forward(mctx, hidden_states); return hidden_states; } -ggml_tensor *BasicGLU::forward(ModelContext *ctx, ggml_tensor *hidden_states) const { - ggml_context *gctx = ctx->ctx_b.get(); - ggml_tensor *gate = gate_proj.forward(ctx, hidden_states); - gate = apply_activation_inplace(gctx, gate, hidden_act); - hidden_states = up_proj.forward(ctx, hidden_states); - hidden_states = tensor_assign_buffers(ggml_mul_inplace(gctx, hidden_states, gate)); - hidden_states = down_proj.forward(ctx, hidden_states); +ggml_tensor *BasicGLU::forward(ModelContext *mctx, ggml_tensor *hidden_states) const { + ggml_context *ctx = mctx->ctx_b.get(); + ggml_tensor *gate = gate_proj.forward(mctx, hidden_states); + gate = apply_activation_inplace(ctx, gate, hidden_act); + hidden_states = up_proj.forward(mctx, hidden_states); + hidden_states = ggml_mul_inplace(ctx, hidden_states, gate); + hidden_states = down_proj.forward(mctx, hidden_states); return hidden_states; } -// Adapted from https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp -int get_num_physical_cores() { +// Adapted from https://github.com/ggerganov/llama.cpp/blob/master/common/common.cpp +static int get_num_physical_cores() { unsigned int n_threads = std::thread::hardware_concurrency(); return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; } -int get_default_num_threads() { -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_METAL) - return 1; -#else - return std::min(get_num_physical_cores(), 16); +static void set_default_num_threads(ggml_backend_t backend, int num_tokens) { + int n_threads = 1; + if (ggml_backend_is_cpu(backend)) { + if (num_tokens > 1) { + // context + n_threads = get_num_physical_cores(); + } else { + // decode + n_threads = std::min(get_num_physical_cores(), 16); + } + } + if (num_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + // BLAS is enabled + n_threads = std::min(4, n_threads); + } + + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(backend)) { + ggml_backend_metal_set_n_cb(backend, n_threads); + } #endif } @@ -519,110 +511,102 @@ std::string to_string(ModelType model_type) { return "ChatGLM3"; case ModelType::CHATGLM4: return "ChatGLM4"; - case ModelType::BAICHUAN7B: - return "Baichuan7B"; - case ModelType::BAICHUAN13B: - return "Baichuan13B"; - case ModelType::INTERNLM: - return "InternLM"; default: CHATGLM_THROW << "unknown model type " << (int)model_type; } } -static ggml_tensor *apply_rotary_emb_basic(ModelContext *ctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, - RopeType rope_type, float rope_theta, int dim_scale) { +static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, + RopeType rope_type, float rope_theta) { // tensor a (activation) is of shape [s, #h, d] // tensor b (position_ids) is of shape [s] - ggml_context *gctx = ctx->ctx_b.get(); -#ifdef GGML_USE_CUBLAS + ggml_context *ctx = mctx->ctx_b.get(); +#ifdef GGML_USE_CUDA if (!ggml_is_contiguous(layer)) { - layer = tensor_assign_buffers(ggml_cont(gctx, layer)); + layer = ggml_cont(ctx, layer); } #endif const int head_size = layer->ne[0]; - const int rope_dim = head_size / dim_scale; - layer = tensor_assign_buffers(ggml_rope_custom_inplace(gctx, layer, position_ids, rope_dim, (int)rope_type, n_ctx, - rope_theta, 1.f)); // [s, #h, d] + layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, n_ctx, 0, rope_theta, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] return layer; } -static ggml_tensor *apply_rotary_emb_glm(ModelContext *ctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx) { +static ggml_tensor *apply_rotary_emb_glm(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx) { // tensor a (activation) is of shape [s, #h, d] // tensor b (position_ids) is of shape [2 * s] - ggml_context *gctx = ctx->ctx_b.get(); + ggml_context *ctx = mctx->ctx_b.get(); const int head_size = layer->ne[0]; const int num_heads = layer->ne[1]; const int qlen = layer->ne[2]; const int rope_dim = head_size / 2; - ggml_tensor *b1 = ggml_view_1d(gctx, position_ids, qlen, 0); - ggml_tensor *b2 = ggml_view_1d(gctx, position_ids, qlen, qlen * ggml_element_size(position_ids)); + ggml_tensor *b1 = ggml_view_1d(ctx, position_ids, qlen, 0); + ggml_tensor *b2 = ggml_view_1d(ctx, position_ids, qlen, qlen * ggml_element_size(position_ids)); - ggml_tensor *a1 = ggml_view_3d(gctx, layer, head_size / 2, num_heads, qlen, layer->nb[1], layer->nb[2], 0); - ggml_tensor *a2 = ggml_view_3d(gctx, layer, head_size / 2, num_heads, qlen, layer->nb[1], layer->nb[2], + ggml_tensor *a1 = ggml_view_3d(ctx, layer, head_size / 2, num_heads, qlen, layer->nb[1], layer->nb[2], 0); + ggml_tensor *a2 
= ggml_view_3d(ctx, layer, head_size / 2, num_heads, qlen, layer->nb[1], layer->nb[2], head_size / 2 * ggml_element_size(layer)); ggml_tensor *a1_rope = a1; ggml_tensor *a2_rope = a2; -#ifdef GGML_USE_CUBLAS - a1_rope = tensor_assign_buffers(ggml_cont(gctx, a1_rope)); - a2_rope = tensor_assign_buffers(ggml_cont(gctx, a2_rope)); +#ifdef GGML_USE_CUDA + a1_rope = ggml_cont(ctx, a1_rope); + a2_rope = ggml_cont(ctx, a2_rope); #endif - a1_rope = tensor_assign_buffers( - ggml_rope_inplace(gctx, a1_rope, b1, rope_dim, (int)RopeType::NEOX, n_ctx)); // [s, #h, d/2] - a2_rope = tensor_assign_buffers( - ggml_rope_inplace(gctx, a2_rope, b2, rope_dim, (int)RopeType::NEOX, n_ctx)); // [s, #h, d/2] + a1_rope = ggml_rope_inplace(ctx, a1_rope, b1, rope_dim, (int)RopeType::NEOX, n_ctx); // [s, #h, d/2] + a2_rope = ggml_rope_inplace(ctx, a2_rope, b2, rope_dim, (int)RopeType::NEOX, n_ctx); // [s, #h, d/2] -#ifdef GGML_USE_CUBLAS - a1_rope = ggml_cpy(gctx, a1_rope, a1); - a2_rope = ggml_cpy(gctx, a2_rope, a2); +#ifdef GGML_USE_CUDA + a1_rope = ggml_cpy(ctx, a1_rope, a1); + a2_rope = ggml_cpy(ctx, a2_rope, a2); #endif - ggml_build_forward_expand(&ctx->gf, a1_rope); - ggml_build_forward_expand(&ctx->gf, a2_rope); + ggml_build_forward_expand(mctx->gf, a1_rope); + ggml_build_forward_expand(mctx->gf, a2_rope); return layer; } -[[maybe_unused]] static ggml_tensor *apply_rotary_emb_glm2(ModelContext *ctx, ggml_tensor *layer, - ggml_tensor *position_ids) { +static ggml_tensor *apply_rotary_emb_glm2(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, + float rope_theta) { + // NOTE: ChatGLM2 applies RoPE only on half of the features. The remaining half is skipped. // layer: [s, #h, d], position_ids: [s] - ggml_context *gctx = ctx->ctx_b.get(); -#ifdef GGML_USE_CUBLAS - if (!ggml_is_contiguous(layer)) { - layer = tensor_assign_buffers(ggml_cont(gctx, layer)); - } -#endif + ggml_context *ctx = mctx->ctx_b.get(); + const int head_size = layer->ne[0]; const int rope_dim = head_size / 2; - ggml_tensor *roped_layer = - tensor_assign_buffers(ggml_rope(gctx, layer, position_ids, rope_dim, (int)RopeType::GPTJ, 0)); // [s, #h, d] - - ggml_tensor *roped_layer_view = tensor_assign_buffers( - ggml_view_3d(gctx, roped_layer, rope_dim, roped_layer->ne[1], roped_layer->ne[2], roped_layer->nb[1], - roped_layer->nb[2], rope_dim * roped_layer->nb[0])); // [s, #h, d/2] - ggml_tensor *layer_view = - tensor_assign_buffers(ggml_view_3d(gctx, layer, rope_dim, layer->ne[1], layer->ne[2], layer->nb[1], - layer->nb[2], rope_dim * layer->nb[0])); // [s, #h, d/2] + ggml_tensor *half_layer_view = + ggml_view_3d(ctx, layer, rope_dim, layer->ne[1], layer->ne[2], layer->nb[1], layer->nb[2], 0); - ggml_build_forward_expand(&ctx->gf, ggml_cpy(gctx, layer_view, roped_layer_view)); + // TODO: metal + ggml_tensor *half_layer = half_layer_view; + if (!ggml_backend_is_cpu(mctx->backend.get())) { + half_layer = ggml_cont(ctx, half_layer); + } + ggml_tensor *roped_half_layer = + ggml_rope_ext_inplace(ctx, half_layer, position_ids, nullptr, rope_dim, (int)RopeType::GPTJ, 0, 0, rope_theta, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] + if (!ggml_backend_is_cpu(mctx->backend.get())) { + roped_half_layer = ggml_cpy(ctx, roped_half_layer, half_layer_view); + } + ggml_build_forward_expand(mctx->gf, roped_half_layer); - return roped_layer; + return layer; } -static ggml_tensor *apply_rotary_emb(ModelContext *ctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, - RopeType rope_type, float rope_theta, int dim_scale) { +static ggml_tensor 
*apply_rotary_emb(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, + RopeType rope_type, float rope_theta) { switch (rope_type) { case RopeType::GPTJ: case RopeType::NEOX: - return apply_rotary_emb_basic(ctx, layer, position_ids, n_ctx, rope_type, rope_theta, dim_scale); + return apply_rotary_emb_basic(mctx, layer, position_ids, n_ctx, rope_type, rope_theta); case RopeType::CHATGLM: - return apply_rotary_emb_glm(ctx, layer, position_ids, n_ctx); - // case RopeType::CHATGLM2: - // return apply_rotary_emb_glm2(ctx, layer, position_ids); + return apply_rotary_emb_glm(mctx, layer, position_ids, n_ctx); + case RopeType::CHATGLM2: + return apply_rotary_emb_glm2(mctx, layer, position_ids, rope_theta); case RopeType::DISABLED: return layer; default: @@ -630,50 +614,16 @@ static ggml_tensor *apply_rotary_emb(ModelContext *ctx, ggml_tensor *layer, ggml } } -static inline ggml_tensor *apply_attention_mask_causal(ModelContext *ctx, ggml_tensor *attn_scores, int n_past) { - return tensor_assign_buffers(ggml_diag_mask_inf_inplace(ctx->ctx_b.get(), attn_scores, n_past)); -} - -static ggml_tensor *apply_attention_mask_glm(ModelContext *ctx, ggml_tensor *attn_scores, int n_past) { - // attn_scores: [#h, s, kvs] - // semantic: attn_scores[:, :-1, -1] = -inf - ggml_context *gctx = ctx->ctx_b.get(); - const int kvlen = attn_scores->ne[0]; - const int qlen = attn_scores->ne[1]; - const int num_attention_heads = attn_scores->ne[2]; - ggml_tensor *inf = ggml_new_tensor_3d(gctx, attn_scores->type, 1, qlen - 1, num_attention_heads); - ggml_set_f32(inf, -INFINITY); - tensor_to_device(inf); // TODO: optimize - ggml_tensor *masked_attn_scores = - tensor_assign_buffers(ggml_view_3d(gctx, attn_scores, 1, qlen - 1, num_attention_heads, attn_scores->nb[1], - attn_scores->nb[2], (kvlen - 1) * attn_scores->nb[0])); - ggml_build_forward_expand(&ctx->gf, ggml_cpy(gctx, inf, masked_attn_scores)); - return attn_scores; -} - -static ggml_tensor *apply_attention_mask(ModelContext *ctx, ggml_tensor *attn_scores, int n_past, - AttentionMaskType attn_mask_type) { - switch (attn_mask_type) { - case AttentionMaskType::CAUSAL: - return apply_attention_mask_causal(ctx, attn_scores, n_past); - case AttentionMaskType::CHATGLM: - return apply_attention_mask_glm(ctx, attn_scores, n_past); - default: - CHATGLM_THROW << "Unknown attention mask type " << (int)attn_mask_type; - } -} - -ggml_tensor *BasicAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *position_ids, - int n_past, int n_ctx) const { - ggml_context *gctx = ctx->ctx_b.get(); +ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, + ggml_tensor *position_ids, int n_past, int n_ctx) const { + ggml_context *ctx = mctx->ctx_b.get(); const int hidden_size = hidden_states->ne[0]; const int qlen = hidden_states->ne[1]; const int head_size = hidden_size / num_attention_heads; const int num_shared_q_heads = num_attention_heads / num_kv_heads; - const bool is_gqa = num_shared_q_heads > 1; - ggml_tensor *qkv = query_key_value.forward(ctx, hidden_states); // [sq, (#h + 2 * #kvh) * d] + ggml_tensor *qkv = query_key_value.forward(mctx, hidden_states); // [sq, (#h + 2 * #kvh) * d] // split mixed qkv into separate query, key and value ggml_tensor *query_layer; // [s, #h, d] @@ -681,138 +631,119 @@ ggml_tensor *BasicAttention::forward(ModelContext *ctx, ggml_tensor *hidden_stat ggml_tensor *value_layer; // [s, #kvh, d] if (interleaved_qkv) { - CHATGLM_CHECK(!is_gqa) << "interleaved 
qkv is not supported for GQA"; - query_layer = ggml_view_3d(gctx, qkv, head_size, num_attention_heads, qlen, + CHATGLM_CHECK(num_shared_q_heads == 1) << "interleaved qkv is not supported for GQA"; + query_layer = ggml_view_3d(ctx, qkv, head_size, num_attention_heads, qlen, 3 * head_size * ggml_element_size(qkv), qkv->nb[1], 0); - key_layer = - ggml_view_3d(gctx, qkv, head_size, num_attention_heads, qlen, 3 * head_size * ggml_element_size(qkv), - qkv->nb[1], head_size * ggml_element_size(qkv)); + key_layer = ggml_view_3d(ctx, qkv, head_size, num_attention_heads, qlen, 3 * head_size * ggml_element_size(qkv), + qkv->nb[1], head_size * ggml_element_size(qkv)); value_layer = - ggml_view_3d(gctx, qkv, head_size, num_attention_heads, qlen, 3 * head_size * ggml_element_size(qkv), + ggml_view_3d(ctx, qkv, head_size, num_attention_heads, qlen, 3 * head_size * ggml_element_size(qkv), qkv->nb[1], 2 * head_size * ggml_element_size(qkv)); } else { - query_layer = ggml_view_3d(gctx, qkv, head_size, num_attention_heads, qlen, head_size * ggml_element_size(qkv), + query_layer = ggml_view_3d(ctx, qkv, head_size, num_attention_heads, qlen, head_size * ggml_element_size(qkv), qkv->nb[1], 0); - key_layer = ggml_view_3d(gctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv), + key_layer = ggml_view_3d(ctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv), qkv->nb[1], hidden_size * ggml_element_size(qkv)); - value_layer = ggml_view_3d(gctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv), + value_layer = ggml_view_3d(ctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv), qkv->nb[1], (hidden_size + head_size * num_kv_heads) * ggml_element_size(qkv)); } - query_layer = apply_rotary_emb(ctx, query_layer, position_ids, n_ctx, rope_type, rope_theta, rope_dim_scale); - key_layer = apply_rotary_emb(ctx, key_layer, position_ids, n_ctx, rope_type, rope_theta, rope_dim_scale); + query_layer = apply_rotary_emb(mctx, query_layer, position_ids, n_ctx, rope_type, rope_theta); + key_layer = apply_rotary_emb(mctx, key_layer, position_ids, n_ctx, rope_type, rope_theta); - query_layer = tensor_assign_buffers(ggml_cont(gctx, ggml_permute(gctx, query_layer, 0, 2, 1, 3))); // [#h, s, d] + query_layer = ggml_cont(ctx, ggml_permute(ctx, query_layer, 0, 2, 1, 3)); // [#h, s, d] if (num_shared_q_heads > 1) { - query_layer = tensor_assign_buffers(ggml_reshape_3d(gctx, query_layer, head_size, num_shared_q_heads * qlen, - num_kv_heads)); // [#kvh, (#h/#kvh) * s, d] + query_layer = ggml_reshape_3d(ctx, query_layer, head_size, num_shared_q_heads * qlen, + num_kv_heads); // [#kvh, (#h/#kvh) * s, d] } - key_layer = tensor_assign_buffers(ggml_permute(gctx, key_layer, 0, 2, 1, 3)); // [#kvh, s, d] - value_layer = tensor_assign_buffers(ggml_permute(gctx, value_layer, 1, 2, 0, 3)); // [#kvh, d, s] + key_layer = ggml_permute(ctx, key_layer, 0, 2, 1, 3); // [#kvh, s, d] + value_layer = ggml_permute(ctx, value_layer, 1, 2, 0, 3); // [#kvh, d, s] // store key & value to cache - ggml_tensor *k_cache_view = tensor_assign_buffers( - ggml_view_3d(gctx, k_cache, head_size, qlen, num_kv_heads, k_cache->nb[1], k_cache->nb[2], - (num_virtual_tokens + n_past) * head_size * ggml_element_size(k_cache))); // [#kvh, s, d] - ggml_build_forward_expand(&ctx->gf, ggml_cpy(gctx, key_layer, k_cache_view)); + ggml_tensor *k_cache_view = + ggml_view_3d(ctx, k_cache, head_size, qlen, num_kv_heads, k_cache->nb[1], k_cache->nb[2], + (num_virtual_tokens + n_past) * head_size * 
ggml_element_size(k_cache)); // [#kvh, s, d] + ggml_build_forward_expand(mctx->gf, ggml_cpy(ctx, key_layer, k_cache_view)); ggml_tensor *v_cache_view = - tensor_assign_buffers(ggml_view_3d(gctx, v_cache, qlen, head_size, num_kv_heads, v_cache->nb[1], v_cache->nb[2], - (num_virtual_tokens + n_past) * ggml_element_size(v_cache))); // [#kvh, d, s] - ggml_build_forward_expand(&ctx->gf, ggml_cpy(gctx, value_layer, v_cache_view)); + ggml_view_3d(ctx, v_cache, qlen, head_size, num_kv_heads, v_cache->nb[1], v_cache->nb[2], + (num_virtual_tokens + n_past) * ggml_element_size(v_cache)); // [#kvh, d, s] + ggml_build_forward_expand(mctx->gf, ggml_cpy(ctx, value_layer, v_cache_view)); // concat key & value with past kv - key_layer = tensor_assign_buffers(ggml_view_3d(gctx, k_cache, head_size, num_virtual_tokens + n_past + qlen, - num_kv_heads, k_cache->nb[1], k_cache->nb[2], - 0)); // [#kvh, kvs, d] - value_layer = tensor_assign_buffers(ggml_view_3d(gctx, v_cache, num_virtual_tokens + n_past + qlen, head_size, - num_kv_heads, v_cache->nb[1], v_cache->nb[2], - 0)); // [#kvh, d, kvs] + key_layer = ggml_view_3d(ctx, k_cache, head_size, num_virtual_tokens + n_past + qlen, num_kv_heads, k_cache->nb[1], + k_cache->nb[2], + 0); // [#kvh, kvs, d] + value_layer = ggml_view_3d(ctx, v_cache, num_virtual_tokens + n_past + qlen, head_size, num_kv_heads, + v_cache->nb[1], v_cache->nb[2], + 0); // [#kvh, d, kvs] // attention - ggml_tensor *attn_scores = - tensor_assign_buffers(ggml_mul_mat(gctx, key_layer, query_layer)); // [#kvh, (#h/#kvh) * s, kvs] - attn_scores = - tensor_assign_buffers(ggml_scale_inplace(gctx, attn_scores, ggml_new_f32(gctx, 1.f / std::sqrt(head_size)))); - if (use_alibi) { - attn_scores = tensor_assign_buffers(ggml_alibi(gctx, attn_scores, n_past, num_attention_heads, 8)); - } + ggml_tensor *attn_scores = ggml_mul_mat(ctx, key_layer, query_layer); // [#kvh, (#h/#kvh) * s, kvs] + attn_scores = ggml_scale_inplace(ctx, attn_scores, 1.f / std::sqrt(head_size)); + if (n_past == 0) { // build attention mask for context input if (num_shared_q_heads > 1) { - attn_scores = ggml_reshape_3d(gctx, attn_scores, num_virtual_tokens + n_past + qlen, qlen, + attn_scores = ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, qlen, num_attention_heads); // [#h, s, kvs] } - attn_scores = apply_attention_mask(ctx, attn_scores, num_virtual_tokens + n_past, attn_mask_type); + + if (attn_mask_type == AttentionMaskType::CAUSAL) { + attn_scores = ggml_diag_mask_inf_inplace(ctx, attn_scores, num_virtual_tokens + n_past); + } else { + attn_scores = ggml_add_inplace(ctx, attn_scores, attention_mask); + } + if (num_shared_q_heads > 1) { attn_scores = - ggml_reshape_3d(gctx, attn_scores, num_virtual_tokens + n_past + qlen, num_shared_q_heads * qlen, + ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, num_shared_q_heads * qlen, num_kv_heads); // [#kvh, (#h/#kvh) * s, kvs] } } - ggml_tensor *attn_probs = - tensor_assign_buffers(ggml_soft_max_inplace(gctx, attn_scores)); // [#kvh, (#h/#kvh) * s, kvs] - ggml_tensor *context_layer = - tensor_assign_buffers(ggml_mul_mat(gctx, value_layer, attn_probs)); // [#kvh, (#h/#kvh) * s, d] + ggml_tensor *attn_probs = ggml_soft_max_inplace(ctx, attn_scores); // [#kvh, (#h/#kvh) * s, kvs] + + ggml_tensor *context_layer = ggml_mul_mat(ctx, value_layer, attn_probs); // [#kvh, (#h/#kvh) * s, d] if (num_shared_q_heads > 1) { - context_layer = ggml_reshape_3d(gctx, context_layer, head_size, qlen, + context_layer = ggml_reshape_3d(ctx, context_layer, head_size, 
qlen, num_attention_heads); // [#h, s, d] } - context_layer = tensor_assign_buffers(ggml_cont(gctx, ggml_permute(gctx, context_layer, 0, 2, 1, 3))); // [s, #h, d] - context_layer = tensor_assign_buffers(ggml_reshape_2d(gctx, context_layer, hidden_size, qlen)); // [s, #h * d] + context_layer = ggml_cont(ctx, ggml_permute(ctx, context_layer, 0, 2, 1, 3)); // [s, #h, d] + context_layer = ggml_reshape_2d(ctx, context_layer, hidden_size, qlen); // [s, #h * d] - ggml_tensor *attn_output = dense.forward(ctx, context_layer); + ggml_tensor *attn_output = dense.forward(mctx, context_layer); return attn_output; } -BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config, size_t mem_size, size_t scratch_size, size_t num_weights) - : config(config) { - ctx_.dtype = config.dtype; - const size_t ctx_w_size = num_weights * ggml_tensor_overhead(); - const size_t ctx_kv_size = 2 * config.num_hidden_layers * - ((config.max_length + config.num_virtual_tokens) * config.hidden_size / - config.num_attention_heads * config.num_kv_heads * ggml_type_size(GGML_TYPE_F16) + - ggml_tensor_overhead()); - ctx_.ctx_w = make_unique_ggml_context(ctx_w_size, nullptr, true); - ctx_.ctx_kv = make_unique_ggml_context(ctx_kv_size + 1 * MB, nullptr, false); // 1MB extra for MPS - - ctx_.compute_buffer.resize(mem_size); - ctx_.scratch_buffer.resize(scratch_size); - ctx_.scratch = {0, ctx_.scratch_buffer.size(), ctx_.scratch_buffer.data()}; -#ifdef GGML_USE_CUBLAS - ggml_cuda_set_scratch_size(scratch_size); -#endif -} +BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config) + : config(config), mctx_(std::make_unique(config.dtype)) {} ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector &input_ids, int n_past, int n_ctx, - int n_threads, bool is_decoding) { - ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false); - ctx_.gf = {}; + bool is_decoding) { + mctx_->ctx_b = make_unique_ggml_context(mctx_->compute_meta.size(), mctx_->compute_meta.data(), true); + mctx_->gf = ggml_new_graph(mctx_->ctx_b.get()); - if (n_threads <= 0) { - n_threads = get_default_num_threads(); // default thread num - } - int curr_input_ids_size = input_ids.size() - n_past; - if (curr_input_ids_size >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = 1; // use 1 thread if BLAS is enabled - } + const int qlen = input_ids.size() - n_past; - ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size); - memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids)); + ggml_tensor *curr_input_ids = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_I32, qlen); + ggml_set_name(curr_input_ids, "input_ids"); + ggml_set_input(curr_input_ids); - ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding); - lm_logits->backend = GGML_BACKEND_CPU; + ggml_tensor *lm_logits = forward(mctx_.get(), curr_input_ids, n_past, n_ctx, is_decoding); - ggml_build_forward_expand(&ctx_.gf, lm_logits); -#ifdef GGML_USE_METAL - ggml_metal_graph_compute(ctx_.ctx_metal.get(), &ctx_.gf); -#else - ggml_graph_compute_helper(ctx_.work_buffer, &ctx_.gf, n_threads); -#endif + ggml_build_forward_expand(mctx_->gf, lm_logits); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + + ggml_backend_tensor_set(curr_input_ids, input_ids.data() + n_past, 0, qlen * sizeof(int)); + + set_graph_inputs(qlen, n_past, n_ctx); + + set_default_num_threads(mctx_->backend.get(), qlen); + 
CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); #ifdef GGML_PERF - ggml_graph_print(&ctx_.gf); + ggml_graph_print(mctx_->gf); #endif return lm_logits; @@ -820,10 +751,12 @@ ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector int BaseModelForCausalLM::generate_next_token(const std::vector &input_ids, const GenerationConfig &gen_config, int n_past, int n_ctx) { - ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true); + ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, true); + CHATGLM_CHECK(ggml_n_dims(lm_logits) == 1); int vocab_size = lm_logits->ne[0]; - float *next_token_logits = (float *)lm_logits->data; + std::vector next_token_logits(vocab_size); + ggml_backend_tensor_get(lm_logits, next_token_logits.data(), 0, vocab_size * sizeof(float)); // check nan for (int i = 0; i < vocab_size; i++) { @@ -832,7 +765,7 @@ int BaseModelForCausalLM::generate_next_token(const std::vector &input_ids, // logits pre-process if (gen_config.repetition_penalty != 1.f) { - sampling_repetition_penalty(next_token_logits, next_token_logits + vocab_size, input_ids, + sampling_repetition_penalty(next_token_logits.data(), next_token_logits.data() + vocab_size, input_ids, gen_config.repetition_penalty); } @@ -840,7 +773,8 @@ int BaseModelForCausalLM::generate_next_token(const std::vector &input_ids, if (gen_config.do_sample) { // temperature sampling if (gen_config.temperature > 0) { - sampling_temperature(next_token_logits, next_token_logits + vocab_size, gen_config.temperature); + sampling_temperature(next_token_logits.data(), next_token_logits.data() + vocab_size, + gen_config.temperature); } std::vector token_scores(vocab_size); @@ -870,11 +804,12 @@ int BaseModelForCausalLM::generate_next_token(const std::vector &input_ids, thread_local std::random_device rd; thread_local std::mt19937 gen(rd()); - std::discrete_distribution<> dist(next_token_logits, next_token_logits + token_scores.size()); + std::discrete_distribution<> dist(next_token_logits.data(), next_token_logits.data() + token_scores.size()); next_token_id = token_scores[dist(gen)].id; } else { // greedy search - next_token_id = std::max_element(next_token_logits, next_token_logits + vocab_size) - next_token_logits; + next_token_id = + std::max_element(next_token_logits.begin(), next_token_logits.end()) - next_token_logits.begin(); } return next_token_id; @@ -1127,74 +1062,107 @@ std::string ChatGLMTokenizer::postprocess(const std::string &text) { return output; } -ggml_tensor *GLMBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *position_ids, int n_past, - int n_ctx) const { - ggml_context *gctx = ctx->ctx_b.get(); +ggml_tensor *GLMBlock::forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, + ggml_tensor *position_ids, int n_past, int n_ctx) const { + ggml_context *ctx = mctx->ctx_b.get(); - ggml_tensor *alpha = ggml_new_f32(gctx, alpha_value); + ggml_tensor *attn_input = input_layernorm.forward(mctx, hidden_states); + ggml_tensor *attn_output = attention.forward(mctx, attn_input, attention_mask, position_ids, n_past, n_ctx); + ggml_build_forward_expand(mctx->gf, attn_output); + attn_input = ggml_scale_inplace(ctx, attn_input, alpha); + hidden_states = ggml_add_inplace(ctx, attn_input, attn_output); - ggml_tensor *attn_input = input_layernorm.forward(ctx, hidden_states); - ggml_tensor *attn_output = attention.forward(ctx, attn_input, position_ids, 
n_past, n_ctx); - ggml_build_forward_expand(&ctx->gf, attn_output); - attn_input = tensor_assign_buffers(ggml_scale_inplace(gctx, attn_input, alpha)); - hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, attn_input, attn_output)); - - ggml_tensor *mlp_input = post_attention_layernorm.forward(ctx, hidden_states); - ggml_tensor *mlp_output = mlp.forward(ctx, mlp_input); - ggml_build_forward_expand(&ctx->gf, mlp_output); - mlp_input = tensor_assign_buffers(ggml_scale_inplace(gctx, mlp_input, alpha)); - ggml_tensor *output = tensor_assign_buffers(ggml_add_inplace(gctx, mlp_input, mlp_output)); + ggml_tensor *mlp_input = post_attention_layernorm.forward(mctx, hidden_states); + ggml_tensor *mlp_output = mlp.forward(mctx, mlp_input); + ggml_build_forward_expand(mctx->gf, mlp_output); + mlp_input = ggml_scale_inplace(ctx, mlp_input, alpha); + ggml_tensor *output = ggml_add_inplace(ctx, mlp_input, mlp_output); return output; } -ChatGLMForCausalLM::ChatGLMForCausalLM(const ModelConfig &config) - : BasicModelForCausalLM(config, MEM_SIZE, SCRATCH_SIZE, num_weights(config.num_hidden_layers)) { - state_dict_ = state_dict(); -} +ChatGLMForCausalLM::ChatGLMForCausalLM(const ModelConfig &config) : BasicModelForCausalLM(config) {} -void ChatGLMForCausalLM::load(ModelLoader &loader) { - for (auto &item : state_dict_) { +void ChatGLMForCausalLM::load_state_dict(const StateDict &sd) { + // TODO: handle metal + if (ggml_backend_is_cpu(mctx_->backend.get())) { + mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr( + ggml_backend_buffer_get_base(sd.buf.get()), ggml_backend_buffer_get_size(sd.buf.get()))); + } else { + mctx_->buf_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); + } + + StateDict self_sd = state_dict(); + for (auto &item : self_sd.kv) { const std::string &name = item.first; - ggml_tensor *tensor = item.second; - if (name != "lm_head.weight") { - loader.read_tensor(name, tensor); + ggml_tensor *self_weight = item.second; + ggml_tensor *ckpt_weight = sd.kv.at(name); + if (ggml_backend_is_cpu(mctx_->backend.get())) { + ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); + } else { + ggml_backend_tensor_set(self_weight, ckpt_weight->data, 0, ggml_nbytes(self_weight)); } } - lm_head.weight->data = transformer.word_embeddings.weight->data; // tied weight +} + +void ChatGLMForCausalLM::set_graph_inputs(int qlen, int n_past, int n_ctx) const { + set_graph_inputs(mctx_->gf, qlen, n_past, n_ctx); +} + +void ChatGLMForCausalLM::set_graph_inputs(ggml_cgraph *gf, int qlen, int n_past, int n_ctx) { + // attention_mask: [s, kvs] auto broadcast to [#h, s, kvs] + // semantic: attn_scores[:, :-1, -1] = -inf + if (n_past == 0) { + ggml_tensor *attention_mask = ggml_graph_get_tensor(gf, "attention_mask"); + const int kvlen = attention_mask->ne[0]; + std::vector attention_mask_buffer(qlen * kvlen, 0.f); + CHATGLM_CHECK(ggml_nbytes(attention_mask) == attention_mask_buffer.size() * sizeof(float)); + for (int i = 0; i < qlen - 1; i++) { + attention_mask_buffer[i * kvlen + (kvlen - 1)] = -INFINITY; + } + ggml_backend_tensor_set(attention_mask, attention_mask_buffer.data(), 0, + attention_mask_buffer.size() * sizeof(float)); + } - to_device(); + // position_ids: [2 * qlen] + ggml_tensor *position_ids = ggml_graph_get_tensor(gf, "position_ids"); + CHATGLM_CHECK(ggml_n_dims(position_ids) == 1 && position_ids->ne[0] == 2 * qlen) + << "invalid position ids size " << position_ids->ne[0]; - ctx_.weight_buffer 
= std::string_view(loader.data, loader.size); - ctx_.init_device_context(); + std::vector position_ids_buffer(position_ids->ne[0]); + for (int i = 0; i < qlen; i++) { + const int p = n_past + i; + position_ids_buffer[i] = std::min(p, n_ctx - 2); + position_ids_buffer[qlen + i] = std::max(p - (n_ctx - 2), 0); + } + ggml_backend_tensor_set(position_ids, position_ids_buffer.data(), 0, position_ids_buffer.size() * sizeof(int)); } StateDict ChatGLMForCausalLM::state_dict() const { StateDict sd; - sd.reserve(num_weights(config.num_hidden_layers)); - sd.emplace_back("transformer.word_embeddings.weight", transformer.word_embeddings.weight); + sd.kv.emplace("transformer.word_embeddings.weight", transformer.word_embeddings.weight); for (int i = 0; i < config.num_hidden_layers; i++) { std::string layer_prefix = "transformer.layers." + std::to_string(i) + '.'; - sd.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); - sd.emplace_back(layer_prefix + "input_layernorm.bias", transformer.layers[i].input_layernorm.bias); - sd.emplace_back(layer_prefix + "attention.query_key_value.weight", - transformer.layers[i].attention.query_key_value.weight); - sd.emplace_back(layer_prefix + "attention.query_key_value.bias", - transformer.layers[i].attention.query_key_value.bias); - sd.emplace_back(layer_prefix + "attention.dense.weight", transformer.layers[i].attention.dense.weight); - sd.emplace_back(layer_prefix + "attention.dense.bias", transformer.layers[i].attention.dense.bias); - sd.emplace_back(layer_prefix + "post_attention_layernorm.weight", - transformer.layers[i].post_attention_layernorm.weight); - sd.emplace_back(layer_prefix + "post_attention_layernorm.bias", - transformer.layers[i].post_attention_layernorm.bias); - sd.emplace_back(layer_prefix + "mlp.dense_h_to_4h.weight", transformer.layers[i].mlp.dense_h_to_4h.weight); - sd.emplace_back(layer_prefix + "mlp.dense_h_to_4h.bias", transformer.layers[i].mlp.dense_h_to_4h.bias); - sd.emplace_back(layer_prefix + "mlp.dense_4h_to_h.weight", transformer.layers[i].mlp.dense_4h_to_h.weight); - sd.emplace_back(layer_prefix + "mlp.dense_4h_to_h.bias", transformer.layers[i].mlp.dense_4h_to_h.bias); - } - sd.emplace_back("transformer.final_layernorm.weight", transformer.final_layernorm.weight); - sd.emplace_back("transformer.final_layernorm.bias", transformer.final_layernorm.bias); - sd.emplace_back("lm_head.weight", lm_head.weight); + sd.kv.emplace(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); + sd.kv.emplace(layer_prefix + "input_layernorm.bias", transformer.layers[i].input_layernorm.bias); + sd.kv.emplace(layer_prefix + "attention.query_key_value.weight", + transformer.layers[i].attention.query_key_value.weight); + sd.kv.emplace(layer_prefix + "attention.query_key_value.bias", + transformer.layers[i].attention.query_key_value.bias); + sd.kv.emplace(layer_prefix + "attention.dense.weight", transformer.layers[i].attention.dense.weight); + sd.kv.emplace(layer_prefix + "attention.dense.bias", transformer.layers[i].attention.dense.bias); + sd.kv.emplace(layer_prefix + "post_attention_layernorm.weight", + transformer.layers[i].post_attention_layernorm.weight); + sd.kv.emplace(layer_prefix + "post_attention_layernorm.bias", + transformer.layers[i].post_attention_layernorm.bias); + sd.kv.emplace(layer_prefix + "mlp.dense_h_to_4h.weight", transformer.layers[i].mlp.dense_h_to_4h.weight); + sd.kv.emplace(layer_prefix + "mlp.dense_h_to_4h.bias", 
transformer.layers[i].mlp.dense_h_to_4h.bias); + sd.kv.emplace(layer_prefix + "mlp.dense_4h_to_h.weight", transformer.layers[i].mlp.dense_4h_to_h.weight); + sd.kv.emplace(layer_prefix + "mlp.dense_4h_to_h.bias", transformer.layers[i].mlp.dense_4h_to_h.bias); + } + sd.kv.emplace("transformer.final_layernorm.weight", transformer.final_layernorm.weight); + sd.kv.emplace("transformer.final_layernorm.bias", transformer.final_layernorm.bias); return sd; } @@ -1263,83 +1231,97 @@ bool ChatGLM2Tokenizer::is_special_id(int id) const { id == eop_token_id; } -ChatGLM2ForCausalLM::ChatGLM2ForCausalLM(const ModelConfig &config) - : BasicModelForCausalLM(config, MEM_SIZE, SCRATCH_SIZE, num_weights(config.num_hidden_layers)) { - state_dict_ = state_dict(); -} +ChatGLM2ForCausalLM::ChatGLM2ForCausalLM(const ModelConfig &config) : BasicModelForCausalLM(config) {} -void ChatGLM2ForCausalLM::load(ModelLoader &loader) { - if (config.num_virtual_tokens > 0) { - const int head_size = config.hidden_size / config.num_attention_heads; - auto prefix_cache_ctx = make_unique_ggml_context( - ggml_tensor_overhead() + config.num_hidden_layers * 2 * config.num_kv_heads * config.num_virtual_tokens * - head_size * ggml_type_size(GGML_TYPE_F16), - nullptr, false); - ggml_tensor *past_key_values = - ggml_new_tensor_4d(prefix_cache_ctx.get(), GGML_TYPE_F16, head_size, config.num_virtual_tokens, - config.num_kv_heads, config.num_hidden_layers * 2); - CHATGLM_CHECK(ggml_used_mem(prefix_cache_ctx.get()) == ggml_get_mem_size(prefix_cache_ctx.get())) - << "corrupted prefix cache"; - loader.read_tensor("past_key_values", past_key_values); - load_prefix_cache(past_key_values); +void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { + if (ggml_backend_is_cpu(mctx_->backend.get())) { + mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr( + ggml_backend_buffer_get_base(sd.buf.get()), ggml_backend_buffer_get_size(sd.buf.get()))); + } else { + mctx_->buf_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); } - std::unordered_map glu_name_map; - for (int i = 0; i < config.num_hidden_layers; i++) { - std::string layer_prefix = "transformer.encoder.layers." 
+ std::to_string(i) + '.'; - glu_name_map.emplace(layer_prefix + "mlp.gate_proj.weight", layer_prefix + "mlp.dense_h_to_4h.weight"); - glu_name_map.emplace(layer_prefix + "mlp.up_proj.weight", layer_prefix + "mlp.dense_h_to_4h.weight"); + if (config.num_virtual_tokens > 0) { + ggml_tensor *past_key_values = sd.kv.at("past_key_values"); + load_prefix_cache(past_key_values); } - for (auto it = state_dict_.begin(); it != state_dict_.end(); it++) { + auto self_sd = state_dict(); + for (auto it = sd.kv.begin(); it != sd.kv.end(); it++) { const std::string &name = it->first; - ggml_tensor *tensor = it->second; - - auto glu_it = glu_name_map.find(name); - if (glu_it != glu_name_map.end()) { - // for compatibility: load gate_proj & up_proj from dense_h_to_4h - const std::string &dense_h_to_4h_name = glu_it->second; - ggml_tensor *gate_proj = tensor; - it++; - CHATGLM_CHECK(glu_name_map.at(it->first) == dense_h_to_4h_name) << "corrupted glu weights"; - ggml_tensor *up_proj = it->second; - - int64_t target_ne[4]{gate_proj->ne[0], gate_proj->ne[1] + up_proj->ne[1]}; - loader.checked_read_tensor_meta(dense_h_to_4h_name, gate_proj->n_dims, target_ne, gate_proj->type); - gate_proj->data = loader.read_tensor_data(ggml_nbytes(gate_proj)); - up_proj->data = loader.read_tensor_data(ggml_nbytes(up_proj)); + ggml_tensor *ckpt_weight = it->second; + + if (name == "past_key_values") { + continue; + } + + size_t pos = name.rfind("mlp.dense_h_to_4h.weight"); + if (pos != std::string::npos) { + // split dense_h_to_4h to gate & up + std::string gate_name = name.substr(0, pos) + "mlp.gate_proj.weight"; + ggml_tensor *gate_proj = self_sd.kv.at(gate_name); + + std::string up_name = name.substr(0, pos) + "mlp.up_proj.weight"; + ggml_tensor *up_proj = self_sd.kv.at(up_name); + + CHATGLM_CHECK(ggml_nbytes(ckpt_weight) == ggml_nbytes(gate_proj) + ggml_nbytes(up_proj)); + + if (ggml_backend_is_cpu(mctx_->backend.get())) { + ggml_backend_tensor_alloc(mctx_->buf_w.get(), gate_proj, ckpt_weight->data); + ggml_backend_tensor_alloc(mctx_->buf_w.get(), up_proj, + (char *)ckpt_weight->data + ggml_nbytes(gate_proj)); + } else { + ggml_backend_tensor_set(gate_proj, ckpt_weight->data, 0, ggml_nbytes(gate_proj)); + ggml_backend_tensor_set(up_proj, (char *)ckpt_weight->data + ggml_nbytes(gate_proj), 0, + ggml_nbytes(up_proj)); + } } else { - loader.read_tensor(name, tensor); + // normal weight + ggml_tensor *self_weight = self_sd.kv.at(name); + if (ggml_backend_is_cpu(mctx_->backend.get())) { + ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); + } else { + ggml_backend_tensor_set(self_weight, ckpt_weight->data, 0, ggml_nbytes(self_weight)); + } } } +} - to_device(); +void ChatGLM2ForCausalLM::set_graph_inputs(int qlen, int n_past, int n_ctx) const { + set_graph_inputs(mctx_->gf, qlen, n_past, n_ctx); +} + +void ChatGLM2ForCausalLM::set_graph_inputs(ggml_cgraph *gf, int qlen, int n_past, int n_ctx) { + ggml_tensor *position_ids = ggml_graph_get_tensor(gf, "position_ids"); + CHATGLM_CHECK(ggml_n_dims(position_ids) == 1 && position_ids->ne[0] == qlen) + << "invalid position ids size " << position_ids->ne[0]; - ctx_.weight_buffer = std::string_view(loader.data, loader.size); - ctx_.init_device_context(); + std::vector position_ids_buffer(position_ids->ne[0]); + std::iota(position_ids_buffer.begin(), position_ids_buffer.end(), n_past); + ggml_backend_tensor_set(position_ids, position_ids_buffer.data(), 0, position_ids_buffer.size() * sizeof(int)); } StateDict ChatGLM2ForCausalLM::state_dict() const { StateDict 
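// [annotation — not part of the patch] A self-contained sketch of the weight split performed in
// ChatGLM2ForCausalLM::load_state_dict above: the checkpoint's fused mlp.dense_h_to_4h tensor is
// gate_proj followed by up_proj along the output dimension, so the first ggml_nbytes(gate_proj)
// bytes go to gate_proj and the remainder to up_proj. Toy shapes stand in for the mmapped
// checkpoint tensor; no ggml calls are made.
#include <cassert>
#include <cstring>
#include <vector>

int main() {
    // Toy shapes: hidden_size = 4, intermediate_size = 3, row-major [out, in].
    const int hidden = 4, inter = 3;

    // Fused checkpoint weight: gate rows first, then up rows.
    std::vector<float> fused(2 * inter * hidden);
    for (size_t i = 0; i < fused.size(); i++) fused[i] = float(i);

    std::vector<float> gate(inter * hidden), up(inter * hidden);
    const size_t gate_bytes = gate.size() * sizeof(float);

    // Same arithmetic as the loader: [0, nbytes(gate)) -> gate_proj, the rest -> up_proj.
    std::memcpy(gate.data(), fused.data(), gate_bytes);
    std::memcpy(up.data(), (const char *)fused.data() + gate_bytes, up.size() * sizeof(float));

    assert(gate.front() == 0.f && gate.back() == float(inter * hidden - 1));
    assert(up.front() == float(inter * hidden) && up.back() == float(2 * inter * hidden - 1));
    return 0;
}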
sd; - sd.reserve(num_weights(config.num_hidden_layers)); - sd.emplace_back("transformer.embedding.word_embeddings.weight", transformer.word_embeddings.weight); + sd.kv.emplace("transformer.embedding.word_embeddings.weight", transformer.word_embeddings.weight); for (int i = 0; i < config.num_hidden_layers; i++) { std::string layer_prefix = "transformer.encoder.layers." + std::to_string(i) + '.'; - sd.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); - sd.emplace_back(layer_prefix + "self_attention.query_key_value.weight", - transformer.layers[i].attention.query_key_value.weight); - sd.emplace_back(layer_prefix + "self_attention.query_key_value.bias", - transformer.layers[i].attention.query_key_value.bias); - sd.emplace_back(layer_prefix + "self_attention.dense.weight", transformer.layers[i].attention.dense.weight); - sd.emplace_back(layer_prefix + "post_attention_layernorm.weight", - transformer.layers[i].post_attention_layernorm.weight); - sd.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight); - sd.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight); + sd.kv.emplace(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); + sd.kv.emplace(layer_prefix + "self_attention.query_key_value.weight", + transformer.layers[i].attention.query_key_value.weight); + sd.kv.emplace(layer_prefix + "self_attention.query_key_value.bias", + transformer.layers[i].attention.query_key_value.bias); + sd.kv.emplace(layer_prefix + "self_attention.dense.weight", transformer.layers[i].attention.dense.weight); + sd.kv.emplace(layer_prefix + "post_attention_layernorm.weight", + transformer.layers[i].post_attention_layernorm.weight); + sd.kv.emplace(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight); + sd.kv.emplace(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight); // for compatibility - sd.emplace_back(layer_prefix + "mlp.dense_4h_to_h.weight", transformer.layers[i].mlp.down_proj.weight); + sd.kv.emplace(layer_prefix + "mlp.dense_4h_to_h.weight", transformer.layers[i].mlp.down_proj.weight); } - sd.emplace_back("transformer.encoder.final_layernorm.weight", transformer.final_layernorm.weight); - sd.emplace_back("transformer.output_layer.weight", lm_head.weight); + sd.kv.emplace("transformer.encoder.final_layernorm.weight", transformer.final_layernorm.weight); + sd.kv.emplace("transformer.output_layer.weight", lm_head.weight); return sd; } @@ -1519,245 +1501,6 @@ void ChatGLM3Tokenizer::truncate(std::vector &ids, int max_length) { } } -// ===== Baichuan ===== - -BaichuanTokenizer::BaichuanTokenizer(std::string_view serialized_model_proto) { - const auto status = sp.LoadFromSerializedProto(serialized_model_proto); - CHATGLM_CHECK(status.ok()) << status.ToString(); -} - -std::vector BaichuanTokenizer::encode(const std::string &text, int max_length) const { - std::vector ids; - sp.Encode(text, &ids); - truncate(ids, max_length); - return ids; -} - -std::string BaichuanTokenizer::decode(const std::vector &ids, bool skip_special_tokens) const { - CHATGLM_CHECK(skip_special_tokens) << "unimplemented"; - std::vector normal_ids(ids); - normal_ids.erase(std::remove_if(normal_ids.begin(), normal_ids.end(), [this](int id) { return is_special_id(id); }), - normal_ids.end()); - - std::string text; - sp.Decode(normal_ids, &text); - return text; -} - -std::vector BaichuanTokenizer::apply_chat_template(const 
std::vector &messages, - int max_length) const { - check_chat_messages(messages); - std::vector user_assistant_messages = filter_user_assistant_messages(messages); - - std::vector ids; - ids.reserve(max_length); - for (const auto &msg : user_assistant_messages) { - ids.push_back((msg.role == ChatMessage::ROLE_USER) ? USER_TOKEN_ID : ASSISTANT_TOKEN_ID); - std::vector content_ids = encode(msg.content, max_length); - ids.insert(ids.end(), content_ids.begin(), content_ids.end()); - } - ids.push_back(ASSISTANT_TOKEN_ID); - - truncate(ids, max_length); - return ids; -} - -bool BaichuanTokenizer::is_special_id(int id) const { - return id == bos_token_id || id == eos_token_id || id == pad_token_id; -} - -void BaichuanTokenizer::truncate(std::vector &ids, int max_length) { - if ((int)ids.size() > max_length) { - ids.erase(ids.begin(), ids.end() - max_length); - } -} - -// ===== Baichuan-7B ===== - -Baichuan7BForCausalLM::Baichuan7BForCausalLM(const ModelConfig &config) - : BasicModelForCausalLM(config, MEM_SIZE, SCRATCH_SIZE, num_weights(config.num_hidden_layers)) { - state_dict_ = state_dict(); -} - -void Baichuan7BForCausalLM::load(ModelLoader &loader) { - for (auto &item : state_dict_) { - const std::string &name = item.first; - ggml_tensor *tensor = item.second; - loader.read_tensor(name, tensor); - } - - to_device(); - - ctx_.weight_buffer = std::string_view(loader.data, loader.size); - ctx_.init_device_context(); -} - -StateDict Baichuan7BForCausalLM::state_dict() const { - StateDict sd; - sd.reserve(num_weights(config.num_hidden_layers)); - sd.emplace_back("model.embed_tokens.weight", transformer.word_embeddings.weight); - for (int i = 0; i < config.num_hidden_layers; i++) { - std::string layer_prefix = "model.layers." + std::to_string(i) + '.'; - sd.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); - sd.emplace_back(layer_prefix + "self_attn.W_pack.weight", - transformer.layers[i].attention.query_key_value.weight); - sd.emplace_back(layer_prefix + "self_attn.o_proj.weight", transformer.layers[i].attention.dense.weight); - sd.emplace_back(layer_prefix + "post_attention_layernorm.weight", - transformer.layers[i].post_attention_layernorm.weight); - sd.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight); - sd.emplace_back(layer_prefix + "mlp.down_proj.weight", transformer.layers[i].mlp.down_proj.weight); - sd.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight); - } - sd.emplace_back("model.norm.weight", transformer.final_layernorm.weight); - sd.emplace_back("lm_head.weight", lm_head.weight); - return sd; -} - -// ===== Baichuan-13B ===== - -Baichuan13BForCausalLM::Baichuan13BForCausalLM(const ModelConfig &config) - : BasicModelForCausalLM(config, MEM_SIZE, SCRATCH_SIZE, num_weights(config.num_hidden_layers)) { - state_dict_ = state_dict(); -} - -void Baichuan13BForCausalLM::load(ModelLoader &loader) { - for (auto &item : state_dict_) { - const std::string &name = item.first; - ggml_tensor *tensor = item.second; - loader.read_tensor(name, tensor); - } - - to_device(); - - ctx_.weight_buffer = std::string_view(loader.data, loader.size); - ctx_.init_device_context(); -} - -StateDict Baichuan13BForCausalLM::state_dict() const { - StateDict sd; - sd.reserve(num_weights(config.num_hidden_layers)); - sd.emplace_back("model.embed_tokens.weight", transformer.word_embeddings.weight); - for (int i = 0; i < config.num_hidden_layers; i++) { - std::string layer_prefix = 
"model.layers." + std::to_string(i) + '.'; - sd.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); - sd.emplace_back(layer_prefix + "self_attn.W_pack.weight", - transformer.layers[i].attention.query_key_value.weight); - sd.emplace_back(layer_prefix + "self_attn.o_proj.weight", transformer.layers[i].attention.dense.weight); - sd.emplace_back(layer_prefix + "post_attention_layernorm.weight", - transformer.layers[i].post_attention_layernorm.weight); - sd.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight); - sd.emplace_back(layer_prefix + "mlp.down_proj.weight", transformer.layers[i].mlp.down_proj.weight); - sd.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight); - } - sd.emplace_back("model.norm.weight", transformer.final_layernorm.weight); - sd.emplace_back("lm_head.weight", lm_head.weight); - return sd; -} - -// ===== InternLM ===== - -InternLMTokenizer::InternLMTokenizer(std::string_view serialized_model_proto) { - const auto status = sp.LoadFromSerializedProto(serialized_model_proto); - CHATGLM_CHECK(status.ok()) << status.ToString(); -} - -std::vector InternLMTokenizer::encode(const std::string &text, int max_length) const { - std::vector ids; - sp.Encode(text, &ids); - ids.insert(ids.begin(), {bos_token_id}); // special prefix - if ((int)ids.size() > max_length) { - // sliding window: drop the least recent history while keeping the special prefix - int num_drop = (int)ids.size() - max_length; - ids.erase(ids.begin() + 1, ids.begin() + 1 + num_drop); - } - return ids; -} - -std::string InternLMTokenizer::decode(const std::vector &ids, bool skip_special_tokens) const { - CHATGLM_CHECK(skip_special_tokens) << "unimplemented"; - // filter out special tokens - std::vector normal_ids(ids); - normal_ids.erase(std::remove_if(normal_ids.begin(), normal_ids.end(), [this](int id) { return is_special_id(id); }), - normal_ids.end()); - - std::string text; - sp.Decode(normal_ids, &text); - // remove and its following - size_t eoa_pos = text.find(""); - if (eoa_pos != std::string::npos) { - text.erase(eoa_pos); - } - return text; -} - -std::vector InternLMTokenizer::apply_chat_template(const std::vector &messages, - int max_length) const { - std::string prompt = build_prompt(messages); - std::vector input_ids = encode(prompt, max_length); - return input_ids; -} - -std::string InternLMTokenizer::build_prompt(const std::vector &messages) { - check_chat_messages(messages); - std::vector user_assistant_messages = filter_user_assistant_messages(messages); - - std::ostringstream oss_prompt; - for (const auto &msg : user_assistant_messages) { - if (msg.role == ChatMessage::ROLE_USER) { - oss_prompt << "<|User|>:" << msg.content << "\n<|Bot|>:"; - } else { - oss_prompt << msg.content << "\n"; - } - } - return oss_prompt.str(); -} - -InternLMForCausalLM::InternLMForCausalLM(const ModelConfig &config) - : BasicModelForCausalLM(config, MEM_SIZE, SCRATCH_SIZE, num_weights(config.num_hidden_layers, config.hidden_size)) { - state_dict_ = state_dict(); -} - -void InternLMForCausalLM::load(ModelLoader &loader) { - for (auto &item : state_dict_) { - const std::string &name = item.first; - ggml_tensor *tensor = item.second; - loader.read_tensor(name, tensor); - } - - to_device(); - - ctx_.weight_buffer = std::string_view(loader.data, loader.size); - ctx_.init_device_context(); -} - -StateDict InternLMForCausalLM::state_dict() const { - StateDict sd; - 
sd.reserve(num_weights(config.num_hidden_layers, config.hidden_size)); - sd.emplace_back("model.embed_tokens.weight", transformer.word_embeddings.weight); - for (int i = 0; i < config.num_hidden_layers; i++) { - std::string layer_prefix = "model.layers." + std::to_string(i) + '.'; - sd.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight); - sd.emplace_back(layer_prefix + "self_attn.qkv_proj.weight", - transformer.layers[i].attention.query_key_value.weight); - if (transformer.layers[i].attention.query_key_value.bias) { - sd.emplace_back(layer_prefix + "self_attn.qkv_proj.bias", - transformer.layers[i].attention.query_key_value.bias); - } - sd.emplace_back(layer_prefix + "self_attn.o_proj.weight", transformer.layers[i].attention.dense.weight); - if (transformer.layers[i].attention.dense.bias) { - sd.emplace_back(layer_prefix + "self_attn.o_proj.bias", transformer.layers[i].attention.dense.bias); - } - sd.emplace_back(layer_prefix + "post_attention_layernorm.weight", - transformer.layers[i].post_attention_layernorm.weight); - sd.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight); - sd.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight); - sd.emplace_back(layer_prefix + "mlp.down_proj.weight", transformer.layers[i].mlp.down_proj.weight); - } - sd.emplace_back("model.norm.weight", transformer.final_layernorm.weight); - sd.emplace_back("lm_head.weight", lm_head.weight); - return sd; -} - // ===== ChatGLM4-9B ===== TiktokenCoreBPE::TiktokenCoreBPE(std::unordered_map encoder, @@ -1994,8 +1737,8 @@ Pipeline::Pipeline(const std::string &path, int max_length) { } }; - mapped_file = std::make_unique(path); - ModelLoader loader(mapped_file->data, mapped_file->size); + mapped_file_ = std::make_unique(path); + ModelLoader loader(mapped_file_->data, mapped_file_->size); // load magic std::string magic = loader.read_string(4); @@ -2009,11 +1752,9 @@ Pipeline::Pipeline(const std::string &path, int max_length) { // load config ModelConfig config; if (version == 1) { - config = ModelConfig(model_type, loader.read_basic(), 1e-5f, ActivationType::GELU, true, - true, true, false, RopeType::CHATGLM, 10000.f, -1, AttentionMaskType::CHATGLM, 0); + config = ModelConfig(model_type, loader.read_basic(), 1e-5f, 10000.f, 0); } else if (version == 2) { - config = ModelConfig(model_type, loader.read_basic(), ActivationType::GELU, true, true, - true, false, RopeType::CHATGLM, -1, AttentionMaskType::CHATGLM); + config = ModelConfig(model_type, loader.read_basic()); } else { CHATGLM_THROW << "only support version 1 or 2 for now but got " << version; } @@ -2021,22 +1762,21 @@ Pipeline::Pipeline(const std::string &path, int max_length) { // load tokenizer int proto_size = loader.read_basic(); - std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size); + std::string_view serialized_model_proto((char *)mapped_file_->data + loader.tell(), proto_size); loader.seek(proto_size, SEEK_CUR); tokenizer = std::make_unique(serialized_model_proto); // load model model = std::make_unique(config); - model->load(loader); + StateDict sd = loader.read_state_dict(); + model->load_state_dict(sd); } else if (model_type == ModelType::CHATGLM2 || model_type == ModelType::CHATGLM3) { // load config ModelConfig config; if (version == 1) { - config = ModelConfig(model_type, loader.read_basic(), 1e-5f, ActivationType::SILU, true, - false, false, false, RopeType::GPTJ, 10000.f, 2, 
AttentionMaskType::CAUSAL, 0); + config = ModelConfig(model_type, loader.read_basic(), 1e-5f, 10000.f, 0); } else if (version == 2) { - config = ModelConfig(model_type, loader.read_basic(), ActivationType::SILU, true, false, - false, false, RopeType::GPTJ, 2, AttentionMaskType::CAUSAL); + config = ModelConfig(model_type, loader.read_basic()); } else { CHATGLM_THROW << "only support version 1 or 2 for now but got " << version; } @@ -2044,7 +1784,7 @@ Pipeline::Pipeline(const std::string &path, int max_length) { // load tokenizer int proto_size = loader.read_basic(); - std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size); + std::string_view serialized_model_proto((char *)mapped_file_->data + loader.tell(), proto_size); loader.seek(proto_size, SEEK_CUR); if (model_type == ModelType::CHATGLM2) { @@ -2059,12 +1799,12 @@ Pipeline::Pipeline(const std::string &path, int max_length) { } // load model - model->load(loader); + StateDict sd = loader.read_state_dict(); + model->load_state_dict(sd); } else if (model_type == ModelType::CHATGLM4) { // load config CHATGLM_CHECK(version == 2) << "only support version 2 for now but got " << version; - ModelConfig config(model_type, loader.read_basic(), ActivationType::SILU, true, false, false, - false, RopeType::GPTJ, 2, AttentionMaskType::CAUSAL); + ModelConfig config(model_type, loader.read_basic()); _update_config_max_length(config, max_length); // load tokenizer @@ -2076,71 +1816,8 @@ Pipeline::Pipeline(const std::string &path, int max_length) { // load model model = std::make_unique(config); - model->load(loader); - } else if (model_type == ModelType::BAICHUAN7B) { - std::cerr << "[WARN] Baichuan models are deprecated in favor of llama.cpp, and will be removed in next major " - "version of chatglm.cpp\n"; - CHATGLM_CHECK(version == 1) << "only support version 1 for now but got " << version; - - // load config - ModelConfig config(model_type, loader.read_basic(), 1e-6f, ActivationType::SILU, false, false, - false, false, RopeType::NEOX, 10000.f, 1, AttentionMaskType::CAUSAL, 0); - _update_config_max_length(config, max_length); - - // load tokenizer - int proto_size = loader.read_basic(); - std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size); - loader.seek(proto_size, SEEK_CUR); - tokenizer = std::make_unique(serialized_model_proto); - - // load model - model = std::make_unique(config); - model->load(loader); - } else if (model_type == ModelType::BAICHUAN13B) { - std::cerr << "[WARN] Baichuan models are deprecated in favor of llama.cpp, and will be removed in next major " - "version of chatglm.cpp\n"; - CHATGLM_CHECK(version == 1) << "only support version 1 for now but got " << version; - - // load config - ModelConfig config(model_type, loader.read_basic(), 1e-6f, ActivationType::SILU, false, false, - false, true, RopeType::DISABLED, 10000.f, -1, AttentionMaskType::CAUSAL, 0); - _update_config_max_length(config, max_length); - - // load tokenizer - int proto_size = loader.read_basic(); - std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size); - loader.seek(proto_size, SEEK_CUR); - tokenizer = std::make_unique(serialized_model_proto); - - // load model - model = std::make_unique(config); - model->load(loader); - } else if (model_type == ModelType::INTERNLM) { - std::cerr << "[WARN] InternLM models are deprecated in favor of llama.cpp, and will be removed in next major " - "version of chatglm.cpp\n"; - CHATGLM_CHECK(version == 1) << 
"only support version 1 for now but got " << version; - - // load config - auto rec = loader.read_basic(); - ModelConfig config; - if (rec.hidden_size == 4096) { - config = ModelConfig(model_type, rec, 1e-6f, ActivationType::SILU, true, true, false, false, RopeType::NEOX, - 10000.f, 1, AttentionMaskType::CAUSAL, 0); - } else { - config = ModelConfig(model_type, rec, 1e-6f, ActivationType::SILU, false, false, false, false, - RopeType::NEOX, 10000.f, 1, AttentionMaskType::CAUSAL, 0); - } - _update_config_max_length(config, max_length); - - // load tokenizer - int proto_size = loader.read_basic(); - std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size); - loader.seek(proto_size, SEEK_CUR); - tokenizer = std::make_unique(serialized_model_proto); - - // load model - model = std::make_unique(config); - model->load(loader); + StateDict sd = loader.read_state_dict(); + model->load_state_dict(sd); } else { CHATGLM_THROW << "invalid model type " << (int)model_type; } diff --git a/chatglm.h b/chatglm.h index b50b3340..e0836efb 100644 --- a/chatglm.h +++ b/chatglm.h @@ -2,22 +2,21 @@ #include #include +#include #include #include #include #include #include -#ifdef GGML_USE_METAL -#include -#endif +// #ifdef GGML_USE_METAL +// #include +// #endif namespace chatglm { // ===== common ===== -static constexpr size_t MB = 1024 * 1024; - class LogMessageFatal { public: LogMessageFatal(const char *file, int line) { oss_ << file << ':' << line << ' '; } @@ -41,20 +40,11 @@ class LogMessageFatal { std::string to_string(ggml_tensor *tensor, bool with_data = true); -ggml_tensor *tensor_assign_buffers(ggml_tensor *tensor); - -ggml_tensor *tensor_to_device(ggml_tensor *tensor); - -ggml_tensor *tensor_to_cpu(ggml_tensor *tensor); - enum class ModelType { CHATGLM = 1, CHATGLM2 = 2, CHATGLM3 = 3, CHATGLM4 = 4, - BAICHUAN7B = 1024, - BAICHUAN13B = 1025, - INTERNLM = 1280, }; std::string to_string(ModelType model_type); @@ -123,46 +113,53 @@ class ModelConfig { ModelConfig() = default; ModelConfig(ModelType model_type, ggml_type dtype, int vocab_size, int hidden_size, int num_attention_heads, - int num_kv_heads, int num_hidden_layers, int intermediate_size, float norm_eps, - ActivationType hidden_act, bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, bool use_alibi, - RopeType rope_type, float rope_theta, int rope_dim_scale, AttentionMaskType attn_mask_type, + int num_kv_heads, int num_hidden_layers, int intermediate_size, float norm_eps, float rope_theta, int num_virtual_tokens, int max_length, int bos_token_id, int eos_token_id, int pad_token_id, int sep_token_id, std::vector extra_eos_token_ids) : model_type(model_type), dtype(dtype), vocab_size(vocab_size), hidden_size(hidden_size), num_attention_heads(num_attention_heads), num_kv_heads(num_kv_heads), num_hidden_layers(num_hidden_layers), - intermediate_size(intermediate_size), norm_eps(norm_eps), hidden_act(hidden_act), use_qkv_bias(use_qkv_bias), - use_dense_bias(use_dense_bias), interleaved_qkv(interleaved_qkv), use_alibi(use_alibi), rope_type(rope_type), - rope_theta(rope_theta), rope_dim_scale(rope_dim_scale), attn_mask_type(attn_mask_type), + intermediate_size(intermediate_size), norm_eps(norm_eps), rope_theta(rope_theta), num_virtual_tokens(num_virtual_tokens), max_length(max_length), bos_token_id(bos_token_id), eos_token_id(eos_token_id), pad_token_id(pad_token_id), sep_token_id(sep_token_id), - extra_eos_token_ids(std::move(extra_eos_token_ids)) {} + extra_eos_token_ids(std::move(extra_eos_token_ids)) { + 
if (model_type == ModelType::CHATGLM) { + hidden_act = ActivationType::GELU; + use_qkv_bias = true; + use_dense_bias = true; + interleaved_qkv = true; + tie_word_embeddings = true; + rope_type = RopeType::CHATGLM; + attn_mask_type = AttentionMaskType::CHATGLM; + } else { + hidden_act = ActivationType::SILU; + use_qkv_bias = true; + use_dense_bias = false; + interleaved_qkv = false; + tie_word_embeddings = false; + rope_type = RopeType::CHATGLM2; + attn_mask_type = AttentionMaskType::CAUSAL; + } + } - ModelConfig(ModelType model_type, const ConfigRecordV1 &rec, float norm_eps, ActivationType hidden_act, - bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, bool use_alibi, RopeType rope_type, - float rope_theta, int rope_dim_scale, AttentionMaskType attn_mask_type, int num_virtual_tokens) + ModelConfig(ModelType model_type, const ConfigRecordV1 &rec, float norm_eps, float rope_theta, + int num_virtual_tokens) : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, - rec.num_attention_heads, rec.num_hidden_layers, rec.intermediate_size, norm_eps, hidden_act, - use_qkv_bias, use_dense_bias, interleaved_qkv, use_alibi, rope_type, rope_theta, rope_dim_scale, - attn_mask_type, num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, - rec.pad_token_id, rec.sep_token_id, {}) {} + rec.num_attention_heads, rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta, + num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, + rec.sep_token_id, {}) {} ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, ActivationType hidden_act, - bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, bool use_alibi, RopeType rope_type, - float rope_theta, int rope_dim_scale, AttentionMaskType attn_mask_type, int num_virtual_tokens) + bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta, + AttentionMaskType attn_mask_type, int num_virtual_tokens) : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, rec.num_kv_heads, - rec.num_hidden_layers, rec.intermediate_size, norm_eps, hidden_act, use_qkv_bias, use_dense_bias, - interleaved_qkv, use_alibi, rope_type, rope_theta, rope_dim_scale, attn_mask_type, - num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, - rec.sep_token_id, {}) {} + rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta, num_virtual_tokens, + rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {} - ModelConfig(ModelType model_type, const ConfigRecordV2 &rec, ActivationType hidden_act, bool use_qkv_bias, - bool use_dense_bias, bool interleaved_qkv, bool use_alibi, RopeType rope_type, int rope_dim_scale, - AttentionMaskType attn_mask_type) + ModelConfig(ModelType model_type, const ConfigRecordV2 &rec) : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, - rec.num_key_value_heads, rec.num_hidden_layers, rec.intermediate_size, rec.norm_eps, hidden_act, - use_qkv_bias, use_dense_bias, interleaved_qkv, use_alibi, rope_type, rec.rope_theta, - rope_dim_scale, attn_mask_type, rec.num_virtual_tokens, rec.max_length, -1, rec.eos_token_id, - rec.pad_token_id, -1, {}) {} + rec.num_key_value_heads, rec.num_hidden_layers, rec.intermediate_size, rec.norm_eps, + rec.rope_theta, rec.num_virtual_tokens, rec.max_length, -1, rec.eos_token_id, rec.pad_token_id, + -1, {}) {} 
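// [annotation — not part of the patch] A test-style sketch of the per-family defaults the new
// ModelConfig constructor above derives from model_type. It assumes the patched chatglm.h compiles
// standalone; the hyperparameters are toy values and only the derived fields are checked.
#include "chatglm.h"

#include <cassert>

using namespace chatglm;

int main() {
    // ChatGLM-6B family: GELU MLP, biased dense, interleaved QKV, tied embeddings, 2D RoPE + prefix mask.
    ModelConfig glm(ModelType::CHATGLM, GGML_TYPE_F16, /*vocab*/ 100, /*hidden*/ 32,
                    /*heads*/ 4, /*kv_heads*/ 4, /*layers*/ 2, /*inter*/ 64,
                    /*norm_eps*/ 1e-5f, /*rope_theta*/ 10000.f, /*virtual_tokens*/ 0,
                    /*max_length*/ 128, /*bos*/ 1, /*eos*/ 2, /*pad*/ 0, /*sep*/ -1, {});
    assert(glm.hidden_act == ActivationType::GELU);
    assert(glm.use_dense_bias && glm.interleaved_qkv && glm.tie_word_embeddings);
    assert(glm.rope_type == RopeType::CHATGLM && glm.attn_mask_type == AttentionMaskType::CHATGLM);

    // ChatGLM2 and later: SiLU GLU, no dense bias, untied embeddings, causal mask.
    ModelConfig glm2(ModelType::CHATGLM2, GGML_TYPE_F16, 100, 32, 4, 2, 2, 64,
                     1e-5f, 10000.f, 0, 128, 1, 2, 0, -1, {});
    assert(glm2.hidden_act == ActivationType::SILU);
    assert(glm2.use_qkv_bias && !glm2.use_dense_bias && !glm2.tie_word_embeddings);
    assert(glm2.rope_type == RopeType::CHATGLM2 && glm2.attn_mask_type == AttentionMaskType::CAUSAL);
    return 0;
}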
std::string model_type_name() const { return to_string(model_type); } @@ -180,10 +177,9 @@ class ModelConfig { bool use_qkv_bias; bool use_dense_bias; bool interleaved_qkv; - bool use_alibi; + bool tie_word_embeddings; RopeType rope_type; float rope_theta; - int rope_dim_scale; AttentionMaskType attn_mask_type; int num_virtual_tokens; int max_length; @@ -286,10 +282,28 @@ struct ggml_context_deleter_t { using unique_ggml_context_t = std::unique_ptr; -static inline unique_ggml_context_t make_unique_ggml_context(size_t mem_size, void *mem_buffer, bool no_alloc) { +inline unique_ggml_context_t make_unique_ggml_context(size_t mem_size, void *mem_buffer, bool no_alloc) { return unique_ggml_context_t(ggml_init({mem_size, mem_buffer, no_alloc})); } +struct ggml_gallocr_deleter_t { + void operator()(ggml_gallocr *galloc) const noexcept { ggml_gallocr_free(galloc); } +}; + +using unique_ggml_gallocr_t = std::unique_ptr; + +struct ggml_backend_deleter_t { + void operator()(ggml_backend_t backend) const noexcept { ggml_backend_free(backend); } +}; + +using unique_ggml_backend_t = std::unique_ptr; + +struct ggml_backend_buffer_deleter_t { + void operator()(ggml_backend_buffer_t buffer) const noexcept { ggml_backend_buffer_free(buffer); } +}; + +using unique_ggml_backend_buffer_t = std::unique_ptr; + #ifdef GGML_USE_METAL struct ggml_metal_context_deleter_t { void operator()(ggml_metal_context *ctx) const noexcept { ggml_metal_free(ctx); } @@ -302,39 +316,48 @@ static inline unique_ggml_metal_context_t make_unique_ggml_metal_context(int n_c } #endif -// reference: https://stackoverflow.com/questions/11149665/c-vector-that-doesnt-initialize-its-members -struct uninitialized_char { - char m; - uninitialized_char() {} +// reference: https://github.com/ggerganov/llama.cpp/blob/master/llama.cpp +template +struct no_init { + T value; + no_init() { /* do nothing */ + } }; -void ggml_graph_compute_helper(std::vector &buf, ggml_cgraph *graph, int n_threads); - struct ModelContext { ggml_type dtype; + + std::vector> compute_meta; + unique_ggml_context_t ctx_w; // weight unique_ggml_context_t ctx_kv; // kv cache unique_ggml_context_t ctx_b; // buffer -#ifdef GGML_USE_METAL - unique_ggml_metal_context_t ctx_metal; -#endif - ggml_cgraph gf; - ggml_scratch scratch; - std::vector compute_buffer; // BLAS buffer - std::vector scratch_buffer; // intermediate tensor buffer - std::string_view weight_buffer; // mapped weight - std::vector work_buffer; // temporary buffer for graph computing - - void init_device_context(); + + ggml_cgraph *gf; + unique_ggml_backend_t backend; + unique_ggml_gallocr_t allocr; + + unique_ggml_backend_buffer_t buf_w; + unique_ggml_backend_buffer_t buf_kv; + + // #ifdef GGML_USE_METAL + // // unique_ggml_metal_context_t ctx_metal; + // #endif + + // std::string_view weight_buffer; // mapped weight + + ModelContext(ggml_type dtype); + + // void init_device_context(); }; class Embedding { public: Embedding() : weight(nullptr) {} - Embedding(ModelContext *ctx, int num_embeddings, int embedding_dim) - : weight(ggml_new_tensor_2d(ctx->ctx_w.get(), ctx->dtype, embedding_dim, num_embeddings)) {} + Embedding(ModelContext *mctx, int num_embeddings, int embedding_dim) + : weight(ggml_new_tensor_2d(mctx->ctx_w.get(), mctx->dtype, embedding_dim, num_embeddings)) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input) const; public: ggml_tensor *weight; @@ -343,14 +366,14 @@ class Embedding { class Linear { public: Linear() : 
weight(nullptr), bias(nullptr) {} - Linear(ModelContext *ctx, int in_features, int out_features, bool use_bias = true) - : weight(ggml_new_tensor_2d(ctx->ctx_w.get(), ctx->dtype, in_features, out_features)), - bias(use_bias ? ggml_new_tensor_1d(ctx->ctx_w.get(), GGML_TYPE_F32, out_features) : nullptr) {} + Linear(ModelContext *mctx, int in_features, int out_features, bool use_bias = true) + : weight(ggml_new_tensor_2d(mctx->ctx_w.get(), mctx->dtype, in_features, out_features)), + bias(use_bias ? ggml_new_tensor_1d(mctx->ctx_w.get(), GGML_TYPE_F32, out_features) : nullptr) {} int in_features() const { return weight->ne[0]; } int out_features() const { return weight->ne[1]; } - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input) const; public: ggml_tensor *weight; // [out_features, in_features] @@ -360,11 +383,11 @@ class Linear { class LayerNorm { public: LayerNorm() = default; - LayerNorm(ModelContext *ctx, int normalized_shape, bool inplace = true, float eps = 1e-5f) - : weight(ggml_new_tensor_1d(ctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), - bias(ggml_new_tensor_1d(ctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), inplace(inplace), eps(eps) {} + LayerNorm(ModelContext *mctx, int normalized_shape, bool inplace = true, float eps = 1e-5f) + : weight(ggml_new_tensor_1d(mctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), + bias(ggml_new_tensor_1d(mctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), inplace(inplace), eps(eps) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input) const; public: ggml_tensor *weight; // [normalized_shape] @@ -376,10 +399,10 @@ class LayerNorm { class RMSNorm { public: RMSNorm() = default; - RMSNorm(ModelContext *ctx, int normalized_shape, bool inplace = true, float eps = 1e-5f) - : weight(ggml_new_tensor_1d(ctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), inplace(inplace), eps(eps) {} + RMSNorm(ModelContext *mctx, int normalized_shape, bool inplace = true, float eps = 1e-5f) + : weight(ggml_new_tensor_1d(mctx->ctx_w.get(), GGML_TYPE_F32, normalized_shape)), inplace(inplace), eps(eps) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input) const; public: ggml_tensor *weight; // [normalized_shape] @@ -390,11 +413,11 @@ class RMSNorm { class BasicMLP { public: BasicMLP() = default; - BasicMLP(ModelContext *ctx, int hidden_size, int intermediate_size, ActivationType hidden_act) - : dense_h_to_4h(ctx, hidden_size, intermediate_size), dense_4h_to_h(ctx, intermediate_size, hidden_size), + BasicMLP(ModelContext *mctx, int hidden_size, int intermediate_size, ActivationType hidden_act) + : dense_h_to_4h(mctx, hidden_size, intermediate_size), dense_4h_to_h(mctx, intermediate_size, hidden_size), hidden_act(hidden_act) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *hidden_states) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states) const; public: Linear dense_h_to_4h; @@ -405,11 +428,11 @@ class BasicMLP { class BasicGLU { public: BasicGLU() = default; - BasicGLU(ModelContext *ctx, int hidden_size, int intermediate_size, ActivationType hidden_act) - : gate_proj(ctx, hidden_size, intermediate_size, false), up_proj(ctx, hidden_size, intermediate_size, false), - down_proj(ctx, intermediate_size, hidden_size, false), hidden_act(hidden_act) {} + BasicGLU(ModelContext *mctx, int hidden_size, int 
intermediate_size, ActivationType hidden_act) + : gate_proj(mctx, hidden_size, intermediate_size, false), up_proj(mctx, hidden_size, intermediate_size, false), + down_proj(mctx, intermediate_size, hidden_size, false), hidden_act(hidden_act) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *hidden_states) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states) const; public: Linear gate_proj; @@ -421,31 +444,29 @@ class BasicGLU { class BasicAttention { public: BasicAttention() = default; - BasicAttention(ModelContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length, - bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, bool use_alibi, RopeType rope_type, - float rope_theta, int rope_dim_scale, AttentionMaskType attn_mask_type, int num_virtual_tokens) + BasicAttention(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length, + bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta, + AttentionMaskType attn_mask_type, int num_virtual_tokens) : num_attention_heads(num_attention_heads), num_kv_heads(num_kv_heads), interleaved_qkv(interleaved_qkv), - use_alibi(use_alibi), rope_type(rope_type), rope_theta(rope_theta), rope_dim_scale(rope_dim_scale), - attn_mask_type(attn_mask_type), num_virtual_tokens(num_virtual_tokens), - query_key_value(ctx, hidden_size, hidden_size + 2 * (hidden_size / num_attention_heads) * num_kv_heads, + rope_type(rope_type), rope_theta(rope_theta), attn_mask_type(attn_mask_type), + num_virtual_tokens(num_virtual_tokens), + query_key_value(mctx, hidden_size, hidden_size + 2 * (hidden_size / num_attention_heads) * num_kv_heads, use_qkv_bias), - dense(ctx, hidden_size, hidden_size, use_dense_bias), - k_cache(ggml_new_tensor_3d(ctx->ctx_kv.get(), GGML_TYPE_F16, hidden_size / num_attention_heads, + dense(mctx, hidden_size, hidden_size, use_dense_bias), + k_cache(ggml_new_tensor_3d(mctx->ctx_kv.get(), GGML_TYPE_F16, hidden_size / num_attention_heads, max_length + num_virtual_tokens, num_kv_heads)), - v_cache(ggml_new_tensor_3d(ctx->ctx_kv.get(), GGML_TYPE_F16, max_length + num_virtual_tokens, + v_cache(ggml_new_tensor_3d(mctx->ctx_kv.get(), GGML_TYPE_F16, max_length + num_virtual_tokens, hidden_size / num_attention_heads, num_kv_heads)) {} - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *position_ids, int n_past, - int n_ctx) const; + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, + ggml_tensor *position_ids, int n_past, int n_ctx) const; public: int num_attention_heads; int num_kv_heads; bool interleaved_qkv; - bool use_alibi; RopeType rope_type; float rope_theta; - int rope_dim_scale; AttentionMaskType attn_mask_type; int num_virtual_tokens; Linear query_key_value; @@ -454,77 +475,76 @@ class BasicAttention { ggml_tensor *v_cache; // [#kvh, d, s] }; -template +template class BasicBlock { public: BasicBlock() = default; - BasicBlock(ModelContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size, + BasicBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size, int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias, bool use_dense_bias, - bool interleaved_qkv, bool use_alibi, RopeType rope_type, float rope_theta, int rope_dim_scale, - AttentionMaskType attn_mask_type, int num_virtual_tokens) - : input_layernorm(ctx, hidden_size, 
false, norm_eps), - attention(ctx, hidden_size, num_attention_heads, num_kv_heads, max_length, use_qkv_bias, use_dense_bias, - interleaved_qkv, use_alibi, rope_type, rope_theta, rope_dim_scale, attn_mask_type, - num_virtual_tokens), - post_attention_layernorm(ctx, hidden_size, false, norm_eps), - mlp(ctx, hidden_size, intermediate_size, hidden_act) {} - - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *position_ids, int n_past, + bool interleaved_qkv, RopeType rope_type, float rope_theta, AttentionMaskType attn_mask_type, + int num_virtual_tokens) + : input_layernorm(mctx, hidden_size, false, norm_eps), + attention(mctx, hidden_size, num_attention_heads, num_kv_heads, max_length, use_qkv_bias, use_dense_bias, + interleaved_qkv, rope_type, rope_theta, attn_mask_type, num_virtual_tokens), + post_attention_layernorm(mctx, hidden_size, false, norm_eps), + mlp(mctx, hidden_size, intermediate_size, hidden_act) {} + + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, + ggml_tensor *position_ids, int n_past, + int n_ctx) const { - ggml_context *gctx = ctx->ctx_b.get(); + ggml_context *ctx = mctx->ctx_b.get(); ggml_tensor *residual = hidden_states; - hidden_states = input_layernorm.forward(ctx, hidden_states); - hidden_states = attention.forward(ctx, hidden_states, position_ids, n_past, n_ctx); - hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual)); + hidden_states = input_layernorm.forward(mctx, hidden_states); + hidden_states = attention.forward(mctx, hidden_states, attention_mask, position_ids, n_past, n_ctx); + hidden_states = ggml_add_inplace(ctx, hidden_states, residual); residual = hidden_states; - hidden_states = post_attention_layernorm.forward(ctx, hidden_states); - hidden_states = mlp.forward(ctx, hidden_states); - hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual)); + hidden_states = post_attention_layernorm.forward(mctx, hidden_states); + hidden_states = mlp.forward(mctx, hidden_states); + hidden_states = ggml_add_inplace(ctx, hidden_states, residual); return hidden_states; } protected: - BasicBlock(Norm input_layernorm, Attention attention, Norm post_attention_layernorm, MLP mlp) + BasicBlock(Norm input_layernorm, BasicAttention attention, Norm post_attention_layernorm, MLP mlp) : input_layernorm(input_layernorm), attention(attention), post_attention_layernorm(post_attention_layernorm), mlp(mlp) {} public: Norm input_layernorm; - Attention attention; + BasicAttention attention; Norm post_attention_layernorm; MLP mlp; }; -struct NoopPositionIdsGenerator { - ggml_tensor *operator()(ggml_context *ctx, int qlen, int n_past, int n_ctx) const { return nullptr; } +struct NoopPositionIdsAllocator { + ggml_tensor *operator()(ggml_context *ctx, int qlen) const { return nullptr; } }; -struct BasicPositionIdsGenerator { - ggml_tensor *operator()(ggml_context *ctx, int qlen, int n_past, int n_ctx) const { - ggml_tensor *position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen); - for (int i = 0; i < qlen; i++) { - ((int *)position_ids->data)[i] = n_past + i; - } - return position_ids; +struct BasicPositionIdsAllocator { + ggml_tensor *operator()(ggml_context *ctx, int qlen) const { return ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen); } +}; + +struct GLMPositionIdsAllocator { + ggml_tensor *operator()(ggml_context *ctx, int qlen) const { + return ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen * 2); } }; -struct GLMPositionIdsGenerator { - ggml_tensor 
*operator()(ggml_context *ctx, int qlen, int n_past, int n_ctx) const { - ggml_tensor *position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen * 2); - for (int i = 0; i < qlen; i++) { - const int p = n_past + i; - ((int *)position_ids->data)[i] = std::min(p, n_ctx - 2); - ((int *)position_ids->data)[qlen + i] = std::max(p - (n_ctx - 2), 0); - } - return position_ids; +struct NoopAttentionMaskAllocator { + ggml_tensor *operator()(ggml_context *ctx, int qlen, int kvlen) const { return nullptr; } +}; + +struct BasicAttentionMaskAllocator { + ggml_tensor *operator()(ggml_context *ctx, int qlen, int kvlen) const { + return ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kvlen, qlen); } }; -template +template class BasicModel { public: BasicModel() = default; @@ -532,69 +552,86 @@ class BasicModel { BasicModel(Embedding word_embeddings, std::vector layers, Norm final_layernorm) : word_embeddings(word_embeddings), layers(std::move(layers)), final_layernorm(final_layernorm) {} - BasicModel(ModelContext *ctx, const ModelConfig &config) - : word_embeddings(ctx, config.vocab_size, config.hidden_size), layers(build_layers(ctx, config)), - final_layernorm(ctx, config.hidden_size) {} + BasicModel(ModelContext *mctx, const ModelConfig &config) + : word_embeddings(mctx, config.vocab_size, config.hidden_size), layers(build_layers(mctx, config)), + final_layernorm(mctx, config.hidden_size) {} + + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past, int n_ctx) const { + ggml_context *ctx = mctx->ctx_b.get(); - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const { - ggml_context *gctx = ctx->ctx_b.get(); - ggml_tensor *position_ids = pos_ids_gen_(gctx, input_ids->ne[0], n_past, n_ctx); + const int qlen = input_ids->ne[0]; + const int kvlen = layers.front().attention.num_virtual_tokens + n_past + qlen; + + ggml_tensor *position_ids = pos_ids_alloc_(ctx, qlen); if (position_ids) { - tensor_to_device(position_ids); + ggml_set_name(position_ids, "position_ids"); + ggml_set_input(position_ids); } - ggml_tensor *hidden_states = word_embeddings.forward(ctx, input_ids); - for (const auto &layer : layers) { - ggml_set_scratch(gctx, ctx->scratch); - hidden_states = layer.forward(ctx, hidden_states, position_ids, n_past, n_ctx); + + ggml_tensor *attention_mask = attn_mask_alloc_(ctx, qlen, kvlen); + if (attention_mask) { + ggml_set_name(attention_mask, "attention_mask"); + ggml_set_input(attention_mask); } - if (position_ids) { - tensor_to_cpu(position_ids); + + ggml_tensor *hidden_states = word_embeddings.forward(mctx, input_ids); + for (const auto &layer : layers) { + hidden_states = layer.forward(mctx, hidden_states, attention_mask, position_ids, n_past, n_ctx); } - ggml_scratch empty_scratch = {0, 0, nullptr}; - ggml_set_scratch(gctx, empty_scratch); - hidden_states = final_layernorm.forward(ctx, hidden_states); + + hidden_states = final_layernorm.forward(mctx, hidden_states); return hidden_states; } void load_prefix_cache(const ModelConfig &config, ggml_tensor *past_key_values) { - ggml_cgraph gf{}; - auto ctx = make_unique_ggml_context(config.num_hidden_layers * 7 * ggml_tensor_overhead(), nullptr, false); + // past_key_values: [l * 2, #h, v, d] + ModelContext mctx(config.dtype); + + ggml_tensor *backend_past_key_values = ggml_new_tensor(mctx.ctx_kv.get(), past_key_values->type, + ggml_n_dims(past_key_values), past_key_values->ne); + auto buf_kv = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx.ctx_kv.get(), mctx.backend.get())); + 
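// [annotation — not part of the patch] A minimal, self-contained sketch of the deferred-input
// pattern the forward pass above relies on with the new ggml backend API: inputs are created in a
// no_alloc context, tagged with ggml_set_input/ggml_set_name, allocated by ggml_gallocr, and only
// then filled by name via ggml_backend_tensor_set (as the set_graph_inputs overloads do).
// Function names follow the ggml revision vendored by this PR; include paths and exact signatures
// are assumptions of this sketch, not something this repository ships.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <cassert>
#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    // Metadata-only context: tensor data lives in backend buffers, not here.
    std::vector<char> meta(ggml_tensor_overhead() * 8 + ggml_graph_overhead());
    ggml_context *ctx = ggml_init({meta.size(), meta.data(), /*no_alloc=*/true});

    // Declare a named graph input, analogous to "position_ids" above.
    ggml_tensor *inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_name(inp, "inp");
    ggml_set_input(inp);

    ggml_cgraph *gf = ggml_new_graph(ctx);
    ggml_tensor *out = ggml_scale(ctx, inp, 2.0f);
    ggml_build_forward_expand(gf, out);

    // Allocate the whole graph, then fill the input by name.
    assert(ggml_gallocr_alloc_graph(galloc, gf));
    const float data[4] = {0.f, 1.f, 2.f, 3.f};
    ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "inp"), data, 0, sizeof(data));

    assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS);

    float result[4];
    ggml_backend_tensor_get(out, result, 0, sizeof(result));
    assert(result[3] == 6.f);

    ggml_free(ctx);
    ggml_gallocr_free(galloc);
    ggml_backend_free(backend);
    return 0;
}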
ggml_backend_tensor_set(backend_past_key_values, past_key_values->data, 0, ggml_nbytes(past_key_values)); + past_key_values = backend_past_key_values; + const int head_size = config.hidden_size / config.num_attention_heads; for (size_t i = 0; i < layers.size(); i++) { auto &attn = layers[i].attention; - ggml_tensor *virtual_key = ggml_view_3d(ctx.get(), past_key_values, head_size, config.num_virtual_tokens, - config.num_kv_heads, past_key_values->nb[1], past_key_values->nb[2], - i * 2 * past_key_values->nb[3]); // [#h, v, d] + ggml_tensor *virtual_key = + ggml_view_3d(mctx.ctx_b.get(), past_key_values, head_size, config.num_virtual_tokens, + config.num_kv_heads, past_key_values->nb[1], past_key_values->nb[2], + i * 2 * past_key_values->nb[3]); // [#h, v, d] ggml_tensor *k_cache_view = - ggml_view_3d(ctx.get(), attn.k_cache, head_size, config.num_virtual_tokens, config.num_kv_heads, + ggml_view_3d(mctx.ctx_b.get(), attn.k_cache, head_size, config.num_virtual_tokens, config.num_kv_heads, attn.k_cache->nb[1], attn.k_cache->nb[2], 0); // [#h, v, d] - ggml_build_forward_expand(&gf, ggml_cpy(ctx.get(), virtual_key, k_cache_view)); + ggml_build_forward_expand(mctx.gf, ggml_cpy(mctx.ctx_b.get(), virtual_key, k_cache_view)); ggml_tensor *virtual_value = ggml_view_3d( - ctx.get(), past_key_values, head_size, config.num_virtual_tokens, config.num_kv_heads, + mctx.ctx_b.get(), past_key_values, head_size, config.num_virtual_tokens, config.num_kv_heads, past_key_values->nb[1], past_key_values->nb[2], (i * 2 + 1) * past_key_values->nb[3]); // [#h, v, d] - virtual_value = ggml_permute(ctx.get(), virtual_value, 1, 0, 2, 3); // [#h, d, v] + virtual_value = ggml_permute(mctx.ctx_b.get(), virtual_value, 1, 0, 2, 3); // [#h, d, v] ggml_tensor *v_cache_view = - ggml_view_3d(ctx.get(), attn.v_cache, config.num_virtual_tokens, head_size, config.num_kv_heads, + ggml_view_3d(mctx.ctx_b.get(), attn.v_cache, config.num_virtual_tokens, head_size, config.num_kv_heads, attn.v_cache->nb[1], attn.v_cache->nb[2], 0); // [#h, d, v] - ggml_build_forward_expand(&gf, ggml_cpy(ctx.get(), virtual_value, v_cache_view)); + ggml_build_forward_expand(mctx.gf, ggml_cpy(mctx.ctx_b.get(), virtual_value, v_cache_view)); } - CHATGLM_CHECK(ggml_used_mem(ctx.get()) == ggml_get_mem_size(ctx.get())) << "corrupted prefix cache context"; - std::vector compute_buffer; - ggml_graph_compute_helper(compute_buffer, &gf, 0); + + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx.allocr.get(), mctx.gf)); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx.backend.get(), mctx.gf) == GGML_STATUS_SUCCESS); } private: - std::vector build_layers(ModelContext *ctx, const ModelConfig &config) { + std::vector build_layers(ModelContext *mctx, const ModelConfig &config) { std::vector layers; layers.reserve(config.num_hidden_layers); for (int layer_id = 0; layer_id < config.num_hidden_layers; layer_id++) { - layers.emplace_back(ctx, config.hidden_size, config.num_attention_heads, config.num_kv_heads, + layers.emplace_back(mctx, config.hidden_size, config.num_attention_heads, config.num_kv_heads, config.intermediate_size, config.max_length, config.norm_eps, config.hidden_act, - config.use_qkv_bias, config.use_dense_bias, config.interleaved_qkv, config.use_alibi, - config.rope_type, config.rope_theta, config.rope_dim_scale, config.attn_mask_type, - config.num_virtual_tokens); + config.use_qkv_bias, config.use_dense_bias, config.interleaved_qkv, config.rope_type, + config.rope_theta, config.attn_mask_type, config.num_virtual_tokens); } + mctx->buf_kv = + 
unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx->ctx_kv.get(), mctx->backend.get())); return layers; } @@ -604,7 +641,8 @@ class BasicModel { Norm final_layernorm; private: - PositionIdsGenerator pos_ids_gen_; + AttentionMaskAllocator attn_mask_alloc_; + PositionIdsAllocator pos_ids_alloc_; }; class BaseStreamer { @@ -680,6 +718,12 @@ class MappedFile { size_t size; }; +struct StateDict { + unique_ggml_context_t ctx; + unique_ggml_backend_buffer_t buf; + std::unordered_map kv; +}; + class ModelLoader { public: ModelLoader(char *data, size_t size) : data(data), size(size), ptr(data) {} @@ -697,13 +741,9 @@ class ModelLoader { std::string read_string(size_t length); - void checked_read_tensor_meta(const std::string &name, int ndim, int64_t *ne, ggml_type dtype); - - void *read_tensor_data(size_t nbytes); - - void read_tensor(const std::string &name, ggml_tensor *tensor); + StateDict read_state_dict(); - public: + private: char *data; size_t size; char *ptr; @@ -720,19 +760,15 @@ struct GenerationConfig { float top_p; float temperature; float repetition_penalty; - int num_threads; GenerationConfig(int max_length = 2048, int max_new_tokens = -1, int max_context_length = 512, bool do_sample = true, int top_k = 0, float top_p = 0.7, float temperature = 0.95, - float repetition_penalty = 1.f, int num_threads = 0) + float repetition_penalty = 1.f) : max_length(max_length), max_new_tokens(max_new_tokens), max_context_length(max_context_length), do_sample(do_sample), top_k(top_k), top_p(top_p), temperature(temperature), - repetition_penalty(repetition_penalty), num_threads(num_threads) {} + repetition_penalty(repetition_penalty) {} }; -int get_num_physical_cores(); -int get_default_num_threads(); - struct TokenIdScore { int id; float score; @@ -750,15 +786,17 @@ struct TokenIdScore { class BaseModelForCausalLM { public: - BaseModelForCausalLM(ModelConfig config, size_t mem_size, size_t scratch_size, size_t num_weights); + BaseModelForCausalLM(ModelConfig config); virtual ~BaseModelForCausalLM() = default; - virtual void load(ModelLoader &loader) = 0; - virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, + virtual void load_state_dict(const StateDict &sd) = 0; + + virtual ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const = 0; - ggml_tensor *forward_graph_compute(const std::vector &input_ids, int n_past, int n_ctx, int n_threads, - bool is_decoding); + virtual void set_graph_inputs(int qlen, int n_past, int n_ctx) const = 0; + + ggml_tensor *forward_graph_compute(const std::vector &input_ids, int n_past, int n_ctx, bool is_decoding); std::vector generate(const std::vector &input_ids, const GenerationConfig &gen_config, BaseStreamer *streamer = nullptr); @@ -776,77 +814,43 @@ class BaseModelForCausalLM { static void sampling_softmax_inplace(TokenIdScore *first, TokenIdScore *last); - protected: - ModelContext ctx_; - public: ModelConfig config; -}; -using StateDict = std::vector>; + protected: + std::unique_ptr mctx_; +}; template class BasicModelForCausalLM : public BaseModelForCausalLM { protected: - BasicModelForCausalLM(const ModelConfig &config, size_t mem_size, size_t scratch_size, size_t num_weights) - : BaseModelForCausalLM(config, mem_size, scratch_size, num_weights), transformer(&ctx_, config), - lm_head(&ctx_, config.hidden_size, config.vocab_size, false) { - CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_w.get()) == ggml_get_mem_size(ctx_.ctx_w.get())) - << "corrupted model 
weights"; - CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_kv.get()) + 1 * MB == ggml_get_mem_size(ctx_.ctx_kv.get())) - << "corrupted kv cache"; + BasicModelForCausalLM(const ModelConfig &config) + : BaseModelForCausalLM(config), transformer(mctx_.get(), config), + lm_head(mctx_.get(), config.hidden_size, config.vocab_size, false) { + if (config.tie_word_embeddings) { + lm_head.weight = transformer.word_embeddings.weight; + } } - ~BasicModelForCausalLM() { to_cpu(); } public: - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const override { - ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx); + ggml_tensor *transformer_outputs = transformer.forward(mctx, input_ids, n_past, n_ctx); // NOTE: only compute next token logits for decoding if (is_decoding && input_ids->ne[0] > 1) { - transformer_outputs = tensor_assign_buffers( - ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size, - (input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs))); + transformer_outputs = + ggml_view_1d(mctx->ctx_b.get(), transformer_outputs, config.hidden_size, + (input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)); } - ggml_tensor *lm_logits = lm_head.forward(ctx, transformer_outputs); + ggml_tensor *lm_logits = lm_head.forward(mctx, transformer_outputs); return lm_logits; } void load_prefix_cache(ggml_tensor *past_key_values) { transformer.load_prefix_cache(config, past_key_values); } - protected: - void to_cpu() { - for (auto &item : state_dict_) { - tensor_to_cpu(item.second); - } - - for (auto &layer : transformer.layers) { - tensor_to_cpu(layer.attention.k_cache); - tensor_to_cpu(layer.attention.v_cache); - } - } - - void to_device() { - for (auto &item : state_dict_) { - ggml_tensor *tensor = item.second; - // should not place embedding onto device - if (tensor != transformer.word_embeddings.weight) { - tensor_to_device(tensor); - } - } - - for (auto &layer : transformer.layers) { - tensor_to_device(layer.attention.k_cache); - tensor_to_device(layer.attention.v_cache); - } - } - public: Model transformer; Linear lm_head; - - protected: - StateDict state_dict_; }; // ===== ChatGLM-6B ===== @@ -877,49 +881,43 @@ class ChatGLMTokenizer : public BaseTokenizer { int pad_token_id; }; -struct GLMContextMasker { - ggml_tensor *operator()(ModelContext *ctx, ggml_tensor *attn_scores, int n_past) const; -}; - // NOTE: disable inplace norm since it causes nonsense on cuda when sequence length >= 144 -class GLMBlock : public BasicBlock { +class GLMBlock : public BasicBlock { public: GLMBlock() = default; - GLMBlock(ModelContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size, + GLMBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size, int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias, bool use_dense_bias, - bool interleaved_qkv, bool use_alibi, RopeType rope_type, float rope_theta, int rope_dim_scale, - AttentionMaskType attn_mask_type, int num_virtual_tokens) - : BasicBlock(LayerNorm(ctx, hidden_size, false, norm_eps), - BasicAttention(ctx, hidden_size, num_attention_heads, num_attention_heads, max_length, - use_qkv_bias, use_dense_bias, interleaved_qkv, use_alibi, rope_type, rope_theta, - rope_dim_scale, attn_mask_type, num_virtual_tokens), - 
LayerNorm(ctx, hidden_size, false, norm_eps), - BasicMLP(ctx, hidden_size, intermediate_size, hidden_act)), - alpha_value(std::sqrt(2.f * 28)) {} - - ggml_tensor *forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *position_ids, int n_past, - int n_ctx) const; + bool interleaved_qkv, RopeType rope_type, float rope_theta, AttentionMaskType attn_mask_type, + int num_virtual_tokens) + : BasicBlock(LayerNorm(mctx, hidden_size, false, norm_eps), + BasicAttention(mctx, hidden_size, num_attention_heads, num_attention_heads, max_length, + use_qkv_bias, use_dense_bias, interleaved_qkv, rope_type, rope_theta, + attn_mask_type, num_virtual_tokens), + LayerNorm(mctx, hidden_size, false, norm_eps), + BasicMLP(mctx, hidden_size, intermediate_size, hidden_act)), + alpha(std::sqrt(2.f * 28)) {} + + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, + ggml_tensor *position_ids, int n_past, int n_ctx) const; public: - float alpha_value; + float alpha; }; -using ChatGLMModel = BasicModel; +using ChatGLMModel = BasicModel; class ChatGLMForCausalLM : public BasicModelForCausalLM { public: ChatGLMForCausalLM(const ModelConfig &config); - void load(ModelLoader &loader) override; + void load_state_dict(const StateDict &sd) override; + + void set_graph_inputs(int qlen, int n_past, int n_ctx) const override; - static int num_weights(int num_hidden_layers) { return 4 + num_hidden_layers * 12; } + static void set_graph_inputs(ggml_cgraph *gf, int qlen, int n_past, int n_ctx); private: StateDict state_dict() const; - - public: - static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context - static constexpr size_t SCRATCH_SIZE = 1024 * MB; // 2k context }; // ===== ChatGLM2-6B ===== @@ -948,24 +946,22 @@ class ChatGLM2Tokenizer : public BaseTokenizer { int eop_token_id; }; -using GLM2Block = BasicBlock; +using GLM2Block = BasicBlock; -using ChatGLM2Model = BasicModel; +using ChatGLM2Model = BasicModel; class ChatGLM2ForCausalLM : public BasicModelForCausalLM { public: ChatGLM2ForCausalLM(const ModelConfig &config); - void load(ModelLoader &loader) override; + void load_state_dict(const StateDict &sd) override; + + void set_graph_inputs(int qlen, int n_past, int n_ctx) const override; - static int num_weights(int num_hidden_layers) { return 3 + num_hidden_layers * 8; } + static void set_graph_inputs(ggml_cgraph *gf, int qlen, int n_past, int n_ctx); private: StateDict state_dict() const; - - public: - static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context - static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context }; // ===== ChatGLM3-6B ===== @@ -1012,122 +1008,6 @@ using ChatGLM3Model = ChatGLM2Model; using ChatGLM3ForCausalLM = ChatGLM2ForCausalLM; -// ===== Baichuan ===== - -class BaichuanTokenizer : public BaseTokenizer { - public: - BaichuanTokenizer(std::string_view serialized_model_proto); - - std::vector encode(const std::string &text, int max_length) const override; - - std::string decode(const std::vector &ids, bool skip_special_tokens = true) const override; - - std::vector apply_chat_template(const std::vector &messages, int max_length) const override; - - private: - bool is_special_id(int id) const; - - static void truncate(std::vector &ids, int max_length); - - public: - static constexpr int USER_TOKEN_ID = 195; - static constexpr int ASSISTANT_TOKEN_ID = 196; - - sentencepiece::SentencePieceProcessor sp; - int bos_token_id; - int eos_token_id; - int pad_token_id; -}; - -// ===== Baichuan-7B ===== - -using Baichuan7BBlock = 
BasicBlock; - -using Baichuan7BModel = BasicModel; - -class Baichuan7BForCausalLM : public BasicModelForCausalLM { - public: - Baichuan7BForCausalLM(const ModelConfig &config); - - void load(ModelLoader &loader) override; - - static int num_weights(int num_hidden_layers) { return 3 + num_hidden_layers * 7; } - - private: - StateDict state_dict() const; - - public: - static constexpr size_t MEM_SIZE = 1280 * MB; - static constexpr size_t SCRATCH_SIZE = 1280 * MB; -}; - -// ===== Baichuan-13B ===== - -using Baichuan13BBlock = BasicBlock; - -using Baichuan13BModel = BasicModel; - -class Baichuan13BForCausalLM : public BasicModelForCausalLM { - public: - Baichuan13BForCausalLM(const ModelConfig &config); - - void load(ModelLoader &loader) override; - - static int num_weights(int num_hidden_layers) { return 3 + num_hidden_layers * 7; } - - private: - StateDict state_dict() const; - - public: - static constexpr size_t MEM_SIZE = 1280 * MB; - static constexpr size_t SCRATCH_SIZE = 1280 * MB; -}; - -// ===== InternLM ===== - -class InternLMTokenizer : public BaseTokenizer { - public: - InternLMTokenizer(std::string_view serialized_model_proto); - - std::vector encode(const std::string &text, int max_length) const override; - - std::string decode(const std::vector &ids, bool skip_special_tokens = true) const override; - - std::vector apply_chat_template(const std::vector &messages, int max_length) const override; - - static std::string build_prompt(const std::vector &messages); - - private: - bool is_special_id(int id) const { return id == unk_token_id || id == bos_token_id || id == eos_token_id; } - - public: - sentencepiece::SentencePieceProcessor sp; - static constexpr int unk_token_id = 0; - static constexpr int bos_token_id = 1; - static constexpr int eos_token_id = 2; -}; - -using InternLMBlock = BasicBlock; - -using InternLMModel = BasicModel; - -class InternLMForCausalLM : public BasicModelForCausalLM { - public: - InternLMForCausalLM(const ModelConfig &config); - - void load(ModelLoader &loader) override; - - static int num_weights(int num_hidden_layers, int hidden_size) { - return 3 + num_hidden_layers * (hidden_size == 4096 ? 
9 : 7); - } - - private: - StateDict state_dict() const; - - public: - static constexpr size_t MEM_SIZE = 1280 * MB; - static constexpr size_t SCRATCH_SIZE = 1280 * MB; -}; // ===== ChatGLM4-9B ===== // C++ port of BPE algorithm from https://github.com/openai/tiktoken/blob/main/src/lib.rs @@ -1209,10 +1089,12 @@ class Pipeline { ChatMessage chat(const std::vector &messages, const GenerationConfig &gen_config, BaseStreamer *streamer = nullptr) const; + protected: + std::unique_ptr mapped_file_; + public: std::unique_ptr tokenizer; std::unique_ptr model; - std::unique_ptr mapped_file; }; } // namespace chatglm diff --git a/chatglm_cpp/_C.pyi b/chatglm_cpp/_C.pyi index e1cd279d..c1457f32 100644 --- a/chatglm_cpp/_C.pyi +++ b/chatglm_cpp/_C.pyi @@ -3,13 +3,7 @@ ChatGLM.cpp python binding """ from __future__ import annotations import typing -__all__ = ['Baichuan13BForCausalLM', 'Baichuan7BForCausalLM', 'BaichuanTokenizer', 'BaseModelForCausalLM', 'BaseTokenizer', 'ChatGLM2ForCausalLM', 'ChatGLM2Tokenizer', 'ChatGLM3Tokenizer', 'ChatGLM4Tokenizer', 'ChatGLMForCausalLM', 'ChatGLMTokenizer', 'ChatMessage', 'CodeMessage', 'FunctionMessage', 'GenerationConfig', 'InternLMForCausalLM', 'InternLMTokenizer', 'ModelConfig', 'ModelType', 'Pipeline', 'ToolCallMessage'] -class Baichuan13BForCausalLM(BaseModelForCausalLM): - pass -class Baichuan7BForCausalLM(BaseModelForCausalLM): - pass -class BaichuanTokenizer(BaseTokenizer): - pass +__all__ = ['BaseModelForCausalLM', 'BaseTokenizer', 'ChatGLM2ForCausalLM', 'ChatGLM2Tokenizer', 'ChatGLM3Tokenizer', 'ChatGLM4Tokenizer', 'ChatGLMForCausalLM', 'ChatGLMTokenizer', 'ChatMessage', 'CodeMessage', 'FunctionMessage', 'GenerationConfig', 'ModelConfig', 'ModelType', 'Pipeline', 'ToolCallMessage'] class BaseModelForCausalLM: def generate_next_token(self, input_ids: list[int], gen_config: GenerationConfig, n_past: int, n_ctx: int) -> int: ... @@ -69,17 +63,12 @@ class GenerationConfig: max_context_length: int max_length: int max_new_tokens: int - num_threads: int repetition_penalty: float temperature: float top_k: int top_p: float - def __init__(self, max_length: int = 2048, max_new_tokens: int = -1, max_context_length: int = 512, do_sample: bool = True, top_k: int = 0, top_p: float = 0.7, temperature: float = 0.95, repetition_penalty: float = 1.0, num_threads: int = 0) -> None: + def __init__(self, max_length: int = 2048, max_new_tokens: int = -1, max_context_length: int = 512, do_sample: bool = True, top_k: int = 0, top_p: float = 0.7, temperature: float = 0.95, repetition_penalty: float = 1.0) -> None: ... -class InternLMForCausalLM(BaseModelForCausalLM): - pass -class InternLMTokenizer(BaseTokenizer): - pass class ModelConfig: @property def bos_token_id(self) -> int: @@ -137,21 +126,12 @@ class ModelType: CHATGLM3 CHATGLM4 - - BAICHUAN7B - - BAICHUAN13B - - INTERNLM """ - BAICHUAN13B: typing.ClassVar[ModelType] # value = - BAICHUAN7B: typing.ClassVar[ModelType] # value = CHATGLM: typing.ClassVar[ModelType] # value = CHATGLM2: typing.ClassVar[ModelType] # value = CHATGLM3: typing.ClassVar[ModelType] # value = CHATGLM4: typing.ClassVar[ModelType] # value = - INTERNLM: typing.ClassVar[ModelType] # value = - __members__: typing.ClassVar[dict[str, ModelType]] # value = {'CHATGLM': , 'CHATGLM2': , 'CHATGLM3': , 'CHATGLM4': , 'BAICHUAN7B': , 'BAICHUAN13B': , 'INTERNLM': } + __members__: typing.ClassVar[dict[str, ModelType]] # value = {'CHATGLM': , 'CHATGLM2': , 'CHATGLM3': , 'CHATGLM4': } def __eq__(self, other: typing.Any) -> bool: ... 
def __getstate__(self) -> int: diff --git a/chatglm_cpp/__init__.py b/chatglm_cpp/__init__.py index f7568fde..11f2beb3 100644 --- a/chatglm_cpp/__init__.py +++ b/chatglm_cpp/__init__.py @@ -6,7 +6,7 @@ import chatglm_cpp._C as _C from chatglm_cpp._C import ChatMessage -__version__ = "0.3.4" +__version__ = "0.4.0" @dataclass @@ -58,7 +58,6 @@ def chat( top_p: float = 0.7, temperature: float = 0.95, repetition_penalty: float = 1.0, - num_threads: int = 0, stream: bool = False, ) -> Union[Iterator[DeltaMessage], ChatMessage]: messages = [_ensure_chat_message(msg) for msg in messages] @@ -72,7 +71,6 @@ def chat( top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, - num_threads=num_threads, ) if stream: return self._stream_chat(input_ids=input_ids, gen_config=gen_config) @@ -90,7 +88,6 @@ def generate( top_p: float = 0.7, temperature: float = 0.95, repetition_penalty: float = 1.0, - num_threads: int = 0, stream: bool = False, ) -> Union[Iterator[str], str]: input_ids = self.tokenizer.encode(prompt, max_context_length) @@ -103,7 +100,6 @@ def generate( top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, - num_threads=num_threads, ) if stream: return self._stream_generate(input_ids=input_ids, gen_config=gen_config) diff --git a/chatglm_cpp/convert.py b/chatglm_cpp/convert.py index d325e806..bf69c4e9 100644 --- a/chatglm_cpp/convert.py +++ b/chatglm_cpp/convert.py @@ -49,9 +49,6 @@ class ModelType(Enum): CHATGLM2 = 2 CHATGLM3 = 3 CHATGLM4 = 4 - BAICHUAN7B = 1024 - BAICHUAN13B = 1025 - INTERNLM = 1280 def quantize_q8_0(tensor: torch.Tensor) -> torch.Tensor: @@ -387,142 +384,6 @@ def dump_tokenizer(f, tokenizer): f.write(vocab_text) -class BaichuanConverter(BaseConverter): - @classmethod - def dump_config(cls, f, config, ggml_type): - assert config.hidden_act == "silu", "unimplemented: hidden_act must be silu" - - config_version = 1 - config_values = [ - ggml_type.value, - config.vocab_size, - config.hidden_size, - config.num_attention_heads, - config.num_hidden_layers, - config.intermediate_size, - config.model_max_length, - config.bos_token_id if config.bos_token_id is not None else -1, - config.eos_token_id if config.eos_token_id is not None else -1, - config.pad_token_id if config.pad_token_id is not None else -1, - config.sep_token_id if config.sep_token_id is not None else -1, - ] - - f.write(struct.pack("i" * (1 + len(config_values)), config_version, *config_values)) - - @staticmethod - def dump_tokenizer(f, tokenizer): - serialized_model_proto = tokenizer.sp_model.serialized_model_proto() - f.write(struct.pack("i", len(serialized_model_proto))) - f.write(serialized_model_proto) - - @staticmethod - def dump_model(f, model, ggml_type): - weight_names = ["model.embed_tokens.weight"] - for i in range(model.config.num_hidden_layers): - weight_names += [ - f"model.layers.{i}.input_layernorm.weight", - f"model.layers.{i}.self_attn.W_pack.weight", - f"model.layers.{i}.self_attn.o_proj.weight", - f"model.layers.{i}.post_attention_layernorm.weight", - f"model.layers.{i}.mlp.gate_proj.weight", - f"model.layers.{i}.mlp.down_proj.weight", - f"model.layers.{i}.mlp.up_proj.weight", - ] - weight_names += [ - "model.norm.weight", - "lm_head.weight", - ] - - if model.config.vocab_size == 125696: - # For Baichuan2, normalize lm_head weight - model.lm_head.weight.data = F.normalize(model.lm_head.weight.data) - - dump_state_dict(f, weight_names, model.state_dict(), quantization_bit=None, ggml_type=ggml_type) - - -class Baichuan7BConverter(BaichuanConverter): - 
MODEL_TYPE = ModelType.BAICHUAN7B - - -class Baichuan13BConverter(BaichuanConverter): - MODEL_TYPE = ModelType.BAICHUAN13B - - -class InternLMConverter(BaseConverter): - MODEL_TYPE = ModelType.INTERNLM - - @staticmethod - def dump_config(f, config, ggml_type): - assert config.hidden_act == "silu", "unimplemented: hidden_act must be silu" - - config_version = 1 - config_values = [ - ggml_type.value, - config.vocab_size, - config.hidden_size, - config.num_attention_heads, - config.num_hidden_layers, - config.intermediate_size, - config.max_position_embeddings, - config.bos_token_id if config.bos_token_id is not None else -1, - config.eos_token_id if config.eos_token_id is not None else -1, - config.pad_token_id if config.pad_token_id is not None else -1, - config.sep_token_id if config.sep_token_id is not None else -1, - ] - - f.write(struct.pack("i" * (1 + len(config_values)), config_version, *config_values)) - - @staticmethod - def dump_tokenizer(f, tokenizer): - serialized_model_proto = tokenizer.sp_model.serialized_model_proto() - f.write(struct.pack("i", len(serialized_model_proto))) - f.write(serialized_model_proto) - - @staticmethod - def dump_model(f, model, ggml_type): - state_dict = model.state_dict() - for i in range(model.config.num_hidden_layers): - state_dict[f"model.layers.{i}.self_attn.qkv_proj.weight"] = torch.cat( - ( - state_dict[f"model.layers.{i}.self_attn.q_proj.weight"], - state_dict[f"model.layers.{i}.self_attn.k_proj.weight"], - state_dict[f"model.layers.{i}.self_attn.v_proj.weight"], - ), - dim=0, - ) - if model.config.bias: - state_dict[f"model.layers.{i}.self_attn.qkv_proj.bias"] = torch.cat( - ( - state_dict[f"model.layers.{i}.self_attn.q_proj.bias"], - state_dict[f"model.layers.{i}.self_attn.k_proj.bias"], - state_dict[f"model.layers.{i}.self_attn.v_proj.bias"], - ), - dim=0, - ) - - weight_names = ["model.embed_tokens.weight"] - for i in range(model.config.num_hidden_layers): - optional_qkv_proj_bias = [f"model.layers.{i}.self_attn.qkv_proj.bias"] if model.config.bias else [] - optional_o_proj_bias = [f"model.layers.{i}.self_attn.o_proj.bias"] if model.config.bias else [] - weight_names += [ - f"model.layers.{i}.input_layernorm.weight", - f"model.layers.{i}.self_attn.qkv_proj.weight", - *optional_qkv_proj_bias, - f"model.layers.{i}.self_attn.o_proj.weight", - *optional_o_proj_bias, - f"model.layers.{i}.post_attention_layernorm.weight", - f"model.layers.{i}.mlp.gate_proj.weight", - f"model.layers.{i}.mlp.up_proj.weight", - f"model.layers.{i}.mlp.down_proj.weight", - ] - weight_names += [ - "model.norm.weight", - "lm_head.weight", - ] - - dump_state_dict(f, weight_names, state_dict, quantization_bit=None, ggml_type=ggml_type) - - def convert(f: BinaryIO, model_name_or_path: str, lora_model_name_or_path: Optional[str] = None, dtype: str = "q4_0"): ggml_type = GGMLType[dtype.upper()] @@ -561,13 +422,6 @@ def convert(f: BinaryIO, model_name_or_path: str, lora_model_name_or_path: Optio ChatGLM2Converter.convert(f, model, tokenizer, ggml_type) else: ChatGLMConverter.convert(f, model, tokenizer, ggml_type) - elif model.config.model_type == "baichuan": - if model.config.hidden_size == 5120: - Baichuan13BConverter.convert(f, model, tokenizer, ggml_type) - else: - Baichuan7BConverter.convert(f, model, tokenizer, ggml_type) - elif model.config.model_type == "internlm": - InternLMConverter.convert(f, model, tokenizer, ggml_type) else: raise RuntimeError(f"Unknown model type {model.config.model_type}") diff --git a/chatglm_cpp/openai_api.py b/chatglm_cpp/openai_api.py index 
18805bd1..6d37844d 100644 --- a/chatglm_cpp/openai_api.py +++ b/chatglm_cpp/openai_api.py @@ -18,7 +18,6 @@ class Settings(BaseSettings): model: str = "models/chatglm3-ggml.bin" max_length: int = 4096 - num_threads: int = 0 class ToolCallFunction(BaseModel): @@ -146,7 +145,6 @@ def stream_chat(messages, body): do_sample=body.temperature > 0, top_p=body.top_p, temperature=body.temperature, - num_threads=settings.num_threads, stream=True, ): yield ChatCompletionResponse( diff --git a/chatglm_pybind.cpp b/chatglm_pybind.cpp index d24df96b..e6aab456 100644 --- a/chatglm_pybind.cpp +++ b/chatglm_pybind.cpp @@ -26,13 +26,19 @@ class PyBaseModelForCausalLM : public BaseModelForCausalLM { public: using BaseModelForCausalLM::BaseModelForCausalLM; - void load(ModelLoader &loader) override { PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load, loader); } + void load_state_dict(const StateDict &sd) override { + PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load_state_dict, sd); + } ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const override { PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx, is_decoding) } + + void set_graph_inputs(int qlen, int n_past, int n_ctx) const override { + PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, set_graph_inputs, qlen, n_past, n_ctx); + } }; template @@ -49,10 +55,7 @@ PYBIND11_MODULE(_C, m) { .value("CHATGLM", ModelType::CHATGLM) .value("CHATGLM2", ModelType::CHATGLM2) .value("CHATGLM3", ModelType::CHATGLM3) - .value("CHATGLM4", ModelType::CHATGLM4) - .value("BAICHUAN7B", ModelType::BAICHUAN7B) - .value("BAICHUAN13B", ModelType::BAICHUAN13B) - .value("INTERNLM", ModelType::INTERNLM); + .value("CHATGLM4", ModelType::CHATGLM4); py::class_(m, "ModelConfig") .def_readonly("model_type", &ModelConfig::model_type) @@ -73,9 +76,9 @@ PYBIND11_MODULE(_C, m) { .def_property_readonly("model_type_name", &ModelConfig::model_type_name); py::class_(m, "GenerationConfig") - .def(py::init(), "max_length"_a = 2048, - "max_new_tokens"_a = -1, "max_context_length"_a = 512, "do_sample"_a = true, "top_k"_a = 0, - "top_p"_a = 0.7, "temperature"_a = 0.95, "repetition_penalty"_a = 1.0, "num_threads"_a = 0) + .def(py::init(), "max_length"_a = 2048, "max_new_tokens"_a = -1, + "max_context_length"_a = 512, "do_sample"_a = true, "top_k"_a = 0, "top_p"_a = 0.7, "temperature"_a = 0.95, + "repetition_penalty"_a = 1.0) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_context_length", &GenerationConfig::max_context_length) @@ -83,8 +86,7 @@ PYBIND11_MODULE(_C, m) { .def_readwrite("top_k", &GenerationConfig::top_k) .def_readwrite("top_p", &GenerationConfig::top_p) .def_readwrite("temperature", &GenerationConfig::temperature) - .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) - .def_readwrite("num_threads", &GenerationConfig::num_threads); + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty); py::class_(m, "FunctionMessage") .def("__repr__", &to_string) @@ -148,20 +150,6 @@ PYBIND11_MODULE(_C, m) { py::class_(m, "ChatGLM4Tokenizer"); - // ===== Baichuan7B/13B ===== - - py::class_(m, "BaichuanTokenizer"); - - py::class_(m, "Baichuan7BForCausalLM"); - - py::class_(m, "Baichuan13BForCausalLM"); - - // ===== InternLM ===== - - py::class_(m, "InternLMTokenizer"); - - py::class_(m, "InternLMForCausalLM"); - // ===== Pipeline ==== 
py::class_(m, "Pipeline") diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 96590af1..1b480851 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -1,8 +1,9 @@ #include "chatglm.h" #include +#include #include -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA #include #include #endif @@ -11,38 +12,46 @@ namespace chatglm { namespace fs = std::filesystem; -static inline int get_num_threads() { - const char *chatglm_num_threads_env = getenv("CHATGLM_NUM_THREADS"); - int num_threads = chatglm_num_threads_env ? std::stoi(chatglm_num_threads_env) : get_default_num_threads(); - return num_threads; -} - static inline void expect_all_close(ggml_tensor *a, ggml_tensor *b, float atol = 1e-5f, float rtol = 0.f) { ASSERT_EQ(a->type, b->type); ASSERT_EQ(a->type, GGML_TYPE_F32); ASSERT_EQ(ggml_nelements(a), ggml_nelements(b)); + int64_t numel = ggml_nelements(a); + + std::vector a_buf(numel); + ggml_backend_tensor_get(a, a_buf.data(), 0, numel * sizeof(float)); + + std::vector b_buf(numel); + ggml_backend_tensor_get(b, b_buf.data(), 0, numel * sizeof(float)); + float max_abs_diff = 0.f; + float max_rel_diff = 0.f; int64_t num_mismatch = 0; for (int64_t i = 0; i < numel; i++) { - float ai = ((float *)a->data)[i]; - float bi = ((float *)b->data)[i]; + float ai = a_buf[i]; + float bi = b_buf[i]; float abs_diff = std::abs(ai - bi); max_abs_diff = std::max(max_abs_diff, abs_diff); if (abs_diff >= atol + rtol * std::abs(bi)) { num_mismatch++; } - } - EXPECT_EQ(num_mismatch, 0) << "Tensors are not close!\n\n" - << "Mismatched elements: " << num_mismatch << " / " << numel << " (" - << num_mismatch * 100 / numel << "%)\n" - << "Greatest absolute difference: " << max_abs_diff << " (up to " << std::scientific - << atol << " allowed)\n"; + float rel_diff = abs_diff / std::abs(bi); + max_rel_diff = std::max(max_rel_diff, rel_diff); + } + EXPECT_TRUE(num_mismatch == 0) << "Tensors are not close!\n\n" + << "Mismatched elements: " << num_mismatch << " / " << numel << " (" + << num_mismatch * 100 / numel << "%)\n" + << "Greatest absolute difference: " << max_abs_diff << " (up to " << std::scientific + << atol << std::defaultfloat << " allowed)\n" + << "Greatest relative difference: " << max_rel_diff << " (up to " << std::scientific + << rtol << std::defaultfloat << " allowed)\n"; } -static inline char *read_tensor_data(char *ptr, ggml_tensor *tensor) { - memcpy(tensor->data, ptr, ggml_nbytes(tensor)); - return ptr + ggml_nbytes(tensor); +static inline void read_backend_tensor_data(std::istream &is, ggml_tensor *tensor) { + std::vector> buf(ggml_nbytes(tensor)); + is.read((char *)buf.data(), buf.size()); + ggml_backend_tensor_set(tensor, buf.data(), 0, buf.size()); } static inline float random() { return rand() / (float)RAND_MAX; } @@ -54,21 +63,24 @@ static inline void random_fill(ggml_tensor *tensor) { for (float &v : values) { v = random(); } - int64_t hist[16]{}; if (tensor->type == GGML_TYPE_F32) { - memcpy(tensor->data, values.data(), sizeof(float) * values.size()); - } else if (tensor->type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row(values.data(), (ggml_fp16_t *)tensor->data, values.size()); - } else if (tensor->type == GGML_TYPE_Q8_0) { - ggml_quantize_q8_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); - } else if (tensor->type == GGML_TYPE_Q4_0) { - ggml_quantize_q4_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); - } else if (tensor->type == GGML_TYPE_Q4_1) { - ggml_quantize_q4_1(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], 
hist); + ggml_backend_tensor_set(tensor, values.data(), 0, sizeof(float) * values.size()); } else { - CHATGLM_THROW << "unsupported dtype " << ggml_type_name(tensor->type); - } + CHATGLM_THROW << "unsupported dtype " << tensor->type; + } + + // if (tensor->type == GGML_TYPE_F16) { + // ggml_fp32_to_fp16_row(values.data(), (ggml_fp16_t *)tensor->data, values.size()); + // } else if (tensor->type == GGML_TYPE_Q8_0) { + // ggml_quantize_q8_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); + // } else if (tensor->type == GGML_TYPE_Q4_0) { + // ggml_quantize_q4_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); + // } else if (tensor->type == GGML_TYPE_Q4_1) { + // ggml_quantize_q4_1(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); + // } else { + // CHATGLM_THROW << "unsupported dtype " << ggml_type_name(tensor->type); + // } } // return elapsed time in milliseconds @@ -77,14 +89,14 @@ static inline float timeit(std::function fn, int warmup, int active) { fn(); } -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA CHATGLM_CHECK_CUDA(cudaDeviceSynchronize()); #endif int64_t start_us = ggml_time_us(); for (int i = 0; i < active; i++) { fn(); } -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA CHATGLM_CHECK_CUDA(cudaDeviceSynchronize()); #endif int64_t end_us = ggml_time_us(); @@ -228,145 +240,112 @@ TEST(Sampling, TopP) { } } +static inline ggml_tensor *ggml_new_tensor_like(ggml_context *ctx, ggml_tensor *tensor) { + return ggml_new_tensor(ctx, tensor->type, ggml_n_dims(tensor), tensor->ne); +} + class ChatGLMTest : public ::testing::Test { protected: - ModelContext ctx; - - void SetUp() override { - ctx.dtype = GGML_TYPE_F32; - ctx.ctx_w = make_unique_ggml_context(1024 * MB, nullptr, false); - ctx.ctx_kv = make_unique_ggml_context(512 * MB, nullptr, false); - ctx.ctx_b = make_unique_ggml_context(512 * MB, nullptr, false); - ctx.scratch_buffer.resize(1 * MB); - ctx.scratch = {0, ctx.scratch_buffer.size(), ctx.scratch_buffer.data()}; -#ifdef GGML_USE_CUBLAS - ggml_cuda_set_scratch_size(ctx.scratch_buffer.size()); -#endif - ctx.init_device_context(); - - reset_cgraph(); - } - - void TearDown() override { -#ifdef GGML_USE_CUBLAS - ggml_cuda_free_scratch(); -#endif - } - - void reset_cgraph() { ctx.gf = {}; } + std::unique_ptr mctx_; - void cpu_graph_compute(int n_threads) { ggml_graph_compute_helper(ctx.work_buffer, &ctx.gf, n_threads); } + void SetUp() override { mctx_ = std::make_unique(GGML_TYPE_F32); } - void device_graph_compute(int n_threads) { -#ifdef GGML_USE_METAL - // ggml_metal_set_n_cb(ctx.ctx_metal.get(), n_threads); - ggml_metal_graph_compute(ctx.ctx_metal.get(), &ctx.gf); - // ggml_metal_get_tensor(ctx.ctx_metal.get(), output); -#else - cpu_graph_compute(n_threads); -#endif - } - - template - float _perf_graph_compute_impl() { - int num_threads = get_num_threads(); - auto fn = [this, num_threads] { - if constexpr (FALLBACK_CPU) { - cpu_graph_compute(num_threads); - } else { - device_graph_compute(num_threads); - } + float perf_graph_compute() { + auto fn = [this] { + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); }; -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_METAL) - return timeit(fn, 10, 100); -#else - return timeit(fn, 1, 3); -#endif + if (ggml_backend_is_cpu(mctx_->backend.get())) { + return timeit(fn, 1, 3); + } else { + return timeit(fn, 10, 100); + } } - float perf_cpu_graph_compute() { return _perf_graph_compute_impl(); } - float 
perf_device_graph_compute() { return _perf_graph_compute_impl(); } + template + static void set_graph_inputs(ggml_cgraph *gf, int qlen, int n_past, int n_ctx) { + static_assert(std::is_same_v || std::is_same_v, + "invalid model type"); + if (std::is_same_v) { + ChatGLMForCausalLM::set_graph_inputs(gf, qlen, n_past, n_ctx); + } else { + ChatGLM2ForCausalLM::set_graph_inputs(gf, qlen, n_past, n_ctx); + } + } template void test_model(Model &model, const ModelConfig &config, const fs::path &data_path, int seq_len, const std::vector &all_weights) { ASSERT_EQ(config.num_hidden_layers, 1); - MappedFile mapped_file(data_path.string()); - char *ptr = mapped_file.data; + std::ifstream ifs(data_path, std::ios::binary); + ASSERT_TRUE(ifs) << "cannot open file " << data_path; - ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len); - ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len); - ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1); - ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size); - ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1); - ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size); + ggml_tensor *x1 = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_I32, seq_len); + ggml_tensor *ref_y1 = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len); + ggml_tensor *x2 = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_I32, 1); + ggml_tensor *ref_y2 = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, config.hidden_size); + ggml_tensor *x3 = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_I32, 1); + ggml_tensor *ref_y3 = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, config.hidden_size); std::vector all_tensors = all_weights; all_tensors.insert(all_tensors.end(), {x1, ref_y1, x2, ref_y2, x3, ref_y3}); - std::vector cpu_tensors{model.word_embeddings.weight, x1, x2, x3}; - + ggml_tensor *past_key_values = nullptr; if (config.num_virtual_tokens > 0) { const int head_size = config.hidden_size / config.num_attention_heads; - ggml_tensor *past_key_values = - ggml_new_tensor_4d(ctx.ctx_b.get(), GGML_TYPE_F16, head_size, config.num_virtual_tokens, + past_key_values = + ggml_new_tensor_4d(mctx_->ctx_b.get(), GGML_TYPE_F16, head_size, config.num_virtual_tokens, config.num_kv_heads, config.num_hidden_layers * 2); // [l * 2, #h, v, d] - ptr = read_tensor_data(ptr, past_key_values); - model.load_prefix_cache(config, past_key_values); } - tensor_to_device(model.layers[0].attention.k_cache); - tensor_to_device(model.layers[0].attention.v_cache); + auto buf_b = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buf_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); - for (auto tensor : all_tensors) { - ptr = read_tensor_data(ptr, tensor); - if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) { - tensor_to_device(tensor); - } + if (config.num_virtual_tokens > 0) { + read_backend_tensor_data(ifs, past_key_values); + model.load_prefix_cache(config, past_key_values); } - ASSERT_EQ(ptr, mapped_file.data + mapped_file.size); + for (auto tensor : all_tensors) { + read_backend_tensor_data(ifs, tensor); + } + ASSERT_TRUE(ifs.peek() == EOF); // self attention { - reset_cgraph(); - ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, 
seq_len); - EXPECT_EQ(out_y1->backend, ref_y1->backend); - out_y1->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y1); - device_graph_compute(1); + ggml_graph_clear(mctx_->gf); + ggml_tensor *out_y1 = model.forward(mctx_.get(), x1, 0, seq_len); + ggml_build_forward_expand(mctx_->gf, out_y1); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + set_graph_inputs(mctx_->gf, seq_len, 0, seq_len); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref_y1, out_y1, 5e-4); } // cross attention { - reset_cgraph(); - ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len); - EXPECT_EQ(out_y2->backend, ref_y2->backend); - out_y2->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y2); - device_graph_compute(1); + ggml_graph_clear(mctx_->gf); + ggml_tensor *out_y2 = model.forward(mctx_.get(), x2, seq_len, seq_len); + ggml_build_forward_expand(mctx_->gf, out_y2); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + set_graph_inputs(mctx_->gf, 1, seq_len, seq_len); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref_y2, out_y2, 5e-4); } { - reset_cgraph(); - ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len); - EXPECT_EQ(out_y3->backend, ref_y3->backend); - out_y3->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y3); - device_graph_compute(1); + ggml_graph_clear(mctx_->gf); + ggml_tensor *out_y3 = model.forward(mctx_.get(), x3, seq_len + 1, seq_len); + ggml_build_forward_expand(mctx_->gf, out_y3); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + set_graph_inputs(mctx_->gf, 1, seq_len + 1, seq_len); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref_y3, out_y3, 5e-4); } - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } - tensor_to_cpu(model.layers[0].attention.k_cache); - tensor_to_cpu(model.layers[0].attention.v_cache); } }; @@ -377,50 +356,56 @@ TEST_F(ChatGLMTest, Embedding) { float y_data[]{0.5684, -1.0845, -1.3986, -0.4033, -0.5966, 0.1820, 1.5410, -0.2934, -2.1788, 0.4033, 0.8380, -0.7193, -0.4033, -0.5966, 0.1820}; - ggml_tensor *x = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 5); - memcpy(x->data, x_data, sizeof(x_data)); - Embedding model(&ctx, 4, 3); - memcpy(model.weight->data, w_data, sizeof(w_data)); - ggml_tensor *ref = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 3, 5); - ref->data = y_data; + ggml_tensor *x = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_I32, 5); + Embedding model(mctx_.get(), 4, 3); + ggml_tensor *ref = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 3, 5); + + auto buf_b = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); - ggml_tensor *out = model.forward(&ctx, x); + ggml_backend_tensor_set(x, x_data, 0, sizeof(x_data)); + ggml_backend_tensor_set(model.weight, w_data, 0, sizeof(w_data)); + ggml_backend_tensor_set(ref, y_data, 0, sizeof(y_data)); - ggml_build_forward_expand(&ctx.gf, out); - cpu_graph_compute(1); + ggml_tensor *out = model.forward(mctx_.get(), x); + + ggml_build_forward_expand(mctx_->gf, out); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + 
CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref, out); } TEST_F(ChatGLMTest, Linear) { fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/linear.data"; - MappedFile mapped_file(test_path.string()); - char *ptr = mapped_file.data; - - ggml_tensor *w = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 32, 16); - ptr = read_tensor_data(ptr, w); - ggml_tensor *b = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, 16); - ptr = read_tensor_data(ptr, b); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 32, 2); - ptr = read_tensor_data(ptr, x); - ggml_tensor *ref = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 16, 2); - ptr = read_tensor_data(ptr, ref); - ASSERT_EQ(ptr, mapped_file.data + mapped_file.size); - - // GEMV data - ggml_tensor *vx = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, 32); - memcpy(vx->data, x->data, 32 * sizeof(float)); - ggml_tensor *vref = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, 16); - memcpy(vref->data, ref->data, 16 * sizeof(float)); - - tensor_to_device(x); - tensor_to_device(vx); + std::ifstream ifs(test_path, std::ios::binary); + ASSERT_TRUE(ifs) << "cannot open file " << test_path; + + ggml_tensor *w = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32, 16); + ggml_tensor *b = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32, 2); + ggml_tensor *ref = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16, 2); + + ggml_tensor *vec_x = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32); + ggml_tensor *vec_ref = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16); + + auto buf_b = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + + read_backend_tensor_data(ifs, w); + read_backend_tensor_data(ifs, b); + read_backend_tensor_data(ifs, x); + read_backend_tensor_data(ifs, ref); + + read_backend_tensor_data(ifs, vec_x); + read_backend_tensor_data(ifs, vec_ref); + + ASSERT_TRUE(ifs.peek() == EOF); struct TestCase { ggml_tensor *x; ggml_tensor *ref; }; - std::vector cases{{x, ref}, {vx, vref}}; + std::vector cases{{x, ref}}; struct TestConfig { ggml_type dtype; @@ -428,175 +413,176 @@ TEST_F(ChatGLMTest, Linear) { float rtol; }; std::vector test_configs{ - {GGML_TYPE_F32, 1e-5, 0}, - {GGML_TYPE_F16, 1e-2, 5e-4}, + {GGML_TYPE_F32, 1e-5, 0}, {GGML_TYPE_F16, 1e-2, 5e-4}, {GGML_TYPE_Q8_0, 0.15, 5e-4}, + {GGML_TYPE_Q5_0, 0.8, 0.1}, {GGML_TYPE_Q5_1, 0.8, 0.1}, {GGML_TYPE_Q4_1, 1.0, 0.2}, {GGML_TYPE_Q4_0, 1.0, 0.2}, }; for (const auto &config : test_configs) { - ctx.dtype = config.dtype; - Linear model(&ctx, 32, 16); + mctx_->dtype = config.dtype; + Linear model(mctx_.get(), 32, 16); + auto buf_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); + + auto ctx = make_unique_ggml_context(1024 * 1024, nullptr, false); + ggml_tensor *w_cpu = ggml_new_tensor_like(ctx.get(), w); + ggml_backend_tensor_get(w, w_cpu->data, 0, ggml_nbytes(w)); + ggml_tensor *wq_cpu = ggml_new_tensor_2d(ctx.get(), config.dtype, w_cpu->ne[0], w_cpu->ne[1]); if (config.dtype == GGML_TYPE_F32) { - model.weight->data = w->data; + wq_cpu = w_cpu; } else if (config.dtype == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float *)w->data, (ggml_fp16_t *)model.weight->data, ggml_nelements(model.weight)); - } else if (config.dtype == GGML_TYPE_Q4_0) { - int64_t hist[16]{}; - 
ggml_quantize_q4_0((float *)w->data, model.weight->data, ggml_nelements(w), w->ne[0], hist); + ggml_fp32_to_fp16_row((float *)w_cpu->data, (ggml_fp16_t *)wq_cpu->data, ggml_nelements(w_cpu)); } else { - CHATGLM_THROW << "unsupported dtype " << config.dtype; + ggml_quantize_chunk(config.dtype, (float *)w_cpu->data, wq_cpu->data, 0, w_cpu->ne[1], w_cpu->ne[0], + nullptr); } - model.bias->data = b->data; - tensor_to_device(model.weight); - tensor_to_device(model.bias); + ggml_backend_tensor_set(model.weight, wq_cpu->data, 0, ggml_nbytes(model.weight)); + ggml_backend_tensor_copy(b, model.bias); for (const auto &c : cases) { - reset_cgraph(); - ggml_tensor *out = model.forward(&ctx, c.x); - EXPECT_EQ(out->backend, c.x->backend); - out->backend = GGML_BACKEND_CPU; - - ggml_build_forward_expand(&ctx.gf, out); - device_graph_compute(get_num_threads()); - - EXPECT_EQ(out->type, GGML_TYPE_F32); + ggml_graph_clear(mctx_->gf); + ggml_tensor *out = model.forward(mctx_.get(), c.x); + + ggml_build_forward_expand(mctx_->gf, out); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); + + // if (config.dtype == GGML_TYPE_F16) { + // std::cout << "dtype " << config.dtype << '\n'; + // auto ref_cpu = ggml_new_tensor_like(ctx.get(), c.ref); + // ggml_backend_tensor_get(c.ref, ref_cpu->data, 0, ggml_nbytes(c.ref)); + // auto out_cpu = ggml_new_tensor_like(ctx.get(), out); + // ggml_backend_tensor_get(out, out_cpu->data, 0, ggml_nbytes(out)); + // auto weight_cpu = ggml_new_tensor_like(ctx.get(), model.weight); + // ggml_backend_tensor_get(model.weight, weight_cpu->data, 0, ggml_nbytes(model.weight)); + + // std::cout << "c.ref " << to_string(ref_cpu) << '\n' + // << "out " << to_string(out_cpu) << '\n' + // << "weight " << to_string(weight_cpu) << '\n'; + // } expect_all_close(c.ref, out, config.atol, config.rtol); } - - tensor_to_cpu(model.weight); - tensor_to_cpu(model.bias); } - tensor_to_cpu(x); - tensor_to_cpu(vx); } TEST_F(ChatGLMTest, BenchmarkLinear) { constexpr int M = 64, N = 1024, K = 1024 * 3; - ctx.dtype = GGML_TYPE_F32; - Linear m(&ctx, K, N); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, K, M); + mctx_->dtype = GGML_TYPE_F32; + Linear m(mctx_.get(), K, N); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, K, M); + + ggml_tensor *y = m.forward(mctx_.get(), x); + ggml_build_forward_expand(mctx_->gf, y); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); std::vector all_tensors{m.weight, m.bias, x}; for (auto tensor : all_tensors) { random_fill(tensor); - tensor_to_device(tensor); } - ggml_tensor *y = m.forward(&ctx, x); - ggml_build_forward_expand(&ctx.gf, y); - std::cout << "[Benchmark] Linear " << ggml_type_name(ctx.dtype) << " time: " << perf_device_graph_compute() - << " ms\n"; - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } + std::cout << "[Benchmark] Linear " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; } TEST_F(ChatGLMTest, LayerNorm) { fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/layer_norm.data"; - MappedFile mapped_file(test_path.string()); - char *ptr = mapped_file.data; + std::ifstream ifs(test_path, std::ios::binary); + ASSERT_TRUE(ifs) << "cannot open file " << test_path; + + LayerNorm model(mctx_.get(), 64); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 3); + ggml_tensor *ref = 
ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 3); - LayerNorm model(&ctx, 64); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 64, 3); - ggml_tensor *ref = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 64, 3); + auto buf_b = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); std::vector all_tensors{model.weight, model.bias, x, ref}; for (auto tensor : all_tensors) { - ptr = read_tensor_data(ptr, tensor); - tensor_to_device(tensor); + read_backend_tensor_data(ifs, tensor); } - ASSERT_EQ(ptr, mapped_file.data + mapped_file.size); + ASSERT_TRUE(ifs.peek() == EOF); - ggml_tensor *out = model.forward(&ctx, x); - EXPECT_EQ(out->backend, x->backend); - out->backend = GGML_BACKEND_CPU; + ggml_tensor *out = model.forward(mctx_.get(), x); - ggml_build_forward_expand(&ctx.gf, out); - device_graph_compute(get_num_threads()); + ggml_build_forward_expand(mctx_->gf, out); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref, out); - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } } TEST_F(ChatGLMTest, BenchmarkLayerNorm) { constexpr int seq_len = 64; constexpr int hidden = 1024; - LayerNorm m(&ctx, hidden); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, hidden, seq_len); + LayerNorm m(mctx_.get(), hidden); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, hidden, seq_len); + + auto buffer = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buffer_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); std::vector all_tensors{m.weight, m.bias, x}; for (auto tensor : all_tensors) { random_fill(tensor); - tensor_to_device(tensor); } - ggml_tensor *y = m.forward(&ctx, x); - ggml_build_forward_expand(&ctx.gf, y); - std::cout << "[Benchmark] LayerNorm " << ggml_type_name(ctx.dtype) << " time: " << perf_device_graph_compute() + ggml_tensor *y = m.forward(mctx_.get(), x); + ggml_build_forward_expand(mctx_->gf, y); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + std::cout << "[Benchmark] LayerNorm " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } } TEST_F(ChatGLMTest, RMSNorm) { fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/rms_norm.data"; - MappedFile mapped_file(test_path.string()); - char *ptr = mapped_file.data; + std::ifstream ifs(test_path, std::ios::binary); + ASSERT_TRUE(ifs) << "cannot open file " << test_path; - RMSNorm model(&ctx, 64); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 64, 3); - ggml_tensor *ref = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 64, 3); + RMSNorm model(mctx_.get(), 64); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 3); + ggml_tensor *ref = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 3); + + auto buf_b = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); 
std::vector all_tensors{model.weight, x, ref}; for (auto tensor : all_tensors) { - ptr = read_tensor_data(ptr, tensor); - tensor_to_device(tensor); + read_backend_tensor_data(ifs, tensor); } - ASSERT_EQ(ptr, mapped_file.data + mapped_file.size); + ASSERT_TRUE(ifs.peek() == EOF); - ggml_tensor *out = model.forward(&ctx, x); - EXPECT_EQ(out->backend, x->backend); - out->backend = GGML_BACKEND_CPU; + ggml_tensor *out = model.forward(mctx_.get(), x); - ggml_build_forward_expand(&ctx.gf, out); - device_graph_compute(get_num_threads()); + ggml_build_forward_expand(mctx_->gf, out); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); expect_all_close(ref, out); - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } } TEST_F(ChatGLMTest, BenchmarkRMSNorm) { constexpr int seq_len = 64; constexpr int hidden = 1024; - RMSNorm m(&ctx, hidden); - ggml_tensor *x = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, hidden, seq_len); + RMSNorm m(mctx_.get(), hidden); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, hidden, seq_len); + + auto buffer = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); + auto buffer_w = + unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); std::vector all_tensors{m.weight, x}; for (auto tensor : all_tensors) { random_fill(tensor); - tensor_to_device(tensor); } - ggml_tensor *y = m.forward(&ctx, x); - ggml_build_forward_expand(&ctx.gf, y); - std::cout << "[Benchmark] RMSNorm " << ggml_type_name(ctx.dtype) << " time: " << perf_device_graph_compute() - << " ms\n"; - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } + ggml_tensor *y = m.forward(mctx_.get(), x); + ggml_build_forward_expand(mctx_->gf, y); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + std::cout << "[Benchmark] RMSNorm " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; } TEST_F(ChatGLMTest, GLMModel) { @@ -605,16 +591,14 @@ TEST_F(ChatGLMTest, GLMModel) { ModelConfig config( ModelType::CHATGLM, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/8, /*num_hidden_layers=*/1, /*intermediate_size=*/128, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::GELU, /*use_qkv_bias=*/true, /*use_dense_bias=*/true, - /*interleaved_qkv=*/true, /*use_alibi=*/false, /*rope_type=*/RopeType::CHATGLM, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/-1, - /*attn_mask_type=*/AttentionMaskType::CHATGLM, /*num_virtual_tokens=*/0, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/0, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - ChatGLMModel model(&ctx, config); + ChatGLMModel model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, @@ -641,16 +625,14 @@ TEST_F(ChatGLMTest, GLMPTuningV2Model) { ModelConfig config( ModelType::CHATGLM, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/8, /*num_hidden_layers=*/1, /*intermediate_size=*/128, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::GELU, /*use_qkv_bias=*/true, /*use_dense_bias=*/true, - /*interleaved_qkv=*/true, /*use_alibi=*/false, /*rope_type=*/RopeType::CHATGLM, 
/*rope_theta=*/10000.f, - /*rope_dim_scale=*/-1, - /*attn_mask_type=*/AttentionMaskType::CHATGLM, /*num_virtual_tokens=*/5, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/5, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - ChatGLMModel model(&ctx, config); + ChatGLMModel model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, @@ -677,16 +659,14 @@ TEST_F(ChatGLMTest, GLM2Model) { ModelConfig config( ModelType::CHATGLM2, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/2, /*num_hidden_layers=*/1, /*intermediate_size=*/48, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/true, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::GPTJ, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/2, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/0, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - ChatGLM2Model model(&ctx, config); + ChatGLM2Model model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, @@ -708,16 +688,14 @@ TEST_F(ChatGLMTest, GLM3Model) { ModelConfig config( ModelType::CHATGLM3, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/2, /*num_hidden_layers=*/1, /*intermediate_size=*/48, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/true, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::GPTJ, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/2, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/0, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - ChatGLM3Model model(&ctx, config); + ChatGLM3Model model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, @@ -739,16 +717,14 @@ TEST_F(ChatGLMTest, GLM3PTuningV2Model) { ModelConfig config( ModelType::CHATGLM3, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/2, /*num_hidden_layers=*/1, /*intermediate_size=*/48, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/true, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::GPTJ, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/2, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/5, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/5, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - ChatGLM3Model model(&ctx, config); + ChatGLM3Model model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, @@ -770,114 +746,20 @@ TEST_F(ChatGLMTest, GLM4Model) { ModelConfig config( ModelType::CHATGLM4, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, /*num_kv_heads=*/2, 
/*num_hidden_layers=*/1, /*intermediate_size=*/48, /*norm_eps=*/1e-5f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/true, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::GPTJ, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/2, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, - /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, - /*extra_eos_token_ids=*/{}); - - constexpr int seq_len = 3; - - ChatGLM4Model model(&ctx, config); - - std::vector all_weights{model.word_embeddings.weight, - model.layers[0].input_layernorm.weight, - model.layers[0].attention.query_key_value.weight, - model.layers[0].attention.query_key_value.bias, - model.layers[0].attention.dense.weight, - model.layers[0].post_attention_layernorm.weight, - model.layers[0].mlp.gate_proj.weight, - model.layers[0].mlp.up_proj.weight, - model.layers[0].mlp.down_proj.weight, - model.final_layernorm.weight}; - - test_model(model, config, data_path, seq_len, all_weights); -} - -TEST_F(ChatGLMTest, Baichuan7BModel) { - fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan7b_model.data"; - - ModelConfig config( - ModelType::BAICHUAN7B, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, - /*num_kv_heads=*/8, /*num_hidden_layers=*/1, /*intermediate_size=*/32 * 3, /*norm_eps=*/1e-6f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/false, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::NEOX, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/1, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, - /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, - /*extra_eos_token_ids=*/{}); - - constexpr int seq_len = 3; - - Baichuan7BModel model(&ctx, config); - - std::vector all_weights{model.word_embeddings.weight, - model.layers[0].input_layernorm.weight, - model.layers[0].attention.query_key_value.weight, - model.layers[0].attention.dense.weight, - model.layers[0].post_attention_layernorm.weight, - model.layers[0].mlp.gate_proj.weight, - model.layers[0].mlp.down_proj.weight, - model.layers[0].mlp.up_proj.weight, - model.final_layernorm.weight}; - - test_model(model, config, data_path, seq_len, all_weights); -} - -TEST_F(ChatGLMTest, Baichuan13BModel) { - fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan13b_model.data"; - - ModelConfig config( - ModelType::BAICHUAN13B, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, - /*num_kv_heads=*/8, /*num_hidden_layers=*/1, /*intermediate_size=*/32 * 3, /*norm_eps=*/1e-6f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/false, /*use_dense_bias=*/false, - /*interleaved_qkv=*/false, /*use_alibi=*/true, /*rope_type=*/RopeType::DISABLED, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/-1, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, + /*rope_theta=*/10000.f, + /*num_virtual_tokens=*/0, /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, /*extra_eos_token_ids=*/{}); constexpr int seq_len = 3; - Baichuan13BModel model(&ctx, config); - - std::vector all_weights{model.word_embeddings.weight, - model.layers[0].input_layernorm.weight, - model.layers[0].attention.query_key_value.weight, - model.layers[0].attention.dense.weight, - 
model.layers[0].post_attention_layernorm.weight, - model.layers[0].mlp.gate_proj.weight, - model.layers[0].mlp.down_proj.weight, - model.layers[0].mlp.up_proj.weight, - model.final_layernorm.weight}; - - test_model(model, config, data_path, seq_len, all_weights); -} - -TEST_F(ChatGLMTest, InternLMModel) { - fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/internlm_model.data"; - - ModelConfig config( - ModelType::INTERNLM, GGML_TYPE_F32, /*vocab_size=*/5, /*hidden_size=*/32, /*num_attention_heads=*/8, - /*num_kv_heads=*/8, /*num_hidden_layers=*/1, /*intermediate_size=*/32 * 3, /*norm_eps=*/1e-6f, - /*hidden_act=*/ActivationType::SILU, /*use_qkv_bias=*/true, /*use_dense_bias=*/true, - /*interleaved_qkv=*/false, /*use_alibi=*/false, /*rope_type=*/RopeType::NEOX, /*rope_theta=*/10000.f, - /*rope_dim_scale=*/1, - /*attn_mask_type=*/AttentionMaskType::CAUSAL, /*num_virtual_tokens=*/0, - /*max_length=*/8, /*bos_token_id=*/-1, /*eos_token_id=*/-1, /*pad_token_id=*/-1, /*sep_token_id=*/-1, - /*extra_eos_token_ids=*/{}); - - constexpr int seq_len = 3; - - InternLMModel model(&ctx, config); + ChatGLM4Model model(mctx_.get(), config); std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, model.layers[0].attention.query_key_value.weight, model.layers[0].attention.query_key_value.bias, model.layers[0].attention.dense.weight, - model.layers[0].attention.dense.bias, model.layers[0].post_attention_layernorm.weight, model.layers[0].mlp.gate_proj.weight, model.layers[0].mlp.up_proj.weight, @@ -888,8 +770,7 @@ TEST_F(ChatGLMTest, InternLMModel) { } TEST_F(ChatGLMTest, quantize) { - GTEST_SKIP() << "Skipping quantization data generation"; - float src_data[]{ + const float src_data[]{ -1.1258e+00, -1.1524e+00, -2.5058e-01, -4.3388e-01, 8.4871e-01, 6.9201e-01, -3.1601e-01, -2.1152e+00, 3.2227e-01, -1.2633e+00, 3.4998e-01, 3.0813e-01, 1.1984e-01, 1.2377e+00, 1.1168e+00, -2.4728e-01, -1.3527e+00, -1.6959e+00, 5.6665e-01, 7.9351e-01, 5.9884e-01, -1.5551e+00, -3.4136e-01, 1.8530e+00, @@ -923,68 +804,115 @@ TEST_F(ChatGLMTest, quantize) { -1.4777e+00, -1.7557e+00, 7.6166e-02, -1.0786e+00, 1.4403e+00, -1.1059e-01, 5.7686e-01, -1.6917e-01, -6.4025e-02, 1.0384e+00, 9.0682e-01, -4.7551e-01, -8.7074e-01, 1.4474e-01, 1.9029e+00, 3.9040e-01}; - ggml_tensor *src = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, 128, 2); + const int q8_ref[]{ + 68, 36, -68, -69, -15, -26, 51, 42, -19, -127, 19, -76, 21, 19, 7, 74, 67, -15, -81, -102, 34, + 48, 36, -93, -20, 111, 45, -35, -10, 11, 83, 95, 57, -51, -32, 38, -23, 1, -18, 9, 16, 4, + 24, 16, -4, 30, -11, 2, 19, 86, -55, -59, -25, 33, 39, 7, -9, -15, 20, -15, -17, 28, 57, + 127, -57, -46, 68, -21, 45, 37, -28, 46, 55, 64, -73, 127, -23, 17, -81, -27, -24, -25, -53, 55, + -7, 40, -5, 34, -41, 0, 42, -20, 51, 18, -12, 114, -93, -2, -52, -47, 2, 35, 69, 37, 80, + -66, 17, 25, -127, -82, -11, 14, 12, 4, 16, 22, 22, -42, 38, -45, -11, -118, -4, -2, 47, 17, + -45, -3, -30, -23, 93, -20, 6, 57, 45, 67, -35, 35, -58, -27, -52, 37, 40, 30, 127, -69, -5, + 8, -39, -81, -7, -67, -20, 16, -17, 8, 16, 76, 19, 25, -13, -8, 13, -50, -124, 9, -46, 20, + 10, 25, 88, 34, 78, -80, 24, -9, 81, -40, 61, 50, 25, 17, -10, 29, -25, -87, 19, -113, -5, + -29, -126, -29, 87, 11, 127, 63, -87, -58, 119, 22, 54, -80, -96, -97, 45, 33, -55, 53, 40, 39, + 17, 87, -28, 101, -7, -108, 39, 59, 66, -119, 60, -54, 116, 10, 95, 83, -124, 98, -49, -127, 95, + 127, -66, 56, 28, -3, -105, -11, -84, 35, -23, 105, 13, 25, -10, 43, -19, -89, -9, -36, -35, 57, + 
-59, 56, 77, -118, -99, -117, 5, -72, 96, -7, 38, -11, -4, 69, 61, -32, -58, 10, 127, 26}; + + const int q4_0_ref[]{59, 52, 52, 36, -89, -74, -85, 43, 119, -16, -71, 99, 121, -103, -40, -19, + -52, 87, -46, -74, -87, 104, 105, -121, -105, -104, 118, -105, -104, 102, 73, 8, + -57, -77, 75, -100, 34, -75, -118, 101, -75, -124, 93, -112, 89, 119, -99, 26, + -23, -118, -69, -75, -120, 101, 58, 53, 125, 20, -119, -118, -80, -109, 87, -119, + 105, 120, -23, 121, -119, -59, -70, -59, -50, -77, -100, -118, 123, 54, 117, 102, + -112, -116, 120, -72, -6, 125, -72, 124, 121, 103, 75, -78, -125, -83, -10, -87, + 51, 123, 4, 69, -42, -57, 25, 118, 90, -35, -25, -17, 34, -79, 27, 117, + 37, 54, -9, 35, -70, -14, 40, 15, -58, 68, 100, -113, -12, -101, -99, -77, + -23, -15, -121, -42, 41, -123, 105, -98, -119, 74, 74, -92, -52, 116, 3, 111}; + + const int q4_1_ref[]{60, 52, 59, -64, 52, 36, -89, -74, -85, 43, 119, -16, -71, 99, 121, -103, + -40, -19, -52, 87, 85, 53, 89, -66, 51, 117, -125, 86, 70, 69, 103, 70, + 52, 119, -108, -11, 6, 28, -96, 48, -65, 52, -121, -65, 100, -103, 74, 107, + -111, 95, -91, -121, 97, -28, 5, 101, 51, 58, 102, -103, -42, 52, 58, -63, + -114, 20, -118, -102, -64, -93, 104, -118, 121, 121, -6, 122, -102, -58, -53, -42, + 28, 52, -102, -65, 100, -122, -124, -54, -102, -103, 127, 115, -121, 72, 5, -125, + 87, -109, -122, -104, -80, 50, 63, -66, 124, 99, 9, 103, -36, -123, -5, -70, + 41, 72, -9, -103, -74, 50, 41, 33, 122, 49, 34, -67, -28, -117, -38, -54, + 9, -35, 86, 13, -41, -15, 74, -69, -101, 112, 27, 116, -47, 51, 11, -65, + 22, 14, -120, 57, -41, 122, -90, 114, 119, -75, -75, 91, 68, -117, -4, -112}; + + const int q5_0_ref[]{ + 59, 48, 48, 125, -100, 121, 103, 55, 78, 109, 86, 69, -34, -32, 98, -58, -13, 18, -79, -55, + 120, -82, -46, -78, 7, -51, -79, -79, 51, -64, -78, -1, 30, 47, -35, 46, 32, -36, -111, 0, + 126, 101, 119, 55, 34, -79, 81, 95, 45, 125, 20, -54, 89, 8, -71, 32, -93, -18, 42, 35, + -61, 3, 119, 105, 1, -53, 58, 49, -115, 95, -68, -12, -6, 24, 2, 3, 96, 38, -81, 2, + -62, -48, -62, -29, 19, 123, 101, -118, -50, -81, -121, 125, -63, 22, 39, -13, -25, 107, -21, -36, + 32, 25, -31, 111, -11, -6, 97, -40, -13, -34, 75, -82, 42, -76, 15, -29, 22, 74, -3, 65, + 86, -11, 8, -118, -67, 126, 17, -36, -109, -85, -50, -50, 34, -83, 65, -93, -48, -28, 23, -7, + 75, 107, -2, 69, 100, -13, 65, 14, -117, -103, -56, 15, -40, 23, -99, -81, -47, -105, -85, 25, + -61, -13, -2, -99, 65, 27, -78, 27, 17, 116, -124, 73, 119, -7, 6, -33}; + + const int q5_1_ref[]{25, 48, 59, -64, 48, 125, -100, 121, 104, 56, 95, 125, 87, 70, -18, -16, + 99, -57, -13, 35, -79, -38, -119, -81, 41, 49, 89, -66, 0, 32, 4, 76, + 102, -6, 7, -69, -115, 123, -34, 125, 121, -17, 56, -6, 13, 40, 81, 96, + -104, 48, -121, -65, 46, -96, -46, -126, -55, 36, 117, -42, 51, -81, 74, 15, + -78, -39, 10, -38, 102, 101, -36, 35, -82, 48, 58, -63, -51, 95, -67, -12, + 13, 25, 20, 37, -128, 70, -64, 20, -28, -14, -12, -11, 53, -84, -121, -68, + -13, 47, -102, -65, 120, -126, 62, -23, -40, 12, 25, -108, 36, 35, -17, -25, + 31, -112, 11, 5, -82, 39, 29, 33, 121, 46, 63, -66, -43, 75, -16, 28, + -7, -58, 2, -50, -87, 27, -9, 118, 83, -126, -18, 35, 108, 101, 66, 66, + 76, 45, 34, -67, -66, 92, 47, 27, -23, 22, -76, -92, 2, -70, -84, 12, + -65, -14, 116, 103, 55, -15, 55, -23, -112, 47, 11, -65, 46, 104, 84, -26, + 44, 12, 1, 98, -66, -28, 77, -44, -18, -118, 122, -74, -121, 6, -7, 32}; + + auto ctx = make_unique_ggml_context(1024 * 1024, nullptr, false); + + ggml_tensor *src = ggml_new_tensor_2d(ctx.get(), 
GGML_TYPE_F32, 128, 2); memcpy(src->data, src_data, sizeof(src_data)); + [[maybe_unused]] auto qtensor_to_string = [](ggml_tensor *tensor) { + std::ostringstream oss; + oss << "Q8: ["; + for (size_t i = 0; i < ggml_nbytes(tensor); i++) { + oss << (i > 0 ? ", " : "") << (int)((char *)tensor->data)[i]; + } + oss << "]"; + return oss.str(); + }; + // q8_0 { - ggml_tensor *q8_dst = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_Q8_0, 128, 2); - int64_t hist[16]{}; - ggml_quantize_q8_0((float *)src->data, q8_dst->data, ggml_nelements(src), src->ne[0], hist); - - std::cout << "Q8: ["; - for (size_t i = 0; i < ggml_nbytes(q8_dst); i++) { - std::cout << (i > 0 ? ", " : "") << (int)((char *)q8_dst->data)[i]; - } - std::cout << "]\n"; + ggml_tensor *q8_dst = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_Q8_0, 128, 2); + ggml_quantize_chunk(GGML_TYPE_Q8_0, (float *)src->data, q8_dst->data, 0, src->ne[1], src->ne[0], nullptr); + // std::cout << qtensor_to_string(q8_dst) << '\n'; + EXPECT_TRUE(memcmp(q8_dst->data, q8_ref, sizeof(q8_ref))); } // q4_0 { - ggml_tensor *q4_dst = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_Q4_0, 128, 2); - int64_t hist[16]{}; - ggml_quantize_q4_0((float *)src->data, q4_dst->data, ggml_nelements(src), src->ne[0], hist); - - std::cout << "Q4_0: ["; - for (size_t i = 0; i < ggml_nbytes(q4_dst); i++) { - std::cout << (i > 0 ? ", " : "") << (int)((char *)q4_dst->data)[i]; - } - std::cout << "]\n"; + ggml_tensor *q4_dst = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_Q4_0, 128, 2); + ggml_quantize_chunk(GGML_TYPE_Q4_0, (float *)src->data, q4_dst->data, 0, src->ne[1], src->ne[0], nullptr); + // std::cout << qtensor_to_string(q4_dst) << '\n'; + EXPECT_TRUE(memcmp(q4_dst->data, q4_0_ref, sizeof(q4_0_ref))); } // q4_1 { - ggml_tensor *q4_dst = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_Q4_1, 128, 2); - int64_t hist[16]{}; - ggml_quantize_q4_1((float *)src->data, q4_dst->data, ggml_nelements(src), src->ne[0], hist); - - std::cout << "Q4_1: ["; - for (size_t i = 0; i < ggml_nbytes(q4_dst); i++) { - std::cout << (i > 0 ? ", " : "") << (int)((char *)q4_dst->data)[i]; - } - std::cout << "]\n"; + ggml_tensor *q4_dst = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_Q4_1, 128, 2); + ggml_quantize_chunk(GGML_TYPE_Q4_1, (float *)src->data, q4_dst->data, 0, src->ne[1], src->ne[0], nullptr); + // std::cout << qtensor_to_string(q4_dst) << '\n'; + EXPECT_TRUE(memcmp(q4_dst->data, q4_1_ref, sizeof(q4_1_ref))); } // q5_0 { - ggml_tensor *q5_dst = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_Q5_0, 128, 2); - int64_t hist[16]{}; - ggml_quantize_q5_0((float *)src->data, q5_dst->data, ggml_nelements(src), src->ne[0], hist); - - std::cout << "Q5_0: ["; - for (size_t i = 0; i < ggml_nbytes(q5_dst); i++) { - std::cout << (i > 0 ? ", " : "") << (int)((char *)q5_dst->data)[i]; - } - std::cout << "]\n"; + ggml_tensor *q5_dst = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_Q5_0, 128, 2); + ggml_quantize_chunk(GGML_TYPE_Q5_0, (float *)src->data, q5_dst->data, 0, src->ne[1], src->ne[0], nullptr); + // std::cout << qtensor_to_string(q5_dst) << '\n'; + EXPECT_TRUE(memcmp(q5_dst->data, q5_0_ref, sizeof(q5_0_ref))); } // q5_1 { - ggml_tensor *q5_dst = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_Q5_1, 128, 2); - int64_t hist[16]{}; - ggml_quantize_q5_1((float *)src->data, q5_dst->data, ggml_nelements(src), src->ne[0], hist); - - std::cout << "Q5_1: ["; - for (size_t i = 0; i < ggml_nbytes(q5_dst); i++) { - std::cout << (i > 0 ? 
", " : "") << (int)((char *)q5_dst->data)[i]; - } - std::cout << "]\n"; + ggml_tensor *q5_dst = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_Q5_1, 128, 2); + ggml_quantize_chunk(GGML_TYPE_Q5_1, (float *)src->data, q5_dst->data, 0, src->ne[1], src->ne[0], nullptr); + // std::cout << qtensor_to_string(q5_dst) << '\n'; + EXPECT_TRUE(memcmp(q5_dst->data, q5_1_ref, sizeof(q5_1_ref))); } } @@ -1156,8 +1084,10 @@ TEST(Pipeline, ChatGLM2) { } static inline std::string read_text(const fs::path &path) { - MappedFile mapped_file(path.string()); - return std::string(mapped_file.data, mapped_file.size); + std::ifstream ifs(path); + std::ostringstream oss; + oss << ifs.rdbuf(); + return oss.str(); } TEST(Pipeline, ChatGLM3) { @@ -1272,8 +1202,7 @@ TEST(Pipeline, ChatGLM3) { { ChatMessage output = pipeline.chat(messages, gen_config); EXPECT_EQ(output.role, ChatMessage::ROLE_ASSISTANT); - EXPECT_EQ(output.content, - "根据API调用结果,我为您生成了一个随机数,随机数的范围在0到100之间。这个随机数是22。"); + EXPECT_EQ(output.content, "根据您的要求,我使用随机数生成器API生成了一个在0和100之间的随机数,结果为22。"); } } @@ -1292,10 +1221,10 @@ TEST(Pipeline, ChatGLM3) { 质数是指只能被1和它本身整除的正整数。例如,2、3、5、7等都是质数。 -让我们开始吧!)"); +让我们开始计算。)"); EXPECT_EQ(output.tool_calls.front().code.input, R"(```python -# Function to check if a number is prime def is_prime(n): + """Check if a number is prime.""" if n <= 1: return False if n <= 3: @@ -1500,12 +1429,12 @@ TEST(Pipeline, CodeGeeX2) { std::string prompt = "# language: Python\n# write a bubble sort function\n"; std::string target = R"( -def bubble_sort(lst): - for i in range(len(lst) - 1): - for j in range(len(lst) - 1 - i): - if lst[j] > lst[j + 1]: - lst[j], lst[j + 1] = lst[j + 1], lst[j] - return lst +def bubble_sort(list): + for i in range(len(list) - 1): + for j in range(len(list) - 1): + if list[j] > list[j + 1]: + list[j], list[j + 1] = list[j + 1], list[j] + return list print(bubble_sort([5, 4, 3, 2, 1])))"; @@ -1515,214 +1444,6 @@ print(bubble_sort([5, 4, 3, 2, 1])))"; } } -TEST(Pipeline, Baichuan13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan-13b-chat-ggml.bin"; - if (!fs::exists(model_path)) { - GTEST_SKIP() << "Skipping Baichuan13B e2e test (ggml model not found)"; - } - Pipeline pipeline(model_path.string()); - EXPECT_TRUE(dynamic_cast(pipeline.model.get())); - - // tokenizer - { - std::vector cases{ - {"你是谁", {9875, 21797}}, - {"我是百川大模型,是由百川智能的工程师们创造的大语言模型,我可以和人类进行自然交流、解答问题、协助创作,帮" - "助大众轻松、普惠的获得世界知识和专业服务。如果你有任何问题,可以随时向我提问", - {6323, 31161, 31745, 32213, 31175, 14830, 72, 16347, 31745, 32213, 6358, 31135, 14823, 31212, 8823, - 5114, 7234, 14830, 72, 31182, 1231, 31188, 8627, 1696, 3823, 5536, 76, 17133, 1766, 76, - 16345, 11769, 72, 4090, 13169, 8385, 76, 31840, 32195, 31135, 4137, 2781, 3317, 31188, 2285, - 1910, 73, 6011, 31169, 4315, 1766, 72, 1231, 11533, 31490, 31182, 21934}}}; - check_tokenizer(pipeline.tokenizer.get(), cases); - - std::vector messages{ - {ChatMessage::ROLE_USER, "你好呀"}, - {ChatMessage::ROLE_ASSISTANT, "你好!很高兴和你交流。请问有什么我可以帮助你的吗?"}, - {ChatMessage::ROLE_USER, "你叫什么名字?"}, - }; - std::vector input_ids = pipeline.tokenizer->apply_chat_template(messages, 2048); - std::vector target_input_ids{195, 9875, 31213, 32889, 196, 9875, 31213, 74, 17318, 31906, - 14822, 5536, 73, 20389, 7713, 31182, 1231, 4090, 2689, 31763, - 75, 195, 9875, 32177, 1534, 10240, 75, 196}; - EXPECT_TRUE(equal(input_ids, target_input_ids)); - } - - // memory test - { - GenerationConfig gen_config; - gen_config.max_length = 512; - gen_config.max_context_length = gen_config.max_length - 1; - gen_config.do_sample = 
false; - - std::vector input_ids(gen_config.max_context_length, 128); - pipeline.generate(input_ids, gen_config); - } - - // chat - { - check_chat_format(pipeline); - GenerationConfig gen_config; - gen_config.do_sample = false; - gen_config.repetition_penalty = 1.1; - std::vector messages{{ChatMessage::ROLE_USER, "你好呀"}}; - ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好!很高兴见到你。请问有什么我可以帮助你的吗?"); - } -} - -TEST(Pipeline, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; - if (!fs::exists(model_path)) { - GTEST_SKIP() << "Skipping Baichuan2-7B e2e test (ggml model not found)"; - } - Pipeline pipeline(model_path.string()); - EXPECT_TRUE(dynamic_cast(pipeline.model.get())); - - // tokenizer - { - std::vector cases{ - {"你是谁", {92067}}, - {"我是百川大模型,是由百川智能的工程师们创造的大语言模型,我可以和人类进行自然交流、解答问题、协助创作,帮" - "助大众轻松、普惠的获得世界知识和专业服务。如果你有任何问题,可以随时向我提问", - {6461, 70335, 92366, 9528, 65, 10879, 70335, 3932, 92333, 8832, 92414, 5034, - 3133, 5002, 9528, 65, 28756, 92385, 5243, 1697, 2559, 3341, 69, 10474, - 1754, 69, 9036, 7356, 65, 2716, 7499, 4892, 69, 24816, 92333, 2693, - 2089, 23672, 1940, 1760, 66, 4173, 23181, 1754, 65, 65351, 39975, 14590}}}; - check_tokenizer(pipeline.tokenizer.get(), cases); - - std::vector messages{ - {ChatMessage::ROLE_USER, "你好呀"}, - {ChatMessage::ROLE_ASSISTANT, "你好!很高兴和你交流。请问有什么问题我可以帮助你解决吗?"}, - {ChatMessage::ROLE_USER, "你叫什么名字?"}, - }; - std::vector input_ids = pipeline.tokenizer->apply_chat_template(messages, 2048); - std::vector target_input_ids{195, 16829, 94278, 196, 16829, 67, 52160, 10329, 3341, - 66, 23216, 5817, 1754, 92392, 21777, 92430, 2740, 93122, - 68, 195, 92430, 93410, 1747, 6642, 68, 196}; - EXPECT_TRUE(equal(input_ids, target_input_ids)); - } - - // memory test - { - GenerationConfig gen_config; - gen_config.max_length = 2048; - gen_config.max_context_length = gen_config.max_length - 1; - gen_config.do_sample = false; - - std::vector input_ids(gen_config.max_context_length, 128); - pipeline.generate(input_ids, gen_config); - } - - // chat - { - check_chat_format(pipeline); - GenerationConfig gen_config; - gen_config.do_sample = false; - gen_config.repetition_penalty = 1.05; - std::vector messages{{ChatMessage::ROLE_USER, "你好呀"}}; - ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好!很高兴为你服务。请问有什么问题我可以帮助你解决?"); - } -} - -TEST(Pipeline, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; - if (!fs::exists(model_path)) { - GTEST_SKIP() << "Skipping Baichuan2-13B e2e test (ggml model not found)"; - } - Pipeline pipeline(model_path.string()); - EXPECT_TRUE(dynamic_cast(pipeline.model.get())); - - // tokenizer - { - std::vector cases{ - {"你是谁", {92067}}, - {"我是百川大模型,是由百川智能的工程师们创造的大语言模型,我可以和人类进行自然交流、解答问题、协助创作,帮" - "助大众轻松、普惠的获得世界知识和专业服务。如果你有任何问题,可以随时向我提问", - {6461, 70335, 92366, 9528, 65, 10879, 70335, 3932, 92333, 8832, 92414, 5034, - 3133, 5002, 9528, 65, 28756, 92385, 5243, 1697, 2559, 3341, 69, 10474, - 1754, 69, 9036, 7356, 65, 2716, 7499, 4892, 69, 24816, 92333, 2693, - 2089, 23672, 1940, 1760, 66, 4173, 23181, 1754, 65, 65351, 39975, 14590}}}; - check_tokenizer(pipeline.tokenizer.get(), cases); - - std::vector messages{ - {ChatMessage::ROLE_USER, "你好呀"}, - {ChatMessage::ROLE_ASSISTANT, "你好!很高兴和你交流。请问有什么我可以帮助你的吗?"}, - {ChatMessage::ROLE_USER, "你叫什么名字?"}, - }; - std::vector input_ids = pipeline.tokenizer->apply_chat_template(messages, 2048); - std::vector 
target_input_ids{195, 16829, 94278, 196, 16829, 67, 52160, 10329, 3341, 66, 23216, 5817, - 92392, 21777, 2193, 93122, 68, 195, 92430, 93410, 1747, 6642, 68, 196}; - EXPECT_TRUE(equal(input_ids, target_input_ids)); - } - - // chat - { - check_chat_format(pipeline); - GenerationConfig gen_config; - gen_config.do_sample = false; - gen_config.repetition_penalty = 1.05; - std::vector messages{{ChatMessage::ROLE_USER, "你好呀"}}; - ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好!很高兴见到你。请问有什么我可以帮助你的吗?"); - } -} - -TEST(Pipeline, InternLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; - if (!fs::exists(model_path)) { - GTEST_SKIP() << "Skipping InternLM e2e test (ggml model not found)"; - } - Pipeline pipeline(model_path.string()); - EXPECT_TRUE(dynamic_cast(pipeline.model.get())); - - // tokenizer - { - std::vector cases{ - {"你好", {1, 76379}}, - {"<|User|>:你好\n<|Bot|>:你好,有什么我可以帮助你的吗?\n<|User|>:晚上睡不着应该怎么办\n<|" - "Bot|>:", - {1, 333, 352, 1621, 352, 27232, 76379, 103027, 364, 333, 352, 23845, 352, 27232, - 76379, 98899, 68408, 73159, 67566, 67513, 61056, 99050, 103028, 364, 333, 352, 1621, 352, - 27232, 67891, 76046, 67551, 68573, 103027, 364, 333, 352, 23845, 352, 27232}, - true}}; - check_tokenizer(pipeline.tokenizer.get(), cases); - } - - // prompter - { - EXPECT_EQ(InternLMTokenizer::build_prompt({{ChatMessage::ROLE_USER, "你好"}}), "<|User|>:你好\n<|Bot|>:"); - EXPECT_EQ(InternLMTokenizer::build_prompt({ - {ChatMessage::ROLE_USER, "你好"}, - {ChatMessage::ROLE_ASSISTANT, "你好,有什么我可以帮助你的吗?"}, - {ChatMessage::ROLE_USER, "晚上睡不着应该怎么办"}, - }), - "<|User|>:你好\n<|Bot|>:你好,有什么我可以帮助你的吗?\n<|User|>:晚上睡不着应该怎么办" - "\n<|Bot|>:"); - } - - // memory test - { - GenerationConfig gen_config; - gen_config.max_length = 2048; - gen_config.max_context_length = gen_config.max_length - 1; - gen_config.do_sample = false; - - std::vector input_ids(gen_config.max_context_length, 128); - pipeline.generate(input_ids, gen_config); - } - - // chat - { - check_chat_format(pipeline); - GenerationConfig gen_config; - gen_config.do_sample = false; - std::vector messages{{ChatMessage::ROLE_USER, "你好"}}; - ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好!有什么我可以帮助你的吗?"); - } -} - static void run_benchmark(const fs::path &model_path) { if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping benchmark test (model " << model_path << " not found)"; @@ -1742,7 +1463,6 @@ static void run_benchmark(const fs::path &model_path) { GenerationConfig gen_config; gen_config.do_sample = false; - gen_config.num_threads = get_num_threads(); PerfStreamer streamer; start_ms = ggml_time_ms(); @@ -1750,7 +1470,6 @@ static void run_benchmark(const fs::path &model_path) { int64_t gen_s = (ggml_time_ms() - start_ms) / 1000.f; std::cout << "======== benchmark results for " << model_path.filename() << " ========\n" - << "using #threads: " << gen_config.num_threads << "\n" << "model loaded within: " << load_model_ms << " ms\n" << "generation finished within: " << gen_s << " s\n" << streamer.to_string() << "\n" @@ -1772,24 +1491,4 @@ TEST(Benchmark, ChatGLM4) { run_benchmark(model_path); } -TEST(Benchmark, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; - run_benchmark(model_path); -} - -TEST(Benchmark, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; - run_benchmark(model_path); -} - -TEST(Benchmark, 
InternLM7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; - run_benchmark(model_path); -} - -TEST(Benchmark, InternLM20B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-20b-ggml.bin"; - run_benchmark(model_path); -} - } // namespace chatglm diff --git a/examples/chatglm3_demo.py b/examples/chatglm3_demo.py index 783b58ad..6013596d 100644 --- a/examples/chatglm3_demo.py +++ b/examples/chatglm3_demo.py @@ -302,7 +302,6 @@ def main(): top_p=top_p, temperature=temperature, repetition_penalty=1.0, - num_threads=0, stream=True, ): response += chunk.content diff --git a/examples/web_demo.py b/examples/web_demo.py index 0d61baa9..8ce52bc2 100644 --- a/examples/web_demo.py +++ b/examples/web_demo.py @@ -17,7 +17,6 @@ parser.add_argument("--top_p", default=0.7, type=float, help="top-p sampling") parser.add_argument("--temp", default=0.95, type=float, help="temperature") parser.add_argument("--repeat_penalty", default=1.0, type=float, help="penalize repeat sequence of tokens") -parser.add_argument("-t", "--threads", default=0, type=int, help="number of threads for inference") parser.add_argument("--plain", action="store_true", help="display in plain text without markdown support") args = parser.parse_args() @@ -42,7 +41,6 @@ def predict(input, chatbot, max_length, top_p, temperature, messages): top_p=top_p, temperature=temperature, repetition_penalty=args.repeat_penalty, - num_threads=args.threads, stream=True, ) diff --git a/main.cpp b/main.cpp index 6a14765e..9b8166d9 100644 --- a/main.cpp +++ b/main.cpp @@ -35,7 +35,6 @@ struct Args { float top_p = 0.7; float temp = 0.95; float repeat_penalty = 1.0; - int num_threads = 0; bool verbose = false; }; @@ -112,8 +111,6 @@ static Args parse_args(const std::vector &argv) { args.temp = std::stof(argv.at(++i)); } else if (arg == "--repeat_penalty") { args.repeat_penalty = std::stof(argv.at(++i)); - } else if (arg == "-t" || arg == "--threads") { - args.num_threads = std::stoi(argv.at(++i)); } else if (arg == "-v" || arg == "--verbose") { args.verbose = true; } else { @@ -185,7 +182,7 @@ static void chat(Args &args) { auto streamer = std::make_unique(std::move(streamers)); chatglm::GenerationConfig gen_config(args.max_length, args.max_new_tokens, args.max_context_length, args.temp > 0, - args.top_k, args.top_p, args.temp, args.repeat_penalty, args.num_threads); + args.top_k, args.top_p, args.temp, args.repeat_penalty); if (args.verbose) { std::cout << "system info: | " @@ -210,8 +207,7 @@ static void chat(Args &args) { << "top_k = " << args.top_k << " | " << "top_p = " << args.top_p << " | " << "temperature = " << args.temp << " | " - << "repetition_penalty = " << args.repeat_penalty << " | " - << "num_threads = " << args.num_threads << " |\n"; + << "repetition_penalty = " << args.repeat_penalty << " |\n"; std::cout << "loaded " << pipeline.model->config.model_type_name() << " model from " << args.model_path << " within: " << (end_load_us - start_load_us) / 1000.f << " ms\n"; diff --git a/tests/data/linear.data b/tests/data/linear.data index febc77d0969162ee41aa6335b362076e63451f95..9ae4c1b5011620a3fe97432f6508d9c1749cba95 100644 GIT binary patch delta 76 zcmV-S0JHzV6Mz-4zzLH;2!KFpYuQLYX(7`6(2+Z delta 7 OcmZn=Js`Z{04D$ossiEw diff --git a/tests/perf.sh b/tests/perf.sh index 79108750..c045148f 100644 --- a/tests/perf.sh +++ b/tests/perf.sh @@ -2,15 +2,9 @@ export CUDA_VISIBLE_DEVICES=0 -# InternLM-7B -hf_model=internlm/internlm-chat-7b 
-ggml_model=models/internlm-chat-7b-ggml.bin -benchmark=Benchmark.InternLM7B - -# InternLM-20B -# hf_model=internlm/internlm-chat-20b -# ggml_model=models/internlm-chat-20b-ggml.bin -# benchmark=Benchmark.InternLM20B +hf_model=THUDM/chatglm3-6b +ggml_model=models/chatglm3-ggml.bin +benchmark=Benchmark.ChatGLM2 # ChatGLM4-9B hf_model=THUDM/glm-4-9b-chat @@ -19,10 +13,10 @@ benchmark=Benchmark.ChatGLM4 for dtype in f16 q8_0 q5_1 q5_0 q4_1 q4_0; do python3 chatglm_cpp/convert.py -i $hf_model -o $ggml_model -t $dtype - for use_cublas in ON; do - cmake -B build -DGGML_CUBLAS=$use_cublas && cmake --build build -j + for use_cuda in OFF ON; do + cmake -B build -DGGML_CUDA=$use_cuda && cmake --build build -j for i in $(seq 3); do - echo "[benchmark] dtype=$dtype use_cublas=$use_cublas round=$i" + echo "[benchmark] dtype=$dtype use_cuda=$use_cuda round=$i" ./build/bin/chatglm_test --gtest_filter="$benchmark" done done diff --git a/tests/perplexity.cpp b/tests/perplexity.cpp index bd9fbb08..22ddff16 100644 --- a/tests/perplexity.cpp +++ b/tests/perplexity.cpp @@ -9,7 +9,6 @@ struct Args { std::string corpus_path = "data/wikitext-2-raw/wiki.test.raw"; int max_length = 1024; int stride = 512; - int num_threads = 0; }; static void usage(const std::string &prog) { @@ -21,7 +20,6 @@ static void usage(const std::string &prog) { -f, --file path to the corpus -l, --max_length N max total length including prompt and output -s, --stride N stride size of the sliding window - -t, --threads N number of threads for inference )"; } @@ -42,8 +40,6 @@ static Args parse_args(const std::vector &argv) { args.max_length = std::stoi(argv.at(++i)); } else if (arg == "-s" || arg == "--stride") { args.stride = std::stoi(argv.at(++i)); - } else if (arg == "-t" || arg == "--threads") { - args.num_threads = std::stoi(argv.at(++i)); } else { std::cerr << "Unknown argument: " << arg << std::endl; usage(argv.at(0)); @@ -59,7 +55,7 @@ static Args parse_args(int argc, char **argv) { return parse_args(argv_vec); } -static std::string read_text(std::string path) { +static std::string read_text(const std::string &path) { std::ifstream fin(path); CHATGLM_CHECK(fin) << "cannot open file " << path; std::ostringstream oss; @@ -68,8 +64,8 @@ static std::string read_text(std::string path) { } static float cross_entropy(const ggml_tensor *input, const ggml_tensor *target) { - CHATGLM_CHECK(ggml_is_contiguous(input) && input->n_dims == 2 && input->type == GGML_TYPE_F32); - CHATGLM_CHECK(ggml_is_contiguous(target) && target->n_dims == 1 && target->type == GGML_TYPE_I32); + CHATGLM_CHECK(ggml_is_contiguous(input) && ggml_n_dims(input) == 2 && input->type == GGML_TYPE_F32); + CHATGLM_CHECK(ggml_is_contiguous(target) && ggml_n_dims(target) == 1 && target->type == GGML_TYPE_I32); CHATGLM_CHECK(input->ne[1] == target->ne[0]); const int num_classes = input->ne[0]; @@ -108,7 +104,7 @@ static void perplexity(Args &args) { float total_loss = 0.f; size_t num_samples = 0; - std::vector buf; + std::vector> buf; size_t prev_end = 0; for (size_t begin = 0; begin < corpus_ids.size(); begin += args.stride) { @@ -117,15 +113,20 @@ static void perplexity(Args &args) { size_t target_len = std::min(end - prev_end, size_t(args.max_length - 1)); std::vector input_ids(corpus_ids.begin() + begin, corpus_ids.begin() + end); - ggml_tensor *lm_logits = pipeline.model->forward_graph_compute(input_ids, 0, 0, args.num_threads, false); + ggml_tensor *lm_logits = pipeline.model->forward_graph_compute(input_ids, 0, 0, false); const auto clk_fwd = 
std::chrono::system_clock::now(); - buf.resize(ggml_nbytes(lm_logits) + 16 * chatglm::MB); + buf.resize(ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_nbytes(lm_logits) + + target_len * ggml_type_size(GGML_TYPE_I32)); auto ctx = chatglm::make_unique_ggml_context(buf.size(), buf.data(), false); - ggml_tensor *next_lm_logits = ggml_view_2d(ctx.get(), lm_logits, lm_logits->ne[0], target_len, lm_logits->nb[1], - (input_ids.size() - target_len - 1) * lm_logits->nb[1]); + ggml_tensor *lm_logits_cpu = ggml_new_tensor(ctx.get(), lm_logits->type, ggml_n_dims(lm_logits), lm_logits->ne); + ggml_backend_tensor_get(lm_logits, lm_logits_cpu->data, 0, ggml_nbytes(lm_logits)); + ggml_tensor *next_lm_logits = + ggml_view_2d(ctx.get(), lm_logits_cpu, lm_logits_cpu->ne[0], target_len, lm_logits_cpu->nb[1], + (input_ids.size() - target_len - 1) * lm_logits_cpu->nb[1]); + ggml_tensor *next_input_ids = ggml_new_tensor_1d(ctx.get(), GGML_TYPE_I32, target_len); memcpy(next_input_ids->data, input_ids.data() + input_ids.size() - target_len, target_len * sizeof(int)); diff --git a/tests/ppl.sh b/tests/ppl.sh index fa3d88f1..bf334ee9 100644 --- a/tests/ppl.sh +++ b/tests/ppl.sh @@ -10,14 +10,6 @@ ggml_model=models/chatglm3-base-ggml.bin hf_model=THUDM/glm-4-9b ggml_model=models/chatglm4-base-ggml.bin -# Baichuan2-7B-Base -# hf_model=baichuan-inc/Baichuan2-7B-Base -# ggml_model=models/baichuan2-7b-base-ggml.bin - -# InternLM -# hf_model=internlm/internlm-7b -# ggml_model=models/internlm-7b-base-ggml.bin - for dtype in f16 q8_0 q5_1 q5_0 q4_1 q4_0; do python3 chatglm_cpp/convert.py -i $hf_model -o $ggml_model -t $dtype echo "[perplexity] dtype=$dtype" diff --git a/tests/test_chatglm_cpp.py b/tests/test_chatglm_cpp.py index 18cdafd3..e4b42c85 100644 --- a/tests/test_chatglm_cpp.py +++ b/tests/test_chatglm_cpp.py @@ -10,11 +10,6 @@ CHATGLM3_MODEL_PATH = PROJECT_ROOT / "models/chatglm3-ggml.bin" CHATGLM4_MODEL_PATH = PROJECT_ROOT / "models/chatglm4-ggml.bin" CODEGEEX2_MODEL_PATH = PROJECT_ROOT / "models/codegeex2-ggml.bin" -BAICHUAN13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan-13b-chat-ggml.bin" -BAICHUAN2_7B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-7b-chat-ggml.bin" -BAICHUAN2_13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-13b-chat-ggml.bin" -INTERNLM7B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-7b-ggml.bin" -INTERNLM20B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-20b-ggml.bin" def test_chatglm_version(): @@ -90,12 +85,12 @@ def test_codegeex2_pipeline(): prompt = "# language: Python\n# write a bubble sort function\n" target = """ -def bubble_sort(lst): - for i in range(len(lst) - 1): - for j in range(len(lst) - 1 - i): - if lst[j] > lst[j + 1]: - lst[j], lst[j + 1] = lst[j + 1], lst[j] - return lst +def bubble_sort(list): + for i in range(len(list) - 1): + for j in range(len(list) - 1): + if list[j] > list[j + 1]: + list[j], list[j + 1] = list[j + 1], list[j] + return list print(bubble_sort([5, 4, 3, 2, 1]))""" @@ -109,46 +104,6 @@ def bubble_sort(lst): assert stream_output == target -@pytest.mark.skipif(not BAICHUAN13B_MODEL_PATH.exists(), reason="model file not found") -def test_baichuan13b_pipeline(): - check_pipeline( - model_path=BAICHUAN13B_MODEL_PATH, - prompt="你好呀", - target="你好!很高兴见到你。请问有什么我可以帮助你的吗?", - gen_kwargs=dict(repetition_penalty=1.1), - ) - - -@pytest.mark.skipif(not BAICHUAN2_7B_MODEL_PATH.exists(), reason="model file not found") -def test_baichuan2_7b_pipeline(): - check_pipeline( - model_path=BAICHUAN2_7B_MODEL_PATH, - prompt="你好呀", - 
target="你好!很高兴为你服务。请问有什么问题我可以帮助你解决?", - gen_kwargs=dict(repetition_penalty=1.05), - ) - - -@pytest.mark.skipif(not BAICHUAN2_13B_MODEL_PATH.exists(), reason="model file not found") -def test_baichuan2_13b_pipeline(): - check_pipeline( - model_path=BAICHUAN2_13B_MODEL_PATH, - prompt="你好呀", - target="你好!很高兴见到你。请问有什么我可以帮助你的吗?", - gen_kwargs=dict(repetition_penalty=1.05), - ) - - -@pytest.mark.skipif(not INTERNLM7B_MODEL_PATH.exists(), reason="model file not found") -def test_internlm7b_pipeline(): - check_pipeline(model_path=INTERNLM7B_MODEL_PATH, prompt="你好", target="你好!有什么我可以帮助你的吗?") - - -@pytest.mark.skipif(not INTERNLM20B_MODEL_PATH.exists(), reason="model file not found") -def test_internlm20b_pipeline(): - check_pipeline(model_path=INTERNLM20B_MODEL_PATH, prompt="你好", target="你好!有什么我可以帮助你的吗?") - - @pytest.mark.skipif(not CHATGLM4_MODEL_PATH.exists(), reason="model file not found") def test_langchain_api(): import os diff --git a/tests/test_convert.py b/tests/test_convert.py index 9a1e31e3..59acaf10 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -179,18 +179,6 @@ def test_quantize_q5_1(): "~/.cache/huggingface/hub/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f" ).expanduser() -BAICHUAN7B_MODEL_PATH = Path( - "~/.cache/huggingface/hub/models--baichuan-inc--Baichuan2-7B-Chat/snapshots/229e4eb1fab7f6aef90a2344c07085b680487597" -).expanduser() - -BAICHUAN13B_MODEL_PATH = Path( - "~/.cache/huggingface/hub/models--baichuan-inc--Baichuan-13B-Chat/snapshots/a4a558127068f2ce965aa56aeb826bf501a68970" -).expanduser() - -INTERNLM_MODEL_PATH = Path( - "~/.cache/huggingface/hub/models--internlm--internlm-chat-7b-v1_1/snapshots/1359d2199215552bd0a0cb138e21c6e97d538c0e" -).expanduser() - def make_data_embedding(): m = torch.nn.Embedding(4, 3) @@ -207,11 +195,16 @@ def make_data_linear(): x = torch.randn(2, 32) y = F.linear(x, w, b) + vec_x = x[0] + vec_y = F.linear(vec_x, w, b) + with open(HERE / "data/linear.data", "wb") as f: w.numpy().tofile(f) b.numpy().tofile(f) x.numpy().tofile(f) y.numpy().tofile(f) + vec_x.numpy().tofile(f) + vec_y.numpy().tofile(f) def make_data_layernorm(): @@ -612,8 +605,11 @@ def _forward_steps(model, seq_len): config.torch_dtype = torch.float32 m = ChatGLMModel(config).float().eval() + for param in m.parameters(): + param.data.uniform_(-0.5, 0.5) seq_len = 3 + x1, y1, x2, y2, x3, y3 = _forward_steps(m, seq_len) print(m) @@ -637,161 +633,59 @@ def _forward_steps(model, seq_len): y3.numpy().tofile(f) -def _make_data_baichuan_model(model_path, out_name): - sys.path.append(str(model_path)) - from modeling_baichuan import BaichuanModel - from transformers import AutoConfig - - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - config.hidden_size = 32 - config.num_attention_heads = 8 - config.intermediate_size = config.hidden_size * 3 - config.num_hidden_layers = 1 - config.torch_dtype = torch.float32 - config.vocab_size = 5 - - m = BaichuanModel(config).eval() - for param in m.parameters(): - param.data.uniform_(-0.5, 0.5) - - seq_len = 3 - - # self attention - x1 = torch.arange(seq_len, dtype=torch.int64)[None, :] - attn_mask = torch.ones(1, seq_len, dtype=torch.int64) - with torch.no_grad(): - out = m(x1, attention_mask=attn_mask, use_cache=True) - y1 = out.last_hidden_state - kv_cache = out.past_key_values - - # cross attention - x2 = torch.tensor([[seq_len]], dtype=torch.int64) - attn_mask = torch.ones(1, seq_len + 1, dtype=torch.int64) - with torch.no_grad(): - out = m(x2, attention_mask=attn_mask, 
past_key_values=kv_cache, use_cache=True) - y2 = out.last_hidden_state - kv_cache = out.past_key_values - - # cross attention - x3 = torch.tensor([[seq_len + 1]], dtype=torch.int64) - attn_mask = torch.ones(1, seq_len + 2, dtype=torch.int64) - with torch.no_grad(): - out = m(x3, attention_mask=attn_mask, past_key_values=kv_cache, use_cache=True) - y3 = out.last_hidden_state - kv_cache = out.past_key_values - - print(m) - - with open(HERE / f"data/{out_name}", "wb") as f: - m.embed_tokens.weight.data.numpy().tofile(f) - m.layers[0].input_layernorm.weight.data.numpy().tofile(f) - m.layers[0].self_attn.W_pack.weight.data.numpy().tofile(f) - m.layers[0].self_attn.o_proj.weight.data.numpy().tofile(f) - m.layers[0].post_attention_layernorm.weight.data.numpy().tofile(f) - m.layers[0].mlp.gate_proj.weight.data.numpy().tofile(f) - m.layers[0].mlp.down_proj.weight.data.numpy().tofile(f) - m.layers[0].mlp.up_proj.weight.data.numpy().tofile(f) - m.norm.weight.data.numpy().tofile(f) - - x1.int().numpy().tofile(f) - y1.numpy().tofile(f) - x2.int().numpy().tofile(f) - y2.numpy().tofile(f) - x3.int().numpy().tofile(f) - y3.numpy().tofile(f) - - -def make_data_baichuan7b_model(): - _make_data_baichuan_model(BAICHUAN7B_MODEL_PATH, "baichuan7b_model.data") - - -def make_data_baichuan13b_model(): - _make_data_baichuan_model(BAICHUAN13B_MODEL_PATH, "baichuan13b_model.data") - - -def make_internlm_model(): - sys.path.append(str(INTERNLM_MODEL_PATH)) - from modeling_internlm import InternLMModel - from transformers import AutoConfig - - config = AutoConfig.from_pretrained(INTERNLM_MODEL_PATH, trust_remote_code=True) - config.hidden_size = 32 - config.num_attention_heads = 8 - config.intermediate_size = config.hidden_size * 3 - config.num_hidden_layers = 1 - config.torch_dtype = torch.float32 - config.vocab_size = 5 +def make_glm4_pipeline_data(): + from transformers import AutoModelForCausalLM, AutoTokenizer - m = InternLMModel(config).float().eval() - for param in m.parameters(): - param.data.uniform_(-0.5, 0.5) + tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True) - seq_len = 3 + # tiktoken + chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL" + print("tiktoken:", tokenizer.tokenizer.encode(chktxt, disallowed_special=())) - # self attention - x1 = torch.arange(seq_len, dtype=torch.int64)[None, :] - attn_mask = torch.ones(1, seq_len, dtype=torch.int64) - position_ids = torch.arange(seq_len, dtype=torch.int64)[None, :] - with torch.no_grad(): - out = m(x1, attention_mask=attn_mask, position_ids=position_ids, use_cache=True) - y1 = out.last_hidden_state - kv_cache = out.past_key_values + # tokenizer + inputs = tokenizer("你好") + print(f"encode: {inputs=}") - # cross attention - x2 = torch.tensor([[seq_len]], dtype=torch.int64) - attn_mask = torch.ones(1, seq_len + 1, dtype=torch.int64) - position_ids = torch.tensor([[seq_len]], dtype=torch.int64) - with torch.no_grad(): - out = m(x2, attention_mask=attn_mask, position_ids=position_ids, past_key_values=kv_cache, use_cache=True) - y2 = out.last_hidden_state - kv_cache = out.past_key_values + conversation = [{"role": "user", "content": "你好"}] + inputs = tokenizer.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True, + ) + print(f"apply_chat_template: {conversation=} {inputs=}") - # cross attention - x3 = torch.tensor([[seq_len + 1]], dtype=torch.int64) - attn_mask = torch.ones(1, seq_len + 2, dtype=torch.int64) - position_ids = torch.tensor([[seq_len + 1]], dtype=torch.int64) - with torch.no_grad(): - out = m(x3, attention_mask=attn_mask, position_ids=position_ids, past_key_values=kv_cache, use_cache=True) - y3 = out.last_hidden_state - kv_cache = out.past_key_values + # round 1 + inputs = inputs.to("cuda") + model = AutoModelForCausalLM.from_pretrained( + "THUDM/glm-4-9b-chat", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + device_map="auto", + ).eval() - print(m) + outputs = model.generate(**inputs, do_sample=False) + outputs = outputs[:, inputs["input_ids"].shape[1] :] + response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() + print(f"generate: {response=}") - with open(HERE / f"data/internlm_model.data", "wb") as f: - m.embed_tokens.weight.data.numpy().tofile(f) - m.layers[0].input_layernorm.weight.data.numpy().tofile(f) - qkv_proj = torch.cat( - ( - m.layers[0].self_attn.q_proj.weight.data, - m.layers[0].self_attn.k_proj.weight.data, - m.layers[0].self_attn.v_proj.weight.data, - ), - dim=0, - ) - qkv_proj.numpy().tofile(f) - qkv_bias = torch.cat( - ( - m.layers[0].self_attn.q_proj.bias.data, - m.layers[0].self_attn.k_proj.bias.data, - m.layers[0].self_attn.v_proj.bias.data, - ), - dim=0, - ) - qkv_bias.numpy().tofile(f) - m.layers[0].self_attn.o_proj.weight.data.numpy().tofile(f) - m.layers[0].self_attn.o_proj.bias.data.numpy().tofile(f) - m.layers[0].post_attention_layernorm.weight.data.numpy().tofile(f) - m.layers[0].mlp.gate_proj.weight.data.numpy().tofile(f) - m.layers[0].mlp.up_proj.weight.data.numpy().tofile(f) - m.layers[0].mlp.down_proj.weight.data.numpy().tofile(f) - m.norm.weight.data.numpy().tofile(f) + conversation += [{"role": "assistant", "content": response}, {"role": "user", "content": "晚上睡不着应该怎么办"}] + inputs = tokenizer.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True, + ) + print(f"apply_chat_template: {conversation=} {inputs=}") - x1.int().numpy().tofile(f) - y1.numpy().tofile(f) - x2.int().numpy().tofile(f) - y2.numpy().tofile(f) - x3.int().numpy().tofile(f) - y3.numpy().tofile(f) + # round 2 + outputs = model.generate(**inputs, do_sample=False) + 
outputs = outputs[:, inputs["input_ids"].shape[1] :] + response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() + print(f"generate: {response=}") def make_glm4_pipeline_data(): @@ -874,9 +768,6 @@ def main(): # make_data_glm_model() # make_data_glm2_model() # make_data_glm3_model() - # make_data_baichuan7b_model() - # make_data_baichuan13b_model() - # make_internlm_model() # make_data_glm4_model() make_glm4_pipeline_data() diff --git a/third_party/ggml b/third_party/ggml index 6549d12f..9d562d71 160000 --- a/third_party/ggml +++ b/third_party/ggml @@ -1 +1 @@ -Subproject commit 6549d12f2e3176050040a86334f17c001e170f13 +Subproject commit 9d562d712513c77a4de44ad0428be62bc3f2a9cf From 5161c10631e84d06c21c61774da371d0bf820f04 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Sun, 16 Jun 2024 20:51:44 +0800 Subject: [PATCH 02/18] Upgrade to latest --- chatglm.cpp | 28 ++++++++++++++-------------- chatglm.h | 16 +++++++--------- chatglm_test.cpp | 18 +++++++++--------- third_party/ggml | 2 +- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/chatglm.cpp b/chatglm.cpp index 3c20bd80..1bbafa5b 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -516,7 +516,7 @@ std::string to_string(ModelType model_type) { } } -static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, +static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, RopeType rope_type, float rope_theta) { // tensor a (activation) is of shape [s, #h, d] // tensor b (position_ids) is of shape [s] @@ -527,12 +527,12 @@ static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *laye } #endif const int head_size = layer->ne[0]; - layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, n_ctx, 0, rope_theta, + layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] return layer; } -static ggml_tensor *apply_rotary_emb_glm(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx) { +static ggml_tensor *apply_rotary_emb_glm(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids) { // tensor a (activation) is of shape [s, #h, d] // tensor b (position_ids) is of shape [2 * s] ggml_context *ctx = mctx->ctx_b.get(); @@ -556,8 +556,8 @@ static ggml_tensor *apply_rotary_emb_glm(ModelContext *mctx, ggml_tensor *layer, a2_rope = ggml_cont(ctx, a2_rope); #endif - a1_rope = ggml_rope_inplace(ctx, a1_rope, b1, rope_dim, (int)RopeType::NEOX, n_ctx); // [s, #h, d/2] - a2_rope = ggml_rope_inplace(ctx, a2_rope, b2, rope_dim, (int)RopeType::NEOX, n_ctx); // [s, #h, d/2] + a1_rope = ggml_rope_inplace(ctx, a1_rope, b1, rope_dim, (int)RopeType::NEOX); // [s, #h, d/2] + a2_rope = ggml_rope_inplace(ctx, a2_rope, b2, rope_dim, (int)RopeType::NEOX); // [s, #h, d/2] #ifdef GGML_USE_CUDA a1_rope = ggml_cpy(ctx, a1_rope, a1); @@ -587,7 +587,7 @@ static ggml_tensor *apply_rotary_emb_glm2(ModelContext *mctx, ggml_tensor *layer half_layer = ggml_cont(ctx, half_layer); } ggml_tensor *roped_half_layer = - ggml_rope_ext_inplace(ctx, half_layer, position_ids, nullptr, rope_dim, (int)RopeType::GPTJ, 0, 0, rope_theta, + ggml_rope_ext_inplace(ctx, half_layer, position_ids, nullptr, rope_dim, (int)RopeType::GPTJ, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] if (!ggml_backend_is_cpu(mctx->backend.get())) { roped_half_layer = ggml_cpy(ctx, roped_half_layer, 
half_layer_view); @@ -597,14 +597,14 @@ static ggml_tensor *apply_rotary_emb_glm2(ModelContext *mctx, ggml_tensor *layer return layer; } -static ggml_tensor *apply_rotary_emb(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, int n_ctx, +static ggml_tensor *apply_rotary_emb(ModelContext *mctx, ggml_tensor *layer, ggml_tensor *position_ids, RopeType rope_type, float rope_theta) { switch (rope_type) { case RopeType::GPTJ: case RopeType::NEOX: - return apply_rotary_emb_basic(mctx, layer, position_ids, n_ctx, rope_type, rope_theta); + return apply_rotary_emb_basic(mctx, layer, position_ids, rope_type, rope_theta); case RopeType::CHATGLM: - return apply_rotary_emb_glm(mctx, layer, position_ids, n_ctx); + return apply_rotary_emb_glm(mctx, layer, position_ids); case RopeType::CHATGLM2: return apply_rotary_emb_glm2(mctx, layer, position_ids, rope_theta); case RopeType::DISABLED: @@ -615,7 +615,7 @@ static ggml_tensor *apply_rotary_emb(ModelContext *mctx, ggml_tensor *layer, ggm } ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, - ggml_tensor *position_ids, int n_past, int n_ctx) const { + ggml_tensor *position_ids, int n_past) const { ggml_context *ctx = mctx->ctx_b.get(); const int hidden_size = hidden_states->ne[0]; @@ -648,8 +648,8 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta qkv->nb[1], (hidden_size + head_size * num_kv_heads) * ggml_element_size(qkv)); } - query_layer = apply_rotary_emb(mctx, query_layer, position_ids, n_ctx, rope_type, rope_theta); - key_layer = apply_rotary_emb(mctx, key_layer, position_ids, n_ctx, rope_type, rope_theta); + query_layer = apply_rotary_emb(mctx, query_layer, position_ids, rope_type, rope_theta); + key_layer = apply_rotary_emb(mctx, key_layer, position_ids, rope_type, rope_theta); query_layer = ggml_cont(ctx, ggml_permute(ctx, query_layer, 0, 2, 1, 3)); // [#h, s, d] if (num_shared_q_heads > 1) { @@ -1063,11 +1063,11 @@ std::string ChatGLMTokenizer::postprocess(const std::string &text) { } ggml_tensor *GLMBlock::forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, - ggml_tensor *position_ids, int n_past, int n_ctx) const { + ggml_tensor *position_ids, int n_past) const { ggml_context *ctx = mctx->ctx_b.get(); ggml_tensor *attn_input = input_layernorm.forward(mctx, hidden_states); - ggml_tensor *attn_output = attention.forward(mctx, attn_input, attention_mask, position_ids, n_past, n_ctx); + ggml_tensor *attn_output = attention.forward(mctx, attn_input, attention_mask, position_ids, n_past); ggml_build_forward_expand(mctx->gf, attn_output); attn_input = ggml_scale_inplace(ctx, attn_input, alpha); hidden_states = ggml_add_inplace(ctx, attn_input, attn_output); diff --git a/chatglm.h b/chatglm.h index e0836efb..7b3dc9db 100644 --- a/chatglm.h +++ b/chatglm.h @@ -459,7 +459,7 @@ class BasicAttention { hidden_size / num_attention_heads, num_kv_heads)) {} ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, - ggml_tensor *position_ids, int n_past, int n_ctx) const; + ggml_tensor *position_ids, int n_past) const; public: int num_attention_heads; @@ -490,14 +490,12 @@ class BasicBlock { mlp(mctx, hidden_size, intermediate_size, hidden_act) {} ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, - ggml_tensor *position_ids, int n_past, - - int n_ctx) const { + ggml_tensor *position_ids, int n_past) const { ggml_context *ctx = 
mctx->ctx_b.get(); ggml_tensor *residual = hidden_states; hidden_states = input_layernorm.forward(mctx, hidden_states); - hidden_states = attention.forward(mctx, hidden_states, attention_mask, position_ids, n_past, n_ctx); + hidden_states = attention.forward(mctx, hidden_states, attention_mask, position_ids, n_past); hidden_states = ggml_add_inplace(ctx, hidden_states, residual); residual = hidden_states; @@ -556,7 +554,7 @@ class BasicModel { : word_embeddings(mctx, config.vocab_size, config.hidden_size), layers(build_layers(mctx, config)), final_layernorm(mctx, config.hidden_size) {} - ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past, int n_ctx) const { + ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past) const { ggml_context *ctx = mctx->ctx_b.get(); const int qlen = input_ids->ne[0]; @@ -576,7 +574,7 @@ class BasicModel { ggml_tensor *hidden_states = word_embeddings.forward(mctx, input_ids); for (const auto &layer : layers) { - hidden_states = layer.forward(mctx, hidden_states, attention_mask, position_ids, n_past, n_ctx); + hidden_states = layer.forward(mctx, hidden_states, attention_mask, position_ids, n_past); } hidden_states = final_layernorm.forward(mctx, hidden_states); @@ -835,7 +833,7 @@ class BasicModelForCausalLM : public BaseModelForCausalLM { public: ggml_tensor *forward(ModelContext *mctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const override { - ggml_tensor *transformer_outputs = transformer.forward(mctx, input_ids, n_past, n_ctx); + ggml_tensor *transformer_outputs = transformer.forward(mctx, input_ids, n_past); // NOTE: only compute next token logits for decoding if (is_decoding && input_ids->ne[0] > 1) { transformer_outputs = @@ -898,7 +896,7 @@ class GLMBlock : public BasicBlock { alpha(std::sqrt(2.f * 28)) {} ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask, - ggml_tensor *position_ids, int n_past, int n_ctx) const; + ggml_tensor *position_ids, int n_past) const; public: float alpha; diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 1b480851..0b7a2b9f 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -316,7 +316,7 @@ class ChatGLMTest : public ::testing::Test { // self attention { ggml_graph_clear(mctx_->gf); - ggml_tensor *out_y1 = model.forward(mctx_.get(), x1, 0, seq_len); + ggml_tensor *out_y1 = model.forward(mctx_.get(), x1, 0); ggml_build_forward_expand(mctx_->gf, out_y1); CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); set_graph_inputs(mctx_->gf, seq_len, 0, seq_len); @@ -328,7 +328,7 @@ class ChatGLMTest : public ::testing::Test { // cross attention { ggml_graph_clear(mctx_->gf); - ggml_tensor *out_y2 = model.forward(mctx_.get(), x2, seq_len, seq_len); + ggml_tensor *out_y2 = model.forward(mctx_.get(), x2, seq_len); ggml_build_forward_expand(mctx_->gf, out_y2); CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); set_graph_inputs(mctx_->gf, 1, seq_len, seq_len); @@ -338,7 +338,7 @@ class ChatGLMTest : public ::testing::Test { } { ggml_graph_clear(mctx_->gf); - ggml_tensor *out_y3 = model.forward(mctx_.get(), x3, seq_len + 1, seq_len); + ggml_tensor *out_y3 = model.forward(mctx_.get(), x3, seq_len + 1); ggml_build_forward_expand(mctx_->gf, out_y3); CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); set_graph_inputs(mctx_->gf, 1, seq_len + 1, seq_len); @@ -1202,7 +1202,7 @@ TEST(Pipeline, ChatGLM3) { { ChatMessage output = pipeline.chat(messages, gen_config); 
EXPECT_EQ(output.role, ChatMessage::ROLE_ASSISTANT); - EXPECT_EQ(output.content, "根据您的要求,我使用随机数生成器API生成了一个在0和100之间的随机数,结果为22。"); + EXPECT_EQ(output.content, "根据您的要求,我使用随机数生成器API生成了一个随机数。根据API返回的结果,生成的随机数为22。"); } } @@ -1219,9 +1219,9 @@ TEST(Pipeline, ChatGLM3) { EXPECT_EQ(output.role, ChatMessage::ROLE_ASSISTANT); EXPECT_EQ(output.content, R"(好的,我会为您列出100以内的所有质数。 -质数是指只能被1和它本身整除的正整数。例如,2、3、5、7等都是质数。 +质数是指只能被1和它本身整除的大于1的整数。例如,2、3、5、7等都是质数。 -让我们开始计算。)"); +让我们开始吧!)"); EXPECT_EQ(output.tool_calls.front().code.input, R"(```python def is_prime(n): """Check if a number is prime.""" @@ -1239,8 +1239,8 @@ def is_prime(n): return True # Get all prime numbers up to 100 -primes_up_to_100 = [i for i in range(2, 101) if is_prime(i)] -primes_up_to_100 +primes_upto_100 = [i for i in range(2, 101) if is_prime(i)] +primes_upto_100 ```)"); messages.emplace_back(std::move(output)); } @@ -1431,7 +1431,7 @@ TEST(Pipeline, CodeGeeX2) { def bubble_sort(list): for i in range(len(list) - 1): - for j in range(len(list) - 1): + for j in range(len(list) - 1 - i): if list[j] > list[j + 1]: list[j], list[j + 1] = list[j + 1], list[j] return list diff --git a/third_party/ggml b/third_party/ggml index 9d562d71..f4ee2394 160000 --- a/third_party/ggml +++ b/third_party/ggml @@ -1 +1 @@ -Subproject commit 9d562d712513c77a4de44ad0428be62bc3f2a9cf +Subproject commit f4ee239462c7ad59e0a39f0e96a0e6099c5a9452 From 64db1ef5389a0a2564a42acdcb12ec713d66de11 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 16:00:28 +0800 Subject: [PATCH 03/18] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 89ce8aea..95844a57 100644 --- a/README.md +++ b/README.md @@ -224,9 +224,9 @@ OpenBLAS provides acceleration on CPU. Add the CMake flag `-DGGML_OPENBLAS=ON` t cmake -B build -DGGML_OPENBLAS=ON && cmake --build build -j ``` -**cuBLAS** +**CUDA** -cuBLAS uses NVIDIA GPU to accelerate BLAS. Add the CMake flag `-DGGML_CUDA=ON` to enable it. +CUDA accelerates model inference on NVIDIA GPU. Add the CMake flag `-DGGML_CUDA=ON` to enable it. ```sh cmake -B build -DGGML_CUDA=ON && cmake --build build -j ``` @@ -257,7 +257,7 @@ Install from PyPI (recommended): will trigger compilation on your platform. 
pip install -U chatglm-cpp ``` -To enable cuBLAS acceleration on NVIDIA GPU: +To enable CUDA on NVIDIA GPU: ```sh CMAKE_ARGS="-DGGML_CUDA=ON" pip install -U chatglm-cpp ``` From 5f76d827ecba4f645bf6ede9aa96781a29eb0367 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 16:14:59 +0800 Subject: [PATCH 04/18] Fix pytest --- CMakeLists.txt | 2 +- tests/test_chatglm_cpp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11aaa3db..581223a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,7 +137,7 @@ add_custom_target(check-all COMMAND cmake --build build -j COMMAND ./build/bin/chatglm_test COMMAND python3 setup.py develop - COMMAND python3 -m pytest tests/test_chatglm_cpp.py -v + COMMAND python3 -m pytest tests/test_chatglm_cpp.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} ) diff --git a/tests/test_chatglm_cpp.py b/tests/test_chatglm_cpp.py index e4b42c85..482c8604 100644 --- a/tests/test_chatglm_cpp.py +++ b/tests/test_chatglm_cpp.py @@ -87,7 +87,7 @@ def test_codegeex2_pipeline(): def bubble_sort(list): for i in range(len(list) - 1): - for j in range(len(list) - 1): + for j in range(len(list) - 1 - i): if list[j] > list[j + 1]: list[j], list[j + 1] = list[j + 1], list[j] return list From 75b0da35874816ebe697aa6cc1bae64ee339eb49 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 16:17:14 +0800 Subject: [PATCH 05/18] upgrade ggml --- third_party/ggml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ggml b/third_party/ggml index f4ee2394..5653a195 160000 --- a/third_party/ggml +++ b/third_party/ggml @@ -1 +1 @@ -Subproject commit f4ee239462c7ad59e0a39f0e96a0e6099c5a9452 +Subproject commit 5653a195935ea3ac54652644c9daf154dbc1571b From 9048d01807456972ec3c23a010c4ed1f75f3bc33 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 16:27:57 +0800 Subject: [PATCH 06/18] ggml f4ee239 --- third_party/ggml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ggml b/third_party/ggml index 5653a195..f4ee2394 160000 --- a/third_party/ggml +++ b/third_party/ggml @@ -1 +1 @@ -Subproject commit 5653a195935ea3ac54652644c9daf154dbc1571b +Subproject commit f4ee239462c7ad59e0a39f0e96a0e6099c5a9452 From fc2c102b079f1b7995398e9dc11769d17ef4038d Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 19:20:51 +0800 Subject: [PATCH 07/18] update linear data --- tests/data/linear.data | Bin 2688 -> 9472 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/data/linear.data b/tests/data/linear.data index 9ae4c1b5011620a3fe97432f6508d9c1749cba95..8cb7042d2d5c75466d6fd4e40b600f5cbd3d588a 100644 GIT binary patch delta 7167 zcmc)Phd-BnxCd|zE2A=tA{vV98Sd|OM}v}3(U!EdC}pJjSy>q&yOO<<)x`b1e)K5H z6H?N8N+m*+R2n$W>pbUm&R=lOzi_|apX>TwpW9#7e^n_f%A5QqifHQl;nB0M(Dt|< zHf82ewI8ERx@10Qch;ev&-tZus{e$x6%E#pS$WaZT~kuSk242#hj>jJU6 z#E3_>oS1}er>Eg!g#)LJZ|x=`m%R(}lq<1e zED7drNueFzrP!5B0okL?6JvM!rDIUNGHdnS06#2^hX_s_8|v(!Uqpaa8+=1Kl~?rI z+(Ej(GM51O;#CV9Xt^VW*`p5R%B#&}&AMYCP-q7B%PMhG|3v=$b%r#xMi~sW+VO>r zJ=pr*qZpP2>a~;LwMRLb@GKYu4j#vo5(glcXZed%zV(AU6L~PnBmv(%m&LUOga+)o zgg?WRVCH-)Dj-*cr_Y8!!qFg1(~GCB5q?aXP8{TkXur4Zd4C;MEDca*l6P4RKpdhJ_H|)pWM*Va-4g4FY?X=meHGD29RO$ z%Y0kOW}Nt06eQ|}(9XXQrSH{&yKxIy#+$^i$`FE6UNfQK?`B#!*hMZT*VEI#Mc4-m z*3mD#22L$zm~5SE2&21nm`S5==nGj*Y>EE|W||(s^((@#)8qhs+Hx89CQHJd-6~|% zD+p$6y2yhw_LnQQ(!21@YF(J-oC0gY^I@Y?IXw}R3QFzsas7QEklpKv3++Yu7YFYU z_I?Xhlx0x*O)JP53qq&fVY<6`l%QWY**b-TioEN@^LrCS^vHwt^Ov~aTp91)YplxC 
z*bUli|5UvZT*CCNm4RO|_c6+sCx@l(A33*JeK0haMB77msQnT{ZtEmL$UScXf*6F$ zB`%|BdpGB$GaGMg-wZO3l*r5IC=iy9g#9Dap$AvM?JLDN@Yxc7O>v}(cSEVa%1UJB zTmy+X!atiWO`Fbd!b7K$Y0h#HR;EG#Hq3BEr=l^~#oP4^Pm3(ZgJTw$HuW1eC&XY~ zpaSk`R^V^DvmXTByoJTF87LSp2fUxVcVKc* zFuFvh;j3~@>NrgnzIA+ov*{7!uO3tQ{X7e;U2oF)y(c*JJw+fG5sIEek4e|{l_(-r zUp@HX2+nOyW8QCA$@)gGMQ`sYyrjMa!yBxjCI2dztwmxY>5m?6ji$~wg=hq{GI<*~ zI#vxB&Uz2`Ph*v{a{S4I0wtdreoc4A%0)hJu<(eoEh-j zf+tcVX?fpUVjOlHj+`ol+dq~=-({X9t8Svqx0v+|b??}50n-UqbL|k|HT? znGAcr$l)qEe@M*8$2$Qx%^G%0hhzEIalK9sT|YdRICpZGTkileDuLj1EgLR@AUi>R zfZ5X^PZy={0ykkxocQA}Oh5P@BgFK;T5mQ#BdWlB%Q^vC#%p^DABv~5l>^7nwc7~H zwKjtFk{D(RG@9-2u0YqUA*MBq_C( ztRYUM={ZgFwu4@J4Hr5#atnM?m~G1PkT|uGR5qxhKhzGI?nqJ_sHQIJJ+!GJ2mknwC&_aNx4sL31(G3{8@z{vh-$$rS7GSddY$~Z zAOndDCeVfQ0btUYpXyUd# zgPvReH&yA}N>+zv!`08hSUA508_$cvwD<_f`1~A=??m9o)=5-$ixyrf%)@=ci{Qlg z*NAHE0q3}n#O^^P5uav5I=o&p^q~{(>EkKDV17Q7$c4iqmkVe!G6$~Q41kP08`LQg zpnjs7@S$-lc#L$y_7Ohzj+|#SZb|Va14Mxzs0FV>5&eVwFu&Uvyj(j-cl%V#9$o|; z7lioI=0bJx+d5E0zo<<2@z}b;^aox)=B*b2ue?joExznH({a|2) zRue?|Z(bLY7hDWB{ZN9PTXx_Xsdw0TY%Q+qtH!yt>QH5*0YzgeTn1T4wpn$U%eKe> zuO!`r1S}4RGtULlH`fI1r~`EtTZ_ispNNu2J?Tm=LNy-?P>nN%b63K+ZEGdr%i?Qz z{@h#Uu*x*1J7gNVc<%=dnH1vVWJsRMmD2c>{iHeKHId&CL7ob&p|M*6LD%OAjaT_o zEjaZg3Y)JbLVrId>!nk9c$6nXH!ew}7lrzX=ox9?@(*&kLkS?ay%zTrTQI|a7FK)S zz6mEg{YgwN1KQ^zQ87S?I7?PwpNbljE*yihL;65ptE2HQ0itcxMDhyM(9nMagNj#4 zZqO#0D^U;U&GuofPd#=$Tfx{mo`x5Z!mQT}o*CWrdIf%Uc?Zwi1aM*XUgr2(aojDd zi>iHUnDs#y>{JJ+&L0iBUc&qhNrZ@qz>g+DG^l#O zEZ8x?RlcQUeMKPlYeYcH-*MpYsY^Zm0-5y4OcWDTMxFbfw7={IBf;Z+fJ+l6^L@T% zlV1;FQS0j^SmRoYq%Z~e-S23$-)iQ4tuB?^vK zWUIR@-pa`(7d%DCi`p?J#SgJ$Q-b;7!BX-mYs~!DSR+Xtc}9|Z9DycWf*VRg5VHI+ z7CVarPy0_f&j0-f+1iWHTquUG7e;ftj+sH{=b1EJ$PgcH86YiJqsf^cg<#bY4>ywc za0$LINo&nvTz!>~-AS#C+?`tRQLQ4gyKcbi@Jcdz<|*vh8UQB^lp(@IiuFrufXv^Q z=xa{|_rw0MXY3|E4bfqiWjx}cn{O5FSYAQCy>I6vV;MBiJ`ZvHTOhw^H3{>I!Wlaz z@+%n;R%@F*4Sha9{HLXYTG(a6^xcO>Q5Vn^&|wYq1##sCDZY$~Hwx|_#1_Re>Qbi! 
z3dXwNbc4XK)L{&A(8dZY6>uLIhVu5iWSQqqcoo5m!oX!I=;B!c#p$C&WU3aVzmI}I zFI(tTnoYH0BB`i$90>{W2M5C>EZoorBhj6B;GZ1)l4^@y8N%#eJ~zpNo2BH41fguT zDm^in29;Z5Fyd1W&G*kiS>WSdUKH(i;y`qVnMeM4(~(e7s?h2+`g5 z2zW%{$mxxs*fcDHfUr_c-MjN52$rU11czJ)Vnc_XHsxAS0kk`Sgn664A(tD&S@iVbf} zh4kH9n4B_>x>tmfxSeWb+g=ufrpA$F%v19FY%&br$l`hxQlUWnE2wO^3j&uNN$hr0 zhPk|e9E)@3eAlhSSspn=|N95os#6?=btP{uNvhB$%WmLpB}HBw}VNApg&2YGt&Q zJn?L$yx{FUC>=M2OY^8AU4&mP_4xy>ZxrVjNLGWxjHfsuBp!SMx}YMq9^EHbK;WxL z64h8pn*uaZSRx1V_lyx$Z(+Q!u;tZK5Q&W!;B8uU}HX)3s!i;zqnwEyD$Tt!EZq z@q$IF%kjBxA}q9NM7051NM)apE1UPyFE_Gjwe5TSag8@X6xCeO>u>=lal{TDYiP17 zSrHT&Lzp1<6t^r<=1=~eh8taj=$vQe5L9rBiOeZsR_eZ|PnO>#Upxko-@h3j)r3K8 z)nAORhBo?=Mj|>nn0&mynwT4^QfFNTrrG*a^#!YFrQs;nGAc;6%!0asX|$L3Obd5y zE(0&qJZ?X#U*p4HQ1Mq{!dwNN0 zDXy9p2M*_rke6{$4~k}{;m{{Zn$@#`?hc8CN$;$%dy^H62K(Y`hg8V5F(+yDQRH{h zW4bQ%AUSZvjC6E7q3JU6e3@exuSdAaO&x1{9-e{#pVr>(4vD(vKbGBNBx6MVI2;?C2uUX+Nx4)8$^Z-JTzAktU*>?e>tE>VsfRXGHj?~b zVlcQm7~{jgp@~5t$&Gx3otgj86Y)a)G@)NGko<+d@~eUU%XuHTC#}w~;gShc$Nr$P zeIMY?^#HoP`7)Hv76#2Z2$PO3BYRA(X=2la715Yl1kj$BEFz zo1li@*$jB7SOeVIOm50Od)TMw2udEeX+~*0`V>sXi%Sf_UEncyDE~a}K3jy{w+3)S z*Cni6JdxcIE=C4c3SfSZE!oLwBqzuJTxdirFtoPvmJFi~q#wXC?qM zm~QrIdnP>lTaeX1^a6R?4o0GhZy0`?+(CF9GtojyfR^~}4-#n| z;HAUjwi8Q0DJd5(M9Gj9rTfV09YJW6)DOB(v{?gJACmJR3Cp+5hes>*ad+DIUN{^L zSvH~YJwAapnSptvH`R^OB$onLiX)1xF!7IEMc9xpj=fca8C{(Kdj1vQM z@#=sjjwKDzy&)HIfBOh^=?sNamg$f@Q39tg6+~^@IuJQ1%>?MW;NGqiX!PhQ^m`p8 znYQ&%$1C4OOV8*N^$DfS<9&C@*6)c}GWk9vsc)cH_6<`HNquhQ)F)2M%m%WYe5eHd zN{>3$(&K7o)fvKFv|m09RIc*KyTi`h0Yy6qRXvW&cc#H$v=jz-XENujWAKIHb>z)F zKxaCrkQ09;p(Dw`#Z|6mZo8HvFXg8?`*P+_T#ORzE2lQli9SY~kAK5pAuUu0p3A>w zas{&oe*!t~2t(Jefz#*l>6WYo-MKr_cvCGI%lS>~7;Mck}z{fnX_qleyOTlcui>hdm|kG~PZ5!|lsxyL1Gx zP_)MW9R#w3PN99{`|7O+#W`7hKKf?{W6-=aw0^uz$y}_DX4#RDO^2bBpNjHLyg$`9 zg%zMG?G^_8`J1zlInGHqEha~N)AURDI<8)N7)>r&#T z-A~pp6aZI^xmZ@P3ZpI?fvgROt5^6_Kani1?6f=$@{NG^*B_eE16}lI+;n#KF*m$l zJP{=1MIa^OI9l+Y_HiYXC&TgKkDRBDESlUHX7bgvV6{XAJz4q|vNyOxi?SY^eeo4+ zy6UTcOQ^tIyC^E=sbT&mqy4UK*b(RWhkXr#z5W|m<9gxdzgaQ084 zsC@_p7D!`%hbidNL+InBN6dY?nJsZ;$a`97!gs#z!5u6Uh6tOx80$1jMwHq>a$N=; zeVjyj*KW|FhXu^l`;*c6V-dH?Di88>O<>Y_JL)^*Jk*XKrt?@;aLVMcCFK=#9*>}Y z29faIuno1oUxNwKhPdp43)c0@VXurMPWs$~jX%DUjg$T;30TpAp(y&%Toy!olZdOS6n0B*fnew7T-GOf)EjR}C3~iU;_W7=S{2MSu2e?7 zdD|hwHjb{SP=E;kBu2N>22Yva<4V>3!fKs6G%@7~mdrT|a}TIPvfoZvwP=8dYtBdS zYF-EJ)Xbqqn@;0QQ3tfG&7vRvzC@iQT1lPhZJaZjh}Voa;90htyj$RaLE~4Av_=tp z+h;_cI9#lbPn?L#4U?JG8lmW;ypEbR`#{BEHGV)w6DN_p6?TQDkCze4#~TqRjOgiv zT_elL&JjbpB_f*)MSX@EUbR1%_HT#kVjbL$$++?PdU!KI8vaQLB|a`e5b4fQ`e-Q( z=Vp;av%g^2*B}AaEf=NF=UEaxat$gnH9nXrj%5$e(mRhk!TPuXMr|yl$A%|yOd7v z$Y5q?HIw3zURzLVj7FKJt< zETj5s0$JETy~a4_0c&UZ8FYD$uKdxvBK)_(0<7~qfu)(O9gCu;a8bcGzJ*9O`?Xi9 zW*!{ISQUesqhb0rnC8^ebxAW;_AqJYT!pEzTwBb7h0-QyC1$Q4gv4Vb%vcDEwfaupp#wP zQ^21UV>i#vb{9gmJ{x#q^ZoqZi mz|L_46YBrRz@u>kGUEo0{A-}#Ujq}~=&*(V%Yga62L1#7=+1Ef delta 206 zcmZqhY7pJvz**mvBIFR|Jng_9?#2UCm{%RpndEVRefsSK)7YgQG!q36>hbY9s+<0` zf7UD**4tSStVk From c3bd6054e4ccfa84f6323f3e96a9aa4b35f8a04a Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 19:29:24 +0800 Subject: [PATCH 08/18] update ut --- chatglm_test.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 0b7a2b9f..7ac6bf96 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -381,13 +381,13 @@ TEST_F(ChatGLMTest, Linear) { std::ifstream ifs(test_path, std::ios::binary); ASSERT_TRUE(ifs) << "cannot open file " << test_path; - ggml_tensor *w = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32, 16); - ggml_tensor *b = 
ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16); - ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32, 2); - ggml_tensor *ref = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16, 2); + ggml_tensor *w = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 32); + ggml_tensor *b = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64, 2); + ggml_tensor *ref = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32, 2); - ggml_tensor *vec_x = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32); - ggml_tensor *vec_ref = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 16); + ggml_tensor *vec_x = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 64); + ggml_tensor *vec_ref = ggml_new_tensor_1d(mctx_->ctx_b.get(), GGML_TYPE_F32, 32); auto buf_b = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_b.get(), mctx_->backend.get())); @@ -405,7 +405,7 @@ TEST_F(ChatGLMTest, Linear) { ggml_tensor *x; ggml_tensor *ref; }; - std::vector cases{{x, ref}}; + std::vector cases{{x, ref}, {vec_x, vec_ref}}; struct TestConfig { ggml_type dtype; @@ -413,14 +413,14 @@ TEST_F(ChatGLMTest, Linear) { float rtol; }; std::vector test_configs{ - {GGML_TYPE_F32, 1e-5, 0}, {GGML_TYPE_F16, 1e-2, 5e-4}, {GGML_TYPE_Q8_0, 0.15, 5e-4}, - {GGML_TYPE_Q5_0, 0.8, 0.1}, {GGML_TYPE_Q5_1, 0.8, 0.1}, {GGML_TYPE_Q4_1, 1.0, 0.2}, - {GGML_TYPE_Q4_0, 1.0, 0.2}, + {GGML_TYPE_F32, 1e-5, 0}, {GGML_TYPE_F16, 1e-2, 5e-4}, {GGML_TYPE_Q8_0, 0.2, 5e-4}, + {GGML_TYPE_Q5_0, 1.5, 0.1}, {GGML_TYPE_Q5_1, 1.5, 0.1}, {GGML_TYPE_Q4_1, 2.0, 0.2}, + {GGML_TYPE_Q4_0, 2.0, 0.2}, }; for (const auto &config : test_configs) { mctx_->dtype = config.dtype; - Linear model(mctx_.get(), 32, 16); + Linear model(mctx_.get(), 64, 32); auto buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); From da38976d6e34ad765df731663fff69287b4b58ce Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jun 2024 19:47:57 +0800 Subject: [PATCH 09/18] Update glm4 data --- tests/data/glm4_model.data | Bin 30548 -> 30548 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/data/glm4_model.data b/tests/data/glm4_model.data index 4ba3b4830def51be46aad485e3a576f506ad6caf..a34ce05a468aeb94b54348f91f36c801e30e67b1 100644 GIT binary patch literal 30548 zcmW)nfnSX2_x=YhZQBw;Z0w*-O9-t`YVKnYn}m>rFiQv_ZEOgmq9hp!Nivcom60Sh z_hlp*l_W`$43Z>Ck|gzge!o9pUNdt)&wb8y-q&?RE?cvgyBwhYS4$vYqek=i>8PLe zGfC$TQx~=6llRTPxYN!al$<>X-VXP8=8@h6~7 z8AO{7gpqjiUM}^Bk%;U+Fww(Ua(M8Cj{kiJ^=Iu6WiykCe=DWhJ5@C7)l2frFJP@P zRZy&vu;dX+ghe)hd|y*zLJoB->x6#NO{SG6^UQ!mZn|PU!m91b`s@i*Ua_P8+xC+C zwCm)sWimOg_yJMx1lT0(A(I7V?D*BYsP#OGrqV`o*^^8vT7l9^5&WKihMfAU{j?gy z{dWgq)l0SV=1JC^G7nL2Pms;MQ}CQwOU0WnlgG;^2zl1Q?z%N}lr8R~2kqUIL`nC2G;s>(m%B zpUab1Ni-=|+=xA3vb;FT+TBFmkH%51;b)Q!Hs)(u-$0t>gQB^qB--GTUFp9YT}S6a z6!{VLM&@tsgBC|?Lw(D!4nP|Y{hQ1`5C(f8S>MhxLO`_BuZ{gc|mKqm$aqq((FwQ6> zmls7);?!4p)7kw>(Q${G6Pc0z#KQB}A(R9dC zGbD;_=?JM0gO%kmSbrNsxrd@4${Md$&P|r+bw5vSfwy3P^D-m}ktiE;l?s2cfv({R zDa|I4G-)prefT$9e`7wE?vOL9NAI|Z)?{lg1h6R5V#nzHD8F7r`)@sfxL04Up&w+_ zY(tvHwG#7+=dg_1Mfz?TNObLuLhUHn#Gl~uiC688n?EA?r9DjS^oFcX#!z1QS-8Zl zCbO^Ui0O2L^hAGZ?D;1(_0*6>i5Wzt^K5q%1k#eDUoo}c4YIoO2%_Sbb~=+(Zcr-x zhDE`wN%e?ollsB6{RD!RGSp=Mfa&11B!06M#brrYx$`LITOB2{h+uN|&!zH=NtD|@ z5~Y26A!_OXs`4%&-@kq#Q>hQyQ@kMUa>&`ix@UH}d##+l3lkUf8G{z#?@r}&!3cE*GVRKgUHS73|Wq!faXhEN$j$Lg@iS8$GHl)oIcNY$27;R7esHuocOewM`ONeyb}l+var2S~oHgN`rU zOAc*`aJcZ8#KS8k@d30bI 
z^yWNwbzD#M=DZ+|-kg`k(U0?*IJ$y!E?CxnZmZqHI-u>U?m7#430Gdv0p1_Za})WH zap62)kvH-0DbakcsQ0BaD)n-@eqjrRb$m@yVu$gt$GF&KeCFMh-6fUGyQ4bu?kf)X zy(sSe;rPM$j0?wK{KP-zUHFHnha9CQ9_BkeZ}eD^KfHhM>isEtbY7QT^ys`Ljvk$N z#nGel6>;?Fd?e0#Qa&o(m9B~NXFl+?Jfrz?uGYKYrnE^-{b=v;;&YX{+0U!KmJc?0 zK2zGw_ULfh-|#s?@H?b-&gBc&#(3PTd|%f2%lV3z^QzkK>3nwPANQl$r=x${j_(_Y>z;aqcJ1>*Cx`oVUbxH-3KZHod;VW?0|p^STv{ z=lw!E^bp#k_mbkZk=|*4Zqk0U{*zBU9$ooIK5^cYKjahV199?+^W10RdXrC_7sSaY z&dcJQBRj8&KV1s}S9Pbw^g4vNa*o`VU3A}2oje6!TZfGA(SAnj9LlhTI7h~@A8B7v z->khoZSQ%&{UuNJ|1qxPkL7Pgb@G(+u{iTxUXK08yD{JJc~0FX zseSldfOGSCjqm+Jd+dky;*O^96D8 zp7W+S`W<|I{rkGUIP>kO&ilIFPw(3BTsg#VtK$7jNy)jgkH0E=o8sgl=N)nKkn_Gc`k4KE^!Kq0eK?=FF1AM>&Wqyc!+Av!i-x>d2X(EAjSJ1~xl2<}`dQzefAAc-+=)-wM@kSrc z7sSzr^QJiZaNZF|AI|&Y=)?I?ocO_c?k)b)#(E+DqBr{SJiznsy!sQ|>3tXCy7v5Q z;|!%+@@Cfi=U|UKIGtyAw>|Tk_IKJo_m#r+F)lyVKAKY=<-X#)s`kX0`wriWXo>$s z_GV>|eOv0q@%~G^uJ+`0=RI-ut@D96`__5xhBzMk)_Fmkee1j|&c1bC6W`tS_50!U zdTn$cZEHO5H?5y=A4R`8#d{nRcvCRvFeIyiW*ah%3*tb=gJtZRwVM1m}4T`~PkC zH}JL2p^=Y4VZq4S|Qx^zDC z&vE|f(s@xFT{^Fbqf6%t;^@+OQyg77?}(#I=Y4VXp!S~jS zUx*xi4cf6zf4--~`z-Xy_t+QIe`5Mv%Oe@*Hp|){p$uKo`~1yu{Jk-tX;95N2XcF~ z_xJlUmkaGfn@~=l6O1(B8fv}gYezZv~KhygVK`^R}x^JD)w?})R1ocG1qKhB5Z z>>uYdUyS~;f1DS^ncw19(r-UDvn;9kHyi8^I7?WUYyT~qZjA%;^?JrdRemh zE~<`R>O0U&_+C%Y%PCX&#K-T*9(r*;7Dq46XK#(`fnJ=?iK7?i^Wx~m`Jy;_alRyu zK1TYy{%INdaK5Vk=)?I~9DO*S)p+Q``J8yN1nE^jFP5c3EYqYPepS8q!x+!5`k^bg zJ1>r|oYS8$WPKkf=;PF6_)zZ3_kp}Uy7BQ> zzc1x%z5}%<&vbW?XTtY^f-cTUN}gMM{JiX=7w0u`^y0iBPM&ez7Dq46d*bND`9K`K zIM01Ot}ps1d?VHm%+QDPvihSB=QVNk;k+S^KAgA3(TDS%IQnot5NA4B=Y?UN!}oJS zxhvn#@%F60kDpUKS%2pZan|2?Tb%WG-V zRd<~v;Y+WMo-e#VoaZ0%6XU{pP9l%tAKz2yYy6H%4V$iC_?}8wpVuTMb{G$PjEh~y zXWqrz<35;E$-FD7Gw%hBkKeQ6-XD%1jL*1m{KZfFW8OX4Z>!9jH+uB}h*+IPGsZHw}J=b0N+-d-8WL4Y;avn)9`O&PQthzwICQljJXt zrbopGYry=sW1Z&>=PhyWC(gU#+)tdZh;u)2J`(4C;yk|^{o{V(yd=K6@$>Vu>GcgZ z!}?C2yVW(G_Y3XNLuilQSJZwZz0*E?k7Tp{lTSPzRgFhJabDBS=l2V&gaC5hx2)H;^BNzoOn2266bwNUVndI(^@M+Ji@=* z63X!JvTU|J``X7}*7)pe=X2kQ@n&B;pBHCeJ6{xMUprqCXJ0#C7GK-9ipY(p#_#*j z-4p#gRC*|WH`ZBi=M{0*+xdbx>+QTL&U!oVh_l|#`{Lw>`s(`ku`@QGq1vgo=)rkY96dPih@%JReR1^Qd?=0{oX;p9qKC2cK|A6a z;y^zAg(M!Ni(Sn(#K-ejNBM;K{0x_W2kbcSE&WGxcex1+A9{P1&6i2_# zE8^(a`GPq5b>0*w&bd3+>$fk?d^@V6-_n+iKOYF|6~3<#%44SV>*KG=9_!=0cwdY+ z>*Ks4j{k7#P4Rp6T)m_^^U?C8_{~F_cWXAe9UsZqhyYsO)>*IX(`*A$h$N8K%>*IW0 zob_?OD9&_tls`{Zyuh)4f{=-&|-NLaYYf6j**k9atrkv-z!yeLjQoL9t&hw}w- z;^Dk0j{bJ^ycRx>2>LrSDL305J^J_^jgKCk7o|V)iSvp$_h;t|;@qE|H^sLqKRWM- zbAMjedeV-49^!&t!udP$*j0TIpP|NMzb$Dz;^VwBjC$NB88cjP{A!A?<$u+&AA3Fi z|Au&f?5^d1?cY_|>8lL$CqISxvtC0RZ(05E4?a-*=N{ZR-*B$P{e*tlpaN<` z)Aqy}f85`-^Sd9!bvjA|mQ_Dr>)10sQl0sBABz27h{HbLGtYbN^#5lw?a!8^68xHe z4j#Sl_RM&P@9%{6M6g7yrwvOa$a?O!1K;d>z5Cq@s)`5l9SgoTUy zg!8=GbDwZt66ZeQyeiIp!g*bs`-JnBIQI$XU2&$ftMS-ud-9-ACMf?a3EzuO&{ta87^nh4b7aQ6HO)PxSKfKlyy2 zE_<#w@*?AN-pc0-ZS}*V>zU6Nx@Ny%{?|9TUw60tM)`^T8u9`8srGL%zDFy-CGB_e zlk=9^6EE@?fB#`c`~|X?mp%4-`_VYwi^RKX&wh8F8^!kQcjpCh_Pg`4IQ!jsO`QGi zydlnhcit99pPOAT&#Tk=+9)3nG@kd{(|nBH%UYj}^p1XxOPt*7e92cHznKa&_C^1&#W@68_zi*u6({QkX`o4 z!1f)Sb;!EUTqpaX4Clx^SK!aF7|;t;ch%mWw)Z^c{*tF=ejIghoCa+EDAvhS&gazL z=X*R3Yrf>A&QGGf6OEIXoG-~fdCB>*IC;tWsyKPc`Bbru5oK zs^Mq)YN$f#%wMOzt_;4YBd@EvR`5Dy0O9xW6Rt-3=jO|_+XmwYd;Ekp;rDPg(myvY z1KSZ-*R>+F4L`Z5@k8}zu0(p%S7W-u@5wrC_&R)N<<5NAD>1Nl^8b?lU;g|LyBms_ From f0daf0108a0ccc846ca708f5eb4d46203d0c2cd1 Mon Sep 17 00:00:00 2001 From: lijiahao Date: Thu, 20 Jun 2024 09:08:23 +0800 Subject: [PATCH 10/18] update ut --- chatglm_test.cpp | 113 ++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 46 deletions(-) diff --git a/chatglm_test.cpp 
b/chatglm_test.cpp index 7ac6bf96..af8ee5f6 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef GGML_USE_CUDA #include @@ -31,6 +32,7 @@ static inline void expect_all_close(ggml_tensor *a, ggml_tensor *b, float atol = for (int64_t i = 0; i < numel; i++) { float ai = a_buf[i]; float bi = b_buf[i]; + EXPECT_TRUE(std::isfinite(ai) && std::isfinite(bi)); float abs_diff = std::abs(ai - bi); max_abs_diff = std::max(max_abs_diff, abs_diff); if (abs_diff >= atol + rtol * std::abs(bi)) { @@ -54,33 +56,55 @@ static inline void read_backend_tensor_data(std::istream &is, ggml_tensor *tenso ggml_backend_tensor_set(tensor, buf.data(), 0, buf.size()); } +static inline void _fill(ggml_tensor *tensor, const std::vector &values) { + switch (tensor->type) { + case GGML_TYPE_F32: { + ggml_backend_tensor_set(tensor, values.data(), 0, sizeof(float) * values.size()); + } break; + case GGML_TYPE_F16: { + std::vector fp16_buf(values.size()); + ggml_fp32_to_fp16_row(values.data(), fp16_buf.data(), fp16_buf.size()); + ggml_backend_tensor_set(tensor, fp16_buf.data(), 0, fp16_buf.size()); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: { + std::vector> q_buf(ggml_nbytes(tensor)); + ggml_quantize_chunk(tensor->type, values.data(), q_buf.data(), 0, ggml_nelements(tensor) / tensor->ne[0], tensor->ne[0], nullptr); + ggml_backend_tensor_set(tensor, q_buf.data(), 0, ggml_nbytes(tensor)); + } break; + default: + CHATGLM_THROW << "unsupported dtype " << tensor->type; + } +} + static inline float random() { return rand() / (float)RAND_MAX; } static inline float random(float lo, float hi) { return lo + random() * (hi - lo); } -static inline void random_fill(ggml_tensor *tensor) { +static inline void random_(ggml_tensor *tensor) { std::vector values(ggml_nelements(tensor)); for (float &v : values) { v = random(); } + _fill(tensor, values); +} - if (tensor->type == GGML_TYPE_F32) { - ggml_backend_tensor_set(tensor, values.data(), 0, sizeof(float) * values.size()); - } else { - CHATGLM_THROW << "unsupported dtype " << tensor->type; - } +static inline float randn() { + thread_local std::random_device rd{}; + thread_local std::mt19937 gen{rd()}; + std::normal_distribution d; + return d(gen); +} - // if (tensor->type == GGML_TYPE_F16) { - // ggml_fp32_to_fp16_row(values.data(), (ggml_fp16_t *)tensor->data, values.size()); - // } else if (tensor->type == GGML_TYPE_Q8_0) { - // ggml_quantize_q8_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); - // } else if (tensor->type == GGML_TYPE_Q4_0) { - // ggml_quantize_q4_0(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); - // } else if (tensor->type == GGML_TYPE_Q4_1) { - // ggml_quantize_q4_1(values.data(), tensor->data, ggml_nelements(tensor), tensor->ne[0], hist); - // } else { - // CHATGLM_THROW << "unsupported dtype " << ggml_type_name(tensor->type); - // } +static inline void randn_(ggml_tensor *tensor) { + std::vector values(ggml_nelements(tensor)); + for (float &v : values) { + v = randn(); + } + _fill(tensor, values); } // return elapsed time in milliseconds @@ -105,18 +129,6 @@ static inline float timeit(std::function fn, int warmup, int active) { return elapsed_ms / active; } -static bool equal(const std::vector &a, const std::vector &b) { - if (a.size() != b.size()) { - return false; - } - for (size_t i = 0; i < a.size(); i++) { - if (a[i] != b[i]) { - return false; - } - } - return true; -} 
- static std::vector extract_sorted_ids(std::vector &token_scores) { std::vector token_ids(token_scores.size()); for (size_t i = 0; i < token_scores.size(); i++) { @@ -199,7 +211,7 @@ TEST(Sampling, TopK) { token_scores.resize(top_k); // sort & compare - EXPECT_TRUE(equal(extract_sorted_ids(token_scores), extract_sorted_ids(target))); + EXPECT_EQ(extract_sorted_ids(token_scores), extract_sorted_ids(target)); } static void reference_top_p(std::vector &token_scores, float top_p) { @@ -236,7 +248,7 @@ TEST(Sampling, TopP) { // sort & compare auto output_ids = extract_sorted_ids(token_scores); auto target_ids = extract_sorted_ids(target); - EXPECT_TRUE(equal(output_ids, target_ids)) << "size " << output_ids.size() << " vs " << target_ids.size(); + EXPECT_EQ(output_ids, target_ids); } } @@ -468,20 +480,28 @@ TEST_F(ChatGLMTest, Linear) { TEST_F(ChatGLMTest, BenchmarkLinear) { constexpr int M = 64, N = 1024, K = 1024 * 3; - mctx_->dtype = GGML_TYPE_F32; - Linear m(mctx_.get(), K, N); - ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, K, M); + std::vector dtypes { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0}; + for (ggml_type dtype : dtypes) { + mctx_ = std::make_unique(dtype); - ggml_tensor *y = m.forward(mctx_.get(), x); - ggml_build_forward_expand(mctx_->gf, y); - CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); + Linear m(mctx_.get(), K, N); + ggml_tensor *x = ggml_new_tensor_2d(mctx_->ctx_b.get(), GGML_TYPE_F32, K, M); - std::vector all_tensors{m.weight, m.bias, x}; - for (auto tensor : all_tensors) { - random_fill(tensor); - } + ggml_tensor *y = m.forward(mctx_.get(), x); + ggml_build_forward_expand(mctx_->gf, y); + CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); - std::cout << "[Benchmark] Linear " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; + std::vector all_tensors{m.weight, m.bias, x}; + for (auto tensor : all_tensors) { + randn_(tensor); + } + + std::cout << "[Benchmark] Linear " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; + + // for (int i = ggml_nelements(y); i >= 0 ; i--) { + // CHATGLM_CHECK(std::isfinite(((float *)y->data)[i])) << i; + // } + } } TEST_F(ChatGLMTest, LayerNorm) { @@ -525,7 +545,7 @@ TEST_F(ChatGLMTest, BenchmarkLayerNorm) { std::vector all_tensors{m.weight, m.bias, x}; for (auto tensor : all_tensors) { - random_fill(tensor); + random_(tensor); } ggml_tensor *y = m.forward(mctx_.get(), x); @@ -576,7 +596,7 @@ TEST_F(ChatGLMTest, BenchmarkRMSNorm) { std::vector all_tensors{m.weight, x}; for (auto tensor : all_tensors) { - random_fill(tensor); + random_(tensor); } ggml_tensor *y = m.forward(mctx_.get(), x); @@ -926,7 +946,7 @@ static void check_tokenizer(const BaseTokenizer *tokenizer, const std::vector input_ids = tokenizer->encode(c.prompt, 2048); - EXPECT_TRUE(equal(input_ids, c.input_ids)); + EXPECT_EQ(input_ids, c.input_ids); if (!c.skip_decode) { // decode std::string output = tokenizer->decode(c.input_ids); @@ -1202,7 +1222,8 @@ TEST(Pipeline, ChatGLM3) { { ChatMessage output = pipeline.chat(messages, gen_config); EXPECT_EQ(output.role, ChatMessage::ROLE_ASSISTANT); - EXPECT_EQ(output.content, "根据您的要求,我使用随机数生成器API生成了一个随机数。根据API返回的结果,生成的随机数为22。"); + EXPECT_EQ(output.content, + "根据您的要求,我使用随机数生成器API生成了一个随机数。根据API返回的结果,生成的随机数为22。"); } } From 5825cc2327f2f59db6ab84e4023b58dc59a23520 Mon Sep 17 00:00:00 2001 From: lijiahao Date: Thu, 20 Jun 2024 11:36:53 +0800 
Subject: [PATCH 11/18] Fix nan --- CMakeLists.txt | 2 -- chatglm.cpp | 68 +++++++++++++++++++++++++++++++++----------------- chatglm.h | 4 +-- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 581223a2..24078c29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,8 +74,6 @@ file(GLOB CPP_SOURCES ${PROJECT_SOURCE_DIR}/*.cpp ${PROJECT_SOURCE_DIR}/tests/*.cpp) -set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-errors") - add_library(chatglm STATIC chatglm.cpp) target_link_libraries(chatglm PUBLIC ggml sentencepiece-static re2) diff --git a/chatglm.cpp b/chatglm.cpp index 1bbafa5b..f3e55c80 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -1,4 +1,5 @@ #include "chatglm.h" +#include #include #include #include @@ -65,50 +66,70 @@ static std::string strides_to_string(ggml_tensor *tensor) { } std::string to_string(ggml_tensor *tensor, bool with_data) { - std::vector> buf(ggml_nbytes(tensor)); - ggml_backend_tensor_get(tensor, buf.data(), 0, buf.size()); + std::vector buf(ggml_nbytes(tensor)); + if (tensor->buffer ) { + ggml_backend_tensor_get(tensor, buf.data(), 0, buf.size()); + } else { + memcpy(buf.data(), tensor->data, buf.size()); + } + + std::vector float_buf(ggml_nelements(tensor)); + + switch (tensor->type) { + case GGML_TYPE_F32: + memcpy(float_buf.data(), buf.data(), buf.size()); + break; + case GGML_TYPE_F16: + ggml_fp16_to_fp32_row((const ggml_fp16_t*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + case GGML_TYPE_Q4_0: + dequantize_row_q4_0((block_q4_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + case GGML_TYPE_Q4_1: + dequantize_row_q4_1((block_q4_1*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + case GGML_TYPE_Q5_0: + dequantize_row_q5_0((block_q5_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + case GGML_TYPE_Q5_1: + dequantize_row_q5_1((block_q5_1*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + case GGML_TYPE_Q8_0: + dequantize_row_q8_0((block_q8_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + break; + default: + CHATGLM_THROW << "Unsupported dtype " << tensor->type; + } std::ostringstream oss; oss << "ggml_tensor("; if (with_data) { - if (ggml_n_dims(tensor) > 3) + const int n_dims = ggml_n_dims(tensor); + if (n_dims > 3) oss << "["; for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - if (ggml_n_dims(tensor) > 2) + if (n_dims > 2) oss << (i3 > 0 ? ",\n\n[" : "["); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - if (ggml_n_dims(tensor) > 1) + if (n_dims > 1) oss << (i2 > 0 ? ",\n\n[" : "["); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { oss << (i1 > 0 ? ",\n[" : "["); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - char *ptr = (char *)buf.data() + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + - i0 * tensor->nb[0]; oss << (i0 > 0 ? 
", " : ""); - if (tensor->type == GGML_TYPE_I32) { - oss << *(int *)ptr; - } else { - float val; - if (tensor->type == GGML_TYPE_F32) { - val = *(float *)ptr; - } else if (tensor->type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr); - } else { - CHATGLM_THROW << "unimplemented"; - } - oss << std::setw(7) << std::fixed << std::setprecision(4) << val; - } + const int i = ((i3 * tensor->ne[2] + i2 ) * tensor->ne[1] + i1) * tensor->ne[0] + i0; + oss << std::setw(7) << std::fixed << std::setprecision(4) << float_buf[i]; } oss << "]"; } - if (ggml_n_dims(tensor) > 1) + if (n_dims > 1) oss << "]"; } - if (ggml_n_dims(tensor) > 2) + if (n_dims > 2) oss << "]"; } - if (ggml_n_dims(tensor) > 3) + if (n_dims > 3) oss << "]"; oss << ", "; } @@ -731,6 +752,7 @@ ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector ggml_set_input(curr_input_ids); ggml_tensor *lm_logits = forward(mctx_.get(), curr_input_ids, n_past, n_ctx, is_decoding); + ggml_set_output(lm_logits); ggml_build_forward_expand(mctx_->gf, lm_logits); CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); diff --git a/chatglm.h b/chatglm.h index 7b3dc9db..955a26a7 100644 --- a/chatglm.h +++ b/chatglm.h @@ -148,9 +148,7 @@ class ModelConfig { num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {} - ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, ActivationType hidden_act, - bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta, - AttentionMaskType attn_mask_type, int num_virtual_tokens) + ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, float rope_theta, int num_virtual_tokens) : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, rec.num_kv_heads, rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta, num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {} From ffa0d773b86e0edf8c30933869901ac8d0ac8206 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 20 Jun 2024 11:39:29 +0800 Subject: [PATCH 12/18] lint --- chatglm.cpp | 24 ++++++++++++------------ chatglm.h | 3 ++- chatglm_test.cpp | 17 ++++++++--------- tests/test_convert.py | 6 +++--- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/chatglm.cpp b/chatglm.cpp index f3e55c80..728454fe 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -1,11 +1,11 @@ #include "chatglm.h" -#include #include #include #include #include #include #include +#include #include #include #include @@ -67,7 +67,7 @@ static std::string strides_to_string(ggml_tensor *tensor) { std::string to_string(ggml_tensor *tensor, bool with_data) { std::vector buf(ggml_nbytes(tensor)); - if (tensor->buffer ) { + if (tensor->buffer) { ggml_backend_tensor_get(tensor, buf.data(), 0, buf.size()); } else { memcpy(buf.data(), tensor->data, buf.size()); @@ -80,25 +80,25 @@ std::string to_string(ggml_tensor *tensor, bool with_data) { memcpy(float_buf.data(), buf.data(), buf.size()); break; case GGML_TYPE_F16: - ggml_fp16_to_fp32_row((const ggml_fp16_t*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + ggml_fp16_to_fp32_row((ggml_fp16_t *)buf.data(), float_buf.data(), ggml_nelements(tensor)); break; case GGML_TYPE_Q4_0: - dequantize_row_q4_0((block_q4_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + dequantize_row_q4_0((block_q4_0 *)buf.data(), float_buf.data(), 
ggml_nelements(tensor)); break; case GGML_TYPE_Q4_1: - dequantize_row_q4_1((block_q4_1*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + dequantize_row_q4_1((block_q4_1 *)buf.data(), float_buf.data(), ggml_nelements(tensor)); break; case GGML_TYPE_Q5_0: - dequantize_row_q5_0((block_q5_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + dequantize_row_q5_0((block_q5_0 *)buf.data(), float_buf.data(), ggml_nelements(tensor)); break; case GGML_TYPE_Q5_1: - dequantize_row_q5_1((block_q5_1*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + dequantize_row_q5_1((block_q5_1 *)buf.data(), float_buf.data(), ggml_nelements(tensor)); break; case GGML_TYPE_Q8_0: - dequantize_row_q8_0((block_q8_0*)buf.data(), float_buf.data(), ggml_nelements(tensor)); + dequantize_row_q8_0((block_q8_0 *)buf.data(), float_buf.data(), ggml_nelements(tensor)); break; default: - CHATGLM_THROW << "Unsupported dtype " << tensor->type; + CHATGLM_THROW << "Unsupported dtype " << tensor->type; } std::ostringstream oss; @@ -118,7 +118,7 @@ std::string to_string(ggml_tensor *tensor, bool with_data) { oss << (i1 > 0 ? ",\n[" : "["); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { oss << (i0 > 0 ? ", " : ""); - const int i = ((i3 * tensor->ne[2] + i2 ) * tensor->ne[1] + i1) * tensor->ne[0] + i0; + const int i = ((i3 * tensor->ne[2] + i2) * tensor->ne[1] + i1) * tensor->ne[0] + i0; oss << std::setw(7) << std::fixed << std::setprecision(4) << float_buf[i]; } oss << "]"; @@ -548,8 +548,8 @@ static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *laye } #endif const int head_size = layer->ne[0]; - layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, 0, rope_theta, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] + layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, 0, rope_theta, 1.0f, + 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] return layer; } diff --git a/chatglm.h b/chatglm.h index 955a26a7..111a6645 100644 --- a/chatglm.h +++ b/chatglm.h @@ -148,7 +148,8 @@ class ModelConfig { num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {} - ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, float rope_theta, int num_virtual_tokens) + ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, float rope_theta, + int num_virtual_tokens) : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, rec.num_kv_heads, rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta, num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {} diff --git a/chatglm_test.cpp b/chatglm_test.cpp index af8ee5f6..239a8f52 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -70,9 +70,10 @@ static inline void _fill(ggml_tensor *tensor, const std::vector &values) case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: { + case GGML_TYPE_Q8_0: { std::vector> q_buf(ggml_nbytes(tensor)); - ggml_quantize_chunk(tensor->type, values.data(), q_buf.data(), 0, ggml_nelements(tensor) / tensor->ne[0], tensor->ne[0], nullptr); + ggml_quantize_chunk(tensor->type, values.data(), q_buf.data(), 0, ggml_nelements(tensor) / tensor->ne[0], + tensor->ne[0], nullptr); ggml_backend_tensor_set(tensor, q_buf.data(), 0, ggml_nbytes(tensor)); } break; default: @@ -92,7 +93,7 @@ static inline void random_(ggml_tensor *tensor) { _fill(tensor, 
values); } -static inline float randn() { +static inline float randn() { thread_local std::random_device rd{}; thread_local std::mt19937 gen{rd()}; std::normal_distribution d; @@ -480,7 +481,8 @@ TEST_F(ChatGLMTest, Linear) { TEST_F(ChatGLMTest, BenchmarkLinear) { constexpr int M = 64, N = 1024, K = 1024 * 3; - std::vector dtypes { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0}; + std::vector dtypes{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, + GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0}; for (ggml_type dtype : dtypes) { mctx_ = std::make_unique(dtype); @@ -496,11 +498,8 @@ TEST_F(ChatGLMTest, BenchmarkLinear) { randn_(tensor); } - std::cout << "[Benchmark] Linear " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() << " ms\n"; - - // for (int i = ggml_nelements(y); i >= 0 ; i--) { - // CHATGLM_CHECK(std::isfinite(((float *)y->data)[i])) << i; - // } + std::cout << "[Benchmark] Linear " << ggml_type_name(mctx_->dtype) << " time: " << perf_graph_compute() + << " ms\n"; } } diff --git a/tests/test_convert.py b/tests/test_convert.py index 59acaf10..25955999 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -190,9 +190,9 @@ def make_data_embedding(): def make_data_linear(): - w = torch.randn(16, 32) - b = torch.randn(16) - x = torch.randn(2, 32) + w = torch.randn(32, 64) + b = torch.randn(32) + x = torch.randn(2, 64) y = F.linear(x, w, b) vec_x = x[0] From 4a43a86b8ebcd5c70b3b933c750872d5a5705ca1 Mon Sep 17 00:00:00 2001 From: lijiahao Date: Thu, 20 Jun 2024 12:21:12 +0800 Subject: [PATCH 13/18] metal --- chatglm.cpp | 72 ++++++++++++++++++++++++------------------------ chatglm.h | 19 +------------ chatglm_test.cpp | 8 ++---- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/chatglm.cpp b/chatglm.cpp index 728454fe..12a11899 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -43,6 +43,10 @@ #include #endif +#ifdef GGML_USE_METAL +#include +#endif + namespace chatglm { static std::string shape_to_string(ggml_tensor *tensor) { @@ -179,27 +183,6 @@ std::vector BaseTokenizer::filter_user_assistant_messages(const std return out; } -// void ModelContext::init_device_context() { -// #ifdef GGML_USE_METAL -// ctx_metal = make_unique_ggml_metal_context(1); - -// const size_t max_size = ggml_get_max_tensor_size(ctx_w.get()); - -// void *weight_data = weight_buffer.empty() ? ggml_get_mem_buffer(ctx_w.get()) : (void *)weight_buffer.data(); -// size_t weight_size = weight_buffer.empty() ? ggml_get_mem_size(ctx_w.get()) : weight_buffer.size(); -// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "weights", weight_data, weight_size, max_size)); - -// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "kv", ggml_get_mem_buffer(ctx_kv.get()), -// ggml_get_mem_size(ctx_kv.get()), 0)); - -// void *compute_data = ctx_b ? ggml_get_mem_buffer(ctx_b.get()) : compute_meta.data(); -// size_t compute_size = ctx_b ? 
ggml_get_mem_size(ctx_b.get()) : compute_meta.size(); -// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "compute", compute_data, compute_size, 0)); - -// CHATGLM_CHECK(ggml_metal_add_buffer(ctx_metal.get(), "scratch", scratch.data, scratch.size, 0)); -// #endif -// } - // ===== streamer ===== void StreamerGroup::put(const std::vector &output_ids) { @@ -602,17 +585,16 @@ static ggml_tensor *apply_rotary_emb_glm2(ModelContext *mctx, ggml_tensor *layer ggml_tensor *half_layer_view = ggml_view_3d(ctx, layer, rope_dim, layer->ne[1], layer->ne[2], layer->nb[1], layer->nb[2], 0); - // TODO: metal ggml_tensor *half_layer = half_layer_view; - if (!ggml_backend_is_cpu(mctx->backend.get())) { - half_layer = ggml_cont(ctx, half_layer); - } +#ifdef GGML_USE_CUDA + half_layer = ggml_cont(ctx, half_layer); +#endif ggml_tensor *roped_half_layer = ggml_rope_ext_inplace(ctx, half_layer, position_ids, nullptr, rope_dim, (int)RopeType::GPTJ, 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d] - if (!ggml_backend_is_cpu(mctx->backend.get())) { - roped_half_layer = ggml_cpy(ctx, roped_half_layer, half_layer_view); - } +#ifdef GGML_USE_CUDA + roped_half_layer = ggml_cpy(ctx, roped_half_layer, half_layer_view); +#endif ggml_build_forward_expand(mctx->gf, roped_half_layer); return layer; @@ -1106,11 +1088,19 @@ ggml_tensor *GLMBlock::forward(ModelContext *mctx, ggml_tensor *hidden_states, g ChatGLMForCausalLM::ChatGLMForCausalLM(const ModelConfig &config) : BasicModelForCausalLM(config) {} void ChatGLMForCausalLM::load_state_dict(const StateDict &sd) { - // TODO: handle metal + void *sd_buf_base = ggml_backend_buffer_get_base(sd.buf.get()); + const size_t sd_buf_size = ggml_backend_buffer_get_size(sd.buf.get()); if (ggml_backend_is_cpu(mctx_->backend.get())) { - mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr( - ggml_backend_buffer_get_base(sd.buf.get()), ggml_backend_buffer_get_size(sd.buf.get()))); - } else { + mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr(sd_buf_base, sd_buf_size)); + } +#ifdef GGML_USE_METAL + else if (ggml_backend_is_metal(mctx_->backend.get())) { + const size_t max_size = ggml_get_max_tensor_size(mctx_->ctx_w.get()); + mctx_->buf_w = + unique_ggml_backend_buffer_t(ggml_backend_metal_buffer_from_ptr(sd_buf_base, sd_buf_size, max_size)); + } +#endif + else { mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); } @@ -1120,7 +1110,7 @@ void ChatGLMForCausalLM::load_state_dict(const StateDict &sd) { const std::string &name = item.first; ggml_tensor *self_weight = item.second; ggml_tensor *ckpt_weight = sd.kv.at(name); - if (ggml_backend_is_cpu(mctx_->backend.get())) { + if (ggml_backend_is_cpu(mctx_->backend.get()) || ggml_cpu_has_metal()) { ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); } else { ggml_backend_tensor_set(self_weight, ckpt_weight->data, 0, ggml_nbytes(self_weight)); @@ -1256,10 +1246,20 @@ bool ChatGLM2Tokenizer::is_special_id(int id) const { ChatGLM2ForCausalLM::ChatGLM2ForCausalLM(const ModelConfig &config) : BasicModelForCausalLM(config) {} void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { + void *sd_buf_base = ggml_backend_buffer_get_base(sd.buf.get()); + const size_t sd_buf_size = ggml_backend_buffer_get_size(sd.buf.get()); if (ggml_backend_is_cpu(mctx_->backend.get())) { mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr( ggml_backend_buffer_get_base(sd.buf.get()), 
ggml_backend_buffer_get_size(sd.buf.get()))); - } else { + } +#ifdef GGML_USE_METAL + else if (ggml_backend_is_metal(mctx_->backend.get())) { + const size_t max_size = ggml_get_max_tensor_size(mctx_->ctx_w.get()); + mctx_->buf_w = + unique_ggml_backend_buffer_t(ggml_backend_metal_buffer_from_ptr(sd_buf_base, sd_buf_size, max_size)); + } +#endif + else { mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_alloc_ctx_tensors(mctx_->ctx_w.get(), mctx_->backend.get())); } @@ -1289,7 +1289,7 @@ void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { CHATGLM_CHECK(ggml_nbytes(ckpt_weight) == ggml_nbytes(gate_proj) + ggml_nbytes(up_proj)); - if (ggml_backend_is_cpu(mctx_->backend.get())) { + if (ggml_backend_is_cpu(mctx_->backend.get()) || ggml_cpu_has_metal()) { ggml_backend_tensor_alloc(mctx_->buf_w.get(), gate_proj, ckpt_weight->data); ggml_backend_tensor_alloc(mctx_->buf_w.get(), up_proj, (char *)ckpt_weight->data + ggml_nbytes(gate_proj)); @@ -1301,7 +1301,7 @@ void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { } else { // normal weight ggml_tensor *self_weight = self_sd.kv.at(name); - if (ggml_backend_is_cpu(mctx_->backend.get())) { + if (ggml_backend_is_cpu(mctx_->backend.get()) || ggml_cpu_has_metal()) { ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); } else { ggml_backend_tensor_set(self_weight, ckpt_weight->data, 0, ggml_nbytes(self_weight)); diff --git a/chatglm.h b/chatglm.h index 111a6645..714aa64b 100644 --- a/chatglm.h +++ b/chatglm.h @@ -9,10 +9,6 @@ #include #include -// #ifdef GGML_USE_METAL -// #include -// #endif - namespace chatglm { // ===== common ===== @@ -303,24 +299,11 @@ struct ggml_backend_buffer_deleter_t { using unique_ggml_backend_buffer_t = std::unique_ptr; -#ifdef GGML_USE_METAL -struct ggml_metal_context_deleter_t { - void operator()(ggml_metal_context *ctx) const noexcept { ggml_metal_free(ctx); } -}; - -using unique_ggml_metal_context_t = std::unique_ptr; - -static inline unique_ggml_metal_context_t make_unique_ggml_metal_context(int n_cb) { - return unique_ggml_metal_context_t(ggml_metal_init(n_cb)); -} -#endif - // reference: https://github.com/ggerganov/llama.cpp/blob/master/llama.cpp template struct no_init { T value; - no_init() { /* do nothing */ - } + no_init() { /* do nothing */ } }; struct ModelContext { diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 239a8f52..3061bcb6 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -958,15 +958,11 @@ static void check_chat_format(const Pipeline &pipeline) { GenerationConfig gen_config; gen_config.max_new_tokens = 1; EXPECT_THROW( - { - pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); - }, + { pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); }, std::runtime_error); EXPECT_THROW({ pipeline.chat({{ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); }, std::runtime_error); EXPECT_THROW( - { - pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); - }, + { pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); }, std::runtime_error); // never throw with system prompt pipeline.chat({{ChatMessage::ROLE_SYSTEM, "system"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); From 07ae850e172305053484339e795190cda8f91520 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 20 Jun 2024 12:25:47 +0800 Subject: [PATCH 14/18] fix --- chatglm.cpp | 3 +-- chatglm.h 
| 3 ++- chatglm_test.cpp | 8 ++++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/chatglm.cpp b/chatglm.cpp index 12a11899..f651d09a 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -1249,8 +1249,7 @@ void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { void *sd_buf_base = ggml_backend_buffer_get_base(sd.buf.get()); const size_t sd_buf_size = ggml_backend_buffer_get_size(sd.buf.get()); if (ggml_backend_is_cpu(mctx_->backend.get())) { - mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr( - ggml_backend_buffer_get_base(sd.buf.get()), ggml_backend_buffer_get_size(sd.buf.get()))); + mctx_->buf_w = unique_ggml_backend_buffer_t(ggml_backend_cpu_buffer_from_ptr(sd_buf_base, sd_buf_size)); } #ifdef GGML_USE_METAL else if (ggml_backend_is_metal(mctx_->backend.get())) { diff --git a/chatglm.h b/chatglm.h index 714aa64b..0a9c1ed6 100644 --- a/chatglm.h +++ b/chatglm.h @@ -303,7 +303,8 @@ using unique_ggml_backend_buffer_t = std::unique_ptr struct no_init { T value; - no_init() { /* do nothing */ } + no_init() { /* do nothing */ + } }; struct ModelContext { diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 3061bcb6..239a8f52 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -958,11 +958,15 @@ static void check_chat_format(const Pipeline &pipeline) { GenerationConfig gen_config; gen_config.max_new_tokens = 1; EXPECT_THROW( - { pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); }, + { + pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); + }, std::runtime_error); EXPECT_THROW({ pipeline.chat({{ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); }, std::runtime_error); EXPECT_THROW( - { pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); }, + { + pipeline.chat({{ChatMessage::ROLE_USER, "user"}, {ChatMessage::ROLE_ASSISTANT, "assistant"}}, gen_config); + }, std::runtime_error); // never throw with system prompt pipeline.chat({{ChatMessage::ROLE_SYSTEM, "system"}, {ChatMessage::ROLE_USER, "user"}}, gen_config); From e4273acc581545d5dedee13b41d3163451e9783e Mon Sep 17 00:00:00 2001 From: lijiahao Date: Thu, 20 Jun 2024 13:13:16 +0800 Subject: [PATCH 15/18] add metal benchmark --- README.md | 11 ++++++----- tests/perf.sh | 15 +++++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 95844a57..4e7e6512 100644 --- a/README.md +++ b/README.md @@ -533,11 +533,12 @@ ChatGLM2-6B / ChatGLM3-6B / CodeGeeX2: ChatGLM4-9B: -| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | -|--------------------------------|-------|-------|-------|-------|-------|-------| -| ms/token (CPU @ Platinum 8260) | 105 | 105 | 122 | 134 | 158 | 279 | -| ms/token (CUDA @ V100 SXM2) | 12.1 | 12.5 | 13.8 | 13.9 | 17.7 | 27.7 | -| file size | 5.0G | 5.5G | 6.1G | 6.6G | 9.4G | 18G | +| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | +|--------------------------------|------|------|------|------|------|------| +| ms/token (CPU @ Platinum 8260) | 105 | 105 | 122 | 134 | 158 | 279 | +| ms/token (CUDA @ V100 SXM2) | 12.1 | 12.5 | 13.8 | 13.9 | 17.7 | 27.7 | +| ms/token (MPS @ M2 Ultra) | 14.4 | 15.3 | 19.6 | 20.1 | 20.7 | 32.4 | +| file size | 5.0G | 5.5G | 6.1G | 6.6G | 9.4G | 18G | ## Model Quality diff --git a/tests/perf.sh b/tests/perf.sh index c045148f..89820da1 100644 --- a/tests/perf.sh +++ b/tests/perf.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -ex + export CUDA_VISIBLE_DEVICES=0 
hf_model=THUDM/chatglm3-6b @@ -11,13 +13,14 @@ hf_model=THUDM/glm-4-9b-chat ggml_model=models/chatglm4-ggml.bin benchmark=Benchmark.ChatGLM4 +use_cuda=ON +use_metal=OFF + for dtype in f16 q8_0 q5_1 q5_0 q4_1 q4_0; do python3 chatglm_cpp/convert.py -i $hf_model -o $ggml_model -t $dtype - for use_cuda in OFF ON; do - cmake -B build -DGGML_CUDA=$use_cuda && cmake --build build -j - for i in $(seq 3); do - echo "[benchmark] dtype=$dtype use_cuda=$use_cuda round=$i" - ./build/bin/chatglm_test --gtest_filter="$benchmark" - done + cmake -B build -DGGML_CUDA=$use_cuda -DGGML_METAL=$use_metal -DCHATGLM_ENABLE_TESTING=ON && cmake --build build -j + for i in $(seq 3); do + echo "[benchmark] dtype=$dtype use_cuda=$use_cuda use_metal=$use_metal round=$i" + ./build/bin/chatglm_test --gtest_filter="$benchmark" done done From 9f7f9edaeb8b41eb6f6164c3b4bfb12fb7528a02 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 20 Jun 2024 13:39:46 +0800 Subject: [PATCH 16/18] fix cuda arch --- CMakeLists.txt | 2 +- README.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24078c29..3a9fe5ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ if (GGML_CUDA) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") set(CUDA_ARCH_LIST "${CUDA_ARCH_LIST};89;90") endif () - set(GGML_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "chatglm: cuda architectures to compile") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "") endif () include_directories(third_party/ggml/include/ggml third_party/ggml/src) diff --git a/README.md b/README.md index 4e7e6512..fe399540 100644 --- a/README.md +++ b/README.md @@ -231,10 +231,10 @@ CUDA accelerates model inference on NVIDIA GPU. Add the CMake flag `-DGGML_CUDA= cmake -B build -DGGML_CUDA=ON && cmake --build build -j ``` -By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `GGML_CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example: +By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `CMAKE_CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example: ```sh -cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES="80" # for A100 -cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4 +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="80" # for A100 +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4 ``` To find out the CUDA architecture of your GPU device, see [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus). @@ -481,7 +481,7 @@ For CUDA support, make sure [nvidia-docker](https://github.com/NVIDIA/nvidia-doc ```sh docker build . 
--network=host -t chatglm.cpp-cuda \ --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \ - --build-arg CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_CUDA_ARCHITECTURES=80" + --build-arg CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80" docker run -it --rm --gpus all -v $PWD/models:/chatglm.cpp/models chatglm.cpp-cuda \ ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` From 7a125399b5e9f7cdcdf8aae0c6a917ca7cef6453 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 20 Jun 2024 14:02:43 +0800 Subject: [PATCH 17/18] remove trash --- .github/workflows/cmake.yml | 2 +- README.md | 2 +- chatglm.h | 8 -------- chatglm_test.cpp | 13 ------------- examples/cli_demo.py | 1 - main.cpp | 1 - pyproject.toml | 2 +- 7 files changed, 3 insertions(+), 26 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index a6b942bb..a4eb207d 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -18,7 +18,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-13] + os: [ubuntu-latest, windows-latest, macos-13, macos-14] steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index fe399540..1708a87a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ![Python](https://img.shields.io/pypi/pyversions/chatglm-cpp) [![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE) -C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4) and more LLMs for real-time chatting on your MacBook. +C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4) for real-time chatting on your MacBook. 
![demo](docs/demo.gif) diff --git a/chatglm.h b/chatglm.h index 0a9c1ed6..c817157a 100644 --- a/chatglm.h +++ b/chatglm.h @@ -323,15 +323,7 @@ struct ModelContext { unique_ggml_backend_buffer_t buf_w; unique_ggml_backend_buffer_t buf_kv; - // #ifdef GGML_USE_METAL - // // unique_ggml_metal_context_t ctx_metal; - // #endif - - // std::string_view weight_buffer; // mapped weight - ModelContext(ggml_type dtype); - - // void init_device_context(); }; class Embedding { diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 239a8f52..64c247d9 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -461,19 +461,6 @@ TEST_F(ChatGLMTest, Linear) { CHATGLM_CHECK(ggml_gallocr_alloc_graph(mctx_->allocr.get(), mctx_->gf)); CHATGLM_CHECK(ggml_backend_graph_compute(mctx_->backend.get(), mctx_->gf) == GGML_STATUS_SUCCESS); - // if (config.dtype == GGML_TYPE_F16) { - // std::cout << "dtype " << config.dtype << '\n'; - // auto ref_cpu = ggml_new_tensor_like(ctx.get(), c.ref); - // ggml_backend_tensor_get(c.ref, ref_cpu->data, 0, ggml_nbytes(c.ref)); - // auto out_cpu = ggml_new_tensor_like(ctx.get(), out); - // ggml_backend_tensor_get(out, out_cpu->data, 0, ggml_nbytes(out)); - // auto weight_cpu = ggml_new_tensor_like(ctx.get(), model.weight); - // ggml_backend_tensor_get(model.weight, weight_cpu->data, 0, ggml_nbytes(model.weight)); - - // std::cout << "c.ref " << to_string(ref_cpu) << '\n' - // << "out " << to_string(out_cpu) << '\n' - // << "weight " << to_string(weight_cpu) << '\n'; - // } expect_all_close(c.ref, out, config.atol, config.rtol); } } diff --git a/examples/cli_demo.py b/examples/cli_demo.py index 55eaa204..6cd3b9a0 100644 --- a/examples/cli_demo.py +++ b/examples/cli_demo.py @@ -52,7 +52,6 @@ def main() -> None: parser.add_argument("--top_p", default=0.7, type=float, help="top-p sampling") parser.add_argument("--temp", default=0.95, type=float, help="temperature") parser.add_argument("--repeat_penalty", default=1.0, type=float, help="penalize repeat sequence of tokens") - parser.add_argument("-t", "--threads", default=0, type=int, help="number of threads for inference") args = parser.parse_args() prompt = args.prompt diff --git a/main.cpp b/main.cpp index 9b8166d9..037f8340 100644 --- a/main.cpp +++ b/main.cpp @@ -59,7 +59,6 @@ static void usage(const std::string &prog) { --top_p N top-p sampling (default: 0.7) --temp N temperature (default: 0.95) --repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) - -t, --threads N number of threads for inference -v, --verbose display verbose output including config/system/performance info )"; } diff --git a/pyproject.toml b/pyproject.toml index 19736ea5..92435dc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ maintainers = [ {name = "Jiahao Li", email = "liplus17@163.com"}, ] -description = "C++ implementation of ChatGLM family models and more LLMs" +description = "C++ implementation of ChatGLM family models" readme = "README.md" requires-python = ">=3.7" keywords = ["ChatGLM", "ChatGLM2", "ChatGLM3", "Large Language Model"] From 541101aedd444e09659b53cd6e49f49254fc5414 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Fri, 21 Jun 2024 10:14:31 +0800 Subject: [PATCH 18/18] fix --- README.md | 2 +- chatglm.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1708a87a..dd163b7c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ![Python](https://img.shields.io/pypi/pyversions/chatglm-cpp) [![License: 
MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE) -C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4) for real-time chatting on your MacBook. +C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3) and [GLM-4](https://github.com/THUDM/GLM-4) for real-time chatting on your MacBook. ![demo](docs/demo.gif) diff --git a/chatglm.cpp b/chatglm.cpp index f651d09a..6cf5c328 100644 --- a/chatglm.cpp +++ b/chatglm.cpp @@ -1110,6 +1110,7 @@ void ChatGLMForCausalLM::load_state_dict(const StateDict &sd) { const std::string &name = item.first; ggml_tensor *self_weight = item.second; ggml_tensor *ckpt_weight = sd.kv.at(name); + CHATGLM_CHECK(ggml_nbytes(self_weight) == ggml_nbytes(ckpt_weight)); if (ggml_backend_is_cpu(mctx_->backend.get()) || ggml_cpu_has_metal()) { ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); } else { @@ -1300,6 +1301,7 @@ void ChatGLM2ForCausalLM::load_state_dict(const StateDict &sd) { } else { // normal weight ggml_tensor *self_weight = self_sd.kv.at(name); + CHATGLM_CHECK(ggml_nbytes(self_weight) == ggml_nbytes(ckpt_weight)); if (ggml_backend_is_cpu(mctx_->backend.get()) || ggml_cpu_has_metal()) { ggml_backend_tensor_alloc(mctx_->buf_w.get(), self_weight, ckpt_weight->data); } else {
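Taken together, the last few patches settle on a single weight-loading convention in `load_state_dict`: when the backend can address the checkpoint buffer directly (the CPU backend, or a Metal build whose buffer is wrapped with `ggml_backend_metal_buffer_from_ptr`), each model tensor is aliased onto the checkpoint data with `ggml_backend_tensor_alloc`; for other device backends such as CUDA, the bytes are copied with `ggml_backend_tensor_set`. The final patch guards both paths with a byte-count check. Below is a minimal sketch restating that per-tensor logic outside the model classes; the free-standing `load_weight` helper, its parameter names, the simplified `CHECK` macro, and the header choices are illustrative assumptions, not part of the actual chatglm.cpp API.

```cpp
// Minimal sketch of the per-tensor weight-loading pattern from
// load_state_dict in chatglm.cpp. The load_weight helper and CHECK macro
// are illustrative stand-ins; header names follow upstream ggml.
#include <stdexcept>

#include <ggml.h>          // ggml_nbytes, ggml_cpu_has_metal
#include <ggml-alloc.h>    // ggml_backend_tensor_alloc
#include <ggml-backend.h>  // ggml_backend_is_cpu, ggml_backend_tensor_set

// Simplified stand-in for the project's CHATGLM_CHECK macro.
#define CHECK(cond)                                                     \
    do {                                                                \
        if (!(cond)) throw std::runtime_error("check failed: " #cond);  \
    } while (0)

static void load_weight(ggml_backend_t backend, ggml_backend_buffer_t buf_w,
                        ggml_tensor *self_weight, ggml_tensor *ckpt_weight) {
    // Destination and checkpoint tensors must hold the same number of
    // bytes before either aliasing or copying below is safe.
    CHECK(ggml_nbytes(self_weight) == ggml_nbytes(ckpt_weight));

    if (ggml_backend_is_cpu(backend) || ggml_cpu_has_metal()) {
        // Host-addressable weights (CPU backend, or a Metal build whose
        // buffer was created from the checkpoint pointer): place the tensor
        // directly on the checkpoint data -- no copy is made.
        ggml_backend_tensor_alloc(buf_w, self_weight, ckpt_weight->data);
    } else {
        // Device backends such as CUDA: copy the bytes into device memory.
        ggml_backend_tensor_set(self_weight, ckpt_weight->data, 0,
                                ggml_nbytes(self_weight));
    }
}
```

Aliasing rather than copying keeps the checkpoint buffer as the only host-side copy of the weights on CPU and Metal, which is presumably why the size check matters: a mismatched tensor would otherwise read past the end of the checkpoint data instead of failing loudly.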