diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0b93564a53dd..c4a649a976275 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,7 +118,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
     add_custom_command(
         OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
         COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         DEPENDS "${GIT_DIR}/index"
         VERBATIM
@@ -162,6 +162,8 @@ if (APPLE AND LLAMA_ACCELERATE)
         message(STATUS "Accelerate framework found")

         add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
     else()
         message(WARNING "Accelerate framework not found")
diff --git a/Makefile b/Makefile
index e07db8afa16f5..f170f22939942 100644
--- a/Makefile
+++ b/Makefile
@@ -305,6 +305,8 @@ ifndef LLAMA_NO_ACCELERATE
    # `-framework Accelerate` works both with Apple Silicon and Mac Intel
    ifeq ($(UNAME_S),Darwin)
        MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+       MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+       MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
        MK_LDFLAGS += -framework Accelerate
    endif
 endif # LLAMA_NO_ACCELERATE
diff --git a/Package.swift b/Package.swift
index fb95ef7ebc59f..442463ba3430f 100644
--- a/Package.swift
+++ b/Package.swift
@@ -45,6 +45,8 @@ let package = Package(
                 .unsafeFlags(["-Wno-shorten-64-to-32"]),
                 .define("GGML_USE_K_QUANTS"),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .define("ACCELERATE_NEW_LAPACK"),
+                .define("ACCELERATE_LAPACK_ILP64")
             ] + additionalSettings,
             linkerSettings: [
                 .linkedFramework("Accelerate")
diff --git a/README.md b/README.md
index 42686aacc970b..09c5b1b92ff2d 100644
--- a/README.md
+++ b/README.md
@@ -501,7 +501,7 @@ Building the program with BLAS support may lead to some performance improvements
   ```sh
   mkdir build
   cd build
-  cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+  cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
   cmake --build . --config Release
   ```
  - CMake (Windows):
@@ -597,6 +597,11 @@ Several quantization methods are supported. They differ in the resulting model d
 | 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
 | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |

+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- recent k-quants improvements
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
+
 ### Perplexity (measuring model quality)

 You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
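Note (illustrative, not part of the patch): the two new `ACCELERATE_*` definitions only take effect if they are visible before Accelerate's header is included, which is why they are added as global compile definitions in CMake, the Makefile, and Package.swift above. A minimal sketch of what they select, assuming Accelerate's CBLAS is the BLAS path ggml takes when `GGML_USE_ACCELERATE` is defined; the function name here is made up for illustration:

```c
// Sketch only: select Accelerate's newer ILP64 LAPACK/BLAS interface.
// The two macros must be defined before <Accelerate/Accelerate.h> is pulled in,
// exactly what the build flags above arrange project-wide.
#define ACCELERATE_NEW_LAPACK
#define ACCELERATE_LAPACK_ILP64
#include <Accelerate/Accelerate.h>

// Single-precision GEMM via CBLAS, the kind of call ggml issues for large
// matrix multiplications when GGML_USE_ACCELERATE is enabled.
void sgemm_example(int m, int n, int k, const float * a, const float * b, float * c) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0f, a, k, b, n, 0.0f, c, n);
}
```

With `ACCELERATE_LAPACK_ILP64`, the selected entry points use 64-bit integer dimensions, avoiding 32-bit overflow of index arguments on very large matrices.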
diff --git a/convert.py b/convert.py index 4ac5030db61eb..8bb6c7e410852 100755 --- a/convert.py +++ b/convert.py @@ -439,7 +439,7 @@ def __repr__(self) -> str: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape)) diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp index 7f9a1620b60bf..e9197f6b51b41 100644 --- a/examples/gptneox-wip/falcon-main.cpp +++ b/examples/gptneox-wip/falcon-main.cpp @@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp index 55eba0cdcfdfb..b76bafaa8c51f 100644 --- a/examples/gptneox-wip/gptneox-main.cpp +++ b/examples/gptneox-wip/gptneox-main.cpp @@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 34ddfde39d295..2f1a1d9ff5645 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -903,7 +903,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) } } -static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) { +static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { (void) level; (void) text; (void) 
user_data; diff --git a/examples/make-ggml.py b/examples/make-ggml.py index 6a34eeac53faa..c73485ebf1eff 100755 --- a/examples/make-ggml.py +++ b/examples/make-ggml.py @@ -1,22 +1,25 @@ #!/usr/bin/env python3 """ -This script converts Hugging Face llama models to GGML and quantizes them. +This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them. Usage: -python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] +python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] Arguments: -- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. +- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. +- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox. - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used. - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used. - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'. - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created. -Quant types: +Old quant types (some base model types require these): - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M + +New quant types (recommended): - Q2_K: smallest, extreme quality loss - not recommended - Q3_K: alias for Q3_K_M - Q3_K_S: very small, very high quality loss @@ -40,9 +43,7 @@ import os from huggingface_hub import snapshot_download -def main(model, outname, outdir, quants, keep_fp16): - ggml_version = "v3" - +def main(model, model_type, outname, outdir, quants, keep_fp16): if not os.path.isdir(model): print(f"Model not found at {model}. Downloading...") try: @@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16): print("Building llama.cpp") subprocess.run(f"cd .. 
&& make quantize", shell=True, check=True) - fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin" + fp16 = f"{outdir}/{outname}.gguf.fp16.bin" - print(f"Making unquantised GGML at {fp16}") + print(f"Making unquantised GGUF at {fp16}") if not os.path.isfile(fp16): - subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) + if model_type != "llama": + subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True) + else: + subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) else: print(f"Unquantised GGML already exists at: {fp16}") print("Making quants") for type in quants: - outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin" + outfile = f"{outdir}/{outname}.gguf.{type}.bin" print(f"Making {type} : {outfile}") subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True) @@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16): os.remove(fp16) if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') - parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name') + parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') + parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name') + parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. 
Choose from llama, starcoder, falcon, baichuan, or gptneox.') parser.add_argument('--outname', default=None, help='Output model(s) name') parser.add_argument('--outdir', default=None, help='Output directory') parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types') @@ -90,4 +95,4 @@ def main(model, outname, outdir, quants, keep_fp16): args = parser.parse_args() - main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16) + main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index eacfb17c67fb2..50e1af0111dd6 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -1,3 +1,21 @@ # perplexity TODO + +## Llama 2 70B Scorechart +Quantization | Model size (GiB) | Perplexity | Delta to fp16 +-- | -- | -- | -- +Q4_0 | 36.20 | 3.5550 | 3.61% +Q4_1 | 40.20 | 3.5125 | 2.37% +Q5_0 | 44.20 | 3.4744 | 1.26% +Q2_K | 27.27 | 3.7339 | 8.82% +Q3_K_S | 27.86 | 3.7019 | 7.89% +Q3_K_M | 30.83 | 3.5932 | 4.72% +Q3_K_L | 33.67 | 3.5617 | 3.80% +Q4_K_S | 36.39 | 3.4852 | 1.57% +Q4_K_M | 38.54 | 3.4725 | 1.20% +Q5_K_S | 44.20 | 3.4483 | 0.50% +Q5_K_M | 45.41 | 3.4451 | 0.40% +Q6_K | 52.70 | 3.4367 | 0.16% +fp16 | 128.5 | 3.4313 | - + diff --git a/examples/quantize/README.md b/examples/quantize/README.md index f349e913e3d10..c8b9a27a0b04e 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -1,3 +1,44 @@ # quantize TODO + +## Llama 2 7B + +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.35 +Q3_K_S | 3.50 +Q3_K_M | 3.91 +Q3_K_L | 4.27 +Q4_K_S | 4.58 +Q4_K_M | 4.84 +Q5_K_S | 5.52 +Q5_K_M | 5.68 +Q6_K | 6.56 + +## Llama 2 13B +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.34 +Q3_K_S | 3.48 +Q3_K_M | 3.89 +Q3_K_L | 4.26 +Q4_K_S | 4.56 +Q4_K_M | 4.83 +Q5_K_S | 5.51 +Q5_K_M | 5.67 +Q6_K | 6.56 + +# Llama 2 70B + +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.40 +Q3_K_S | 3.47 +Q3_K_M | 3.85 +Q3_K_L | 4.19 +Q4_K_S | 4.53 +Q4_K_M | 4.80 +Q5_K_S | 5.50 +Q5_K_M | 5.65 +Q6_K | 6.56 diff --git a/flake.nix b/flake.nix index 7723357afe419..433d3d942ce40 100644 --- a/flake.nix +++ b/flake.nix @@ -35,6 +35,20 @@ ); pkgs = import nixpkgs { inherit system; }; nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ]; + cudatoolkit_joined = with pkgs; symlinkJoin { + # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit + # see https://github.com/NixOS/nixpkgs/issues/224291 + # copied from jaxlib + name = "${cudaPackages.cudatoolkit.name}-merged"; + paths = [ + cudaPackages.cudatoolkit.lib + cudaPackages.cudatoolkit.out + ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [ + # for some reason some of the required libs are in the targets/x86_64-linux + # directory; not sure why but this works around it + "${cudaPackages.cudatoolkit}/targets/${system}" + ]; + }; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); postPatch = '' @@ -70,6 +84,13 @@ "-DLLAMA_CLBLAST=ON" ]; }; + packages.cuda = pkgs.stdenv.mkDerivation { + inherit name src meta postPatch nativeBuildInputs postInstall; + buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ]; + cmakeFlags = cmakeFlags ++ [ + "-DLLAMA_CUBLAS=ON" + ]; + }; packages.rocm = pkgs.stdenv.mkDerivation { inherit name src meta postPatch nativeBuildInputs postInstall; buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; diff --git a/ggml-metal.h b/ggml-metal.h index 
fca28d37ef970..790cf0bf7b963 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -19,6 +19,8 @@ #pragma once +#include "ggml.h" + #include #include @@ -33,6 +35,8 @@ struct ggml_cgraph; extern "C" { #endif +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + struct ggml_metal_context; // number of command buffers to use diff --git a/ggml-metal.m b/ggml-metal.m index 1139ee3114610..654eb67f3392b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,11 +11,14 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG -#define metal_printf(...) +#define GGML_METAL_LOG_INFO(...) +#define GGML_METAL_LOG_WARN(...) +#define GGML_METAL_LOG_ERROR(...) #else -#define metal_printf(...) fprintf(stderr, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -120,8 +123,37 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end +ggml_log_callback ggml_metal_log_callback = NULL; +void * ggml_metal_log_user_data = NULL; + +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { + ggml_metal_log_callback = log_callback; + ggml_metal_log_user_data = user_data; +} + +static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ + if (ggml_metal_log_callback != NULL) { + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); + } else { + char* buffer2 = malloc(len+1); + vsnprintf(buffer2, len+1, format, args); + buffer2[len] = 0; + ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); + free(buffer2); + } + va_end(args); + } +} + + + struct ggml_metal_context * ggml_metal_init(int n_cb) { - metal_printf("%s: allocating\n", __func__); + GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; NSString * s; @@ -131,14 +163,14 @@ @implementation GGMLMetalClass NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -165,7 +197,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -179,11 +211,11 @@ @implementation GGMLMetalClass //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); + 
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -195,7 +227,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -207,11 +239,11 @@ @implementation GGMLMetalClass #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -270,13 +302,13 @@ @implementation GGMLMetalClass #undef GGML_METAL_ADD_KERNEL } - metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); #if TARGET_OS_OSX - metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); + GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -284,7 +316,7 @@ @implementation GGMLMetalClass } void ggml_metal_free(struct ggml_metal_context * ctx) { - metal_printf("%s: deallocating\n", __func__); + GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -360,7 +392,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - metal_printf("%s: error: posix_memalign failed\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -388,7 +420,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -400,13 +432,13 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - metal_printf("%s: error: buffer is nil\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); return nil; } @@ -418,7 +450,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - metal_printf("%s: too many buffers\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); return false; } @@ -428,7 +460,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -449,11 +481,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f 
MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -473,13 +505,13 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - metal_printf("\n"); + GGML_METAL_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -487,17 +519,17 @@ bool ggml_metal_add_buffer( } #if TARGET_OS_OSX - metal_printf(", (%8.2f / %8.2f)", + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); + GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - metal_printf("\n"); + GGML_METAL_LOG_INFO("\n"); } #else - metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ -610,7 +642,7 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); + GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -664,7 +696,7 @@ void ggml_metal_graph_compute( continue; } - //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -708,17 +740,17 @@ void ggml_metal_graph_compute( id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? 
ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -830,7 +862,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -1019,7 +1051,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("Asserting on type %d\n",(int)src0t); + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1286,7 +1318,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } diff --git a/ggml.h b/ggml.h index f45456876da62..b2251acef271e 100644 --- a/ggml.h +++ b/ggml.h @@ -445,6 +445,12 @@ extern "C" { GGML_OBJECT_WORK_BUFFER }; + enum ggml_log_level { + GGML_LOG_LEVEL_ERROR = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4 + }; + // ggml object struct ggml_object { size_t offs; @@ -1691,6 +1697,7 @@ extern "C" { }; typedef void (*ggml_opt_callback)(void * data, float * sched); + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); // optimization parameters // diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e0e0dbcbbe840..598cf8e594aa8 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -32,7 +32,7 @@ KEY_GENERAL_DESCRIPTION = "general.description" KEY_GENERAL_LICENSE = "general.license" KEY_GENERAL_SOURCE_URL = "general.source.url" -KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository" KEY_GENERAL_FILE_TYPE = "general.file_type" # LLM diff --git a/llama.cpp b/llama.cpp index 346636501ce15..6e23a0772325d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -92,12 +92,12 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal 
(llama_log_level level, const char* format, ...); -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); +static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) // // helpers @@ -221,16 +221,16 @@ enum llm_kv { }; static std::map LLM_KV_NAMES = { - { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, - { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, - { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, - { LLM_KV_GENERAL_NAME, "general.name" }, - { LLM_KV_GENERAL_AUTHOR, "general.author" }, - { LLM_KV_GENERAL_URL, "general.url" }, - { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, - { LLM_KV_GENERAL_LICENSE, "general.license" }, - { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" }, - { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" }, + { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, + { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, + { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_NAME, "general.name" }, + { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_URL, "general.url" }, + { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, + { LLM_KV_GENERAL_LICENSE, "general.license" }, + { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, + { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, @@ -904,7 +904,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to struct llama_state { // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; + ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; @@ -6366,6 +6366,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return NULL; } + ggml_metal_log_set_callback(llama_log_callback_default, NULL); ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } @@ -7199,12 +7200,12 @@ const std::vector> & llama_internal return ctx->model.tensors_by_name; } -void llama_log_set(llama_log_callback log_callback, void * user_data) { +void llama_log_set(ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; } -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { +static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -7221,14 +7222,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_ va_end(args_copy); } -static void llama_log_internal(llama_log_level level, const char * format, ...) { +static void llama_log_internal(ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); llama_log_internal_v(level, format, args); va_end(args); } -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr); diff --git a/llama.h b/llama.h index 369be048c0012..350268b9a94aa 100644 --- a/llama.h +++ b/llama.h @@ -62,12 +62,6 @@ extern "C" { typedef int llama_token; - enum llama_log_level { - LLAMA_LOG_LEVEL_ERROR = 2, - LLAMA_LOG_LEVEL_WARN = 3, - LLAMA_LOG_LEVEL_INFO = 4 - }; - enum llama_vocab_type { LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding @@ -151,13 +145,6 @@ extern "C" { bool embedding; // embedding mode only }; - // Signature for logging events - // Note that text includes the new line character at the end for most events. - // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it - // if it exists. - // It might not exist for progress report where '.' is output repeatedly. - typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); - // model quantization parameters typedef struct llama_model_quantize_params { int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() @@ -526,7 +513,7 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake index e33f3349a435d..c86ab43790c75 100644 --- a/scripts/build-info.cmake +++ b/scripts/build-info.cmake @@ -8,16 +8,12 @@ set(BUILD_TARGET "unknown") # Look for git find_package(Git) if(NOT Git_FOUND) - execute_process( - COMMAND which git - OUTPUT_VARIABLE GIT_EXECUTABLE - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(NOT GIT_EXECUTABLE STREQUAL "") + find_program(GIT_EXECUTABLE NAMES git git.exe) + if(GIT_EXECUTABLE) set(Git_FOUND TRUE) - message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") + message(STATUS "Found Git: ${GIT_EXECUTABLE}") else() - message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.") + message(WARNING "Git not found. 
Build info will not be accurate.")
   endif()
 endif()
@@ -28,43 +24,32 @@ if(Git_FOUND)
   execute_process(
     COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     OUTPUT_VARIABLE HEAD
     OUTPUT_STRIP_TRAILING_WHITESPACE
-    RESULT_VARIABLE GIT_HEAD_RESULT
   )
   execute_process(
     COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     OUTPUT_VARIABLE COUNT
     OUTPUT_STRIP_TRAILING_WHITESPACE
-    RESULT_VARIABLE GIT_COUNT_RESULT
   )
-  if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
-    set(BUILD_COMMIT ${HEAD})
-    set(BUILD_NUMBER ${COUNT})
-  endif()
-endif()
-
-if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
   set(BUILD_COMMIT ${HEAD})
   set(BUILD_NUMBER ${COUNT})
 endif()
-execute_process(
-  COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
-  OUTPUT_VARIABLE OUT
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  RESULT_VARIABLE RES
-)
-if (RES EQUAL 0)
+if(MSVC)
+  set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+  set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+  execute_process(
+    COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+    OUTPUT_VARIABLE OUT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
   set(BUILD_COMPILER ${OUT})
-endif()
-
-execute_process(
-  COMMAND ${CMAKE_C_COMPILER} -dumpmachine
-  OUTPUT_VARIABLE OUT
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  RESULT_VARIABLE RES
-)
-if (RES EQUAL 0)
+  execute_process(
+    COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+    OUTPUT_VARIABLE OUT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
   set(BUILD_TARGET ${OUT})
 endif()
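Note (illustrative, not part of the patch): with the logging types unified under ggml, a client hooks logs through the `ggml_log_callback` signature. A minimal sketch, assuming only the `ggml_log_level` enum, the callback typedef, and `llama_log_set()` shown in this diff; `my_log_callback` is a hypothetical name:

```c
// Forward llama.cpp log messages to a custom sink, filtering out INFO noise.
#include <stdio.h>
#include "llama.h"   // pulls in ggml.h, which defines ggml_log_level/ggml_log_callback

static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * sink = (FILE *) user_data;
    // GGML_LOG_LEVEL_ERROR = 2, GGML_LOG_LEVEL_WARN = 3, GGML_LOG_LEVEL_INFO = 4
    if (level <= GGML_LOG_LEVEL_WARN) {
        fputs(text, sink);   // text usually already ends with '\n'
    }
}

int main(void) {
    llama_log_set(my_log_callback, stderr);
    // ... load a model, create a context, etc.
    return 0;
}
```

The same callback type is what `ggml_metal_log_set_callback()` accepts, which is how `llama_new_context_with_model()` now wires the Metal backend's messages into the same logging path.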