[pull] master from ggerganov:master #156

Closed · wants to merge 14 commits
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
@@ -904,6 +904,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install
id: depends
186 changes: 185 additions & 1 deletion AUTHORS

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions common/arg.cpp
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename);
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
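Aside: a minimal sketch of the cache filename derivation introduced in this hunk, using only standard C++ (the repo and file names are placeholders, and `std::replace` stands in for llama.cpp's `string_replace_all` helper):

```cpp
// Illustration only: how the new cache file name is built so that identical file names
// from different repos (or different subdirectories) no longer collide.
#include <algorithm>
#include <iostream>
#include <string>

static std::string cache_filename(const std::string & hf_repo, const std::string & hf_file) {
    // prefix with the repo, then flatten any path separators
    std::string filename = hf_repo + "_" + hf_file;
    std::replace(filename.begin(), filename.end(), '/', '_');
    return filename;
}

int main() {
    // placeholder repo/file names, not taken from this PR
    std::cout << cache_filename("some-org/some-model-GGUF", "sub/dir/model-Q4_K_M.gguf") << "\n";
    // prints: some-org_some-model-GGUF_sub_dir_model-Q4_K_M.gguf
}
```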
@@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
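Aside: both reworded warnings above are gated on `llama_supports_gpu_offload()`; a hedged sketch of that pattern (`effective_gpu_layers` is a hypothetical helper, not part of llama.cpp):

```cpp
// Sketch only: mirror the warning added above; clamping to 0 is this example's choice,
// the real --gpu-layers value is simply ignored downstream when no GPU backend is usable.
#include <cstdio>
#include "llama.h"

static int effective_gpu_layers(int requested) {
    if (!llama_supports_gpu_offload()) {
        fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
        return 0;
    }
    return requested;
}
```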
54 changes: 27 additions & 27 deletions common/common.cpp
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = nullptr;

if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
}

struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}

if (!common_download_file(model_url, path_model, hf_token)) {
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}

@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}

@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}

if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
}
}

return llama_load_model_from_file(path_model, params);
return llama_load_model_from_file(local_path.c_str(), params);
}

struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
model_url += remote_path;

return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
return common_load_model_from_url(model_url, local_path, hf_token, params);
}

#else

struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}

struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
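Aside: with the new std::string plumbing, the Hugging Face path still resolves to a plain https URL exactly as built in the hunk above; a standalone sketch of the resulting shape (repo and file names are placeholders):

```cpp
// Sketch only: the URL shape built inside common_load_model_from_hf, reproduced with std::string.
#include <iostream>
#include <string>

int main() {
    const std::string repo        = "some-org/some-model-GGUF";  // placeholder
    const std::string remote_path = "sub/dir/model-Q4_K_M.gguf"; // placeholder

    const std::string model_url = "https://huggingface.co/" + repo + "/resolve/main/" + remote_path;
    std::cout << model_url << "\n";
    // https://huggingface.co/some-org/some-model-GGUF/resolve/main/sub/dir/model-Q4_K_M.gguf
}
```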
13 changes: 11 additions & 2 deletions common/common.h
@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
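Aside: a hedged sketch of calling the new std::string overload declared above; the repo, file, and local path here are placeholders and error handling is omitted:

```cpp
// Sketch only: loading a model via the new common_load_model_from_hf signature.
// Repo, remote path, and local path are placeholders; an empty token is assumed
// to be sufficient for public repos.
#include "common.h"
#include "llama.h"

static llama_model * load_example_model() {
    llama_model_params mparams = llama_model_default_params();
    return common_load_model_from_hf(
        "some-org/some-model-GGUF",   // repo
        "sub/dir/model-Q4_K_M.gguf",  // remote_path inside the repo
        "/tmp/model-Q4_K_M.gguf",     // local_path to download to
        "",                           // hf_token
        mparams);
}
```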
4 changes: 2 additions & 2 deletions docs/android.md
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:

```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```

Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

4 changes: 4 additions & 0 deletions docs/backend/CANN.md
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

## News

- 2024.11
- Support F16 and F32 data type model for Ascend 310P NPU.
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
@@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU

**Verified devices**

| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |

*Notes:*

2 changes: 1 addition & 1 deletion examples/server/tests/requirements.txt
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub~=0.23.2
numpy~=1.26.4
openai~=1.30.3
openai~=1.55.3
prometheus-client~=0.20.0
requests~=2.32.3
20 changes: 2 additions & 18 deletions examples/server/tests/utils.py
@@ -8,7 +8,6 @@
import re
import json
import sys
import threading
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -161,26 +160,12 @@ def start(self, timeout_seconds: int = 10) -> None:
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdout=sys.stdout,
stderr=sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"},
)
server_instances.add(self)

def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b""):
print(line.decode("utf-8"), end="", file=out_stream)

thread_stdout = threading.Thread(
target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
)
thread_stdout.start()

thread_stderr = threading.Thread(
target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
)
thread_stderr.start()

print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")

# wait for server to start
@@ -319,7 +304,6 @@ def jina_reranker_tiny() -> ServerProcess:
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
server.model_alias = "jina-reranker"
server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
server.n_ctx = 512
server.n_batch = 512
server.n_slots = 1
2 changes: 1 addition & 1 deletion examples/simple/README.md
@@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.

```bash
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"

...

1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
@@ -91,6 +91,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
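Aside: the new `ggml_cpu_has_dotprod()` sits alongside the existing feature probes; a minimal sketch that just prints a few of them (the values depend on the host CPU):

```cpp
// Sketch only: querying CPU feature flags from ggml-cpu.h, including the newly added dotprod probe.
#include <cstdio>
#include "ggml-cpu.h"

int main() {
    printf("NEON:        %d\n", ggml_cpu_has_neon());
    printf("ARM FMA:     %d\n", ggml_cpu_has_arm_fma());
    printf("dotprod:     %d\n", ggml_cpu_has_dotprod());
    printf("int8 matmul: %d\n", ggml_cpu_has_matmul_int8());
    printf("SVE:         %d\n", ggml_cpu_has_sve());
}
```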
3 changes: 3 additions & 0 deletions ggml/include/ggml.h
@@ -389,6 +389,9 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
};

7 changes: 4 additions & 3 deletions ggml/src/ggml-cann/CMakeLists.txt
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
else()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif()

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)

if (CANN_INSTALL_DIR)
# Only Support Linux.