llama : fix strncpy warning + note token_to_str does not write null
ggerganov committed Aug 16, 2023
commit 5b94b14 (1 parent: a499313)
Showing 6 changed files with 21 additions and 24 deletions.

convert-llama-7b-pth-to-gguf.py (3 additions & 4 deletions)
@@ -132,7 +132,7 @@ def count_model_parts(dir_model: str) -> int:
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i): toktype = 2
if tokenizer.is_control(i): toktype = 3

# TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
@@ -223,7 +223,7 @@ def count_model_parts(dir_model: str) -> int:
sys.exit()

n_dims = len(data.shape)
- data_dtype = data.dtype
+ data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
@@ -261,7 +261,6 @@ def count_model_parts(dir_model: str) -> int:
for name in model_part.keys():
data = model_part[name]

-
old_dtype = data.dtype

# we don't need these
@@ -284,7 +283,7 @@ def count_model_parts(dir_model: str) -> int:
sys.exit()

n_dims = len(data.shape)
- data_dtype = data.dtype
+ data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:

examples/main/main.cpp (1 addition & 5 deletions)
@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
params.interactive = true;
}

- // determine newline token
- auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) {
llama_grammar_free(grammar);

- std::vector<const llama_grammar_element *> grammar_rules(
-     parsed_grammar.c_rules());
+ std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));

examples/quantize/quantize.cpp (4 additions & 4 deletions)
@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
}

// usage:
- // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+ // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
- fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+ fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
- // export as [inp path]/ggml-model-[ftype].bin
- fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+ // export as [inp path]/ggml-model-[ftype].gguf
+ fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
}
else {
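
The rename above is a one-word change, but the logic it sits in is easy to miss in the truncated hunk: when no output file is given, quantize derives one from the input path. A minimal sketch of that derivation, assuming pos comes from fname_inp.find_last_of('/') as in the lines just above the hunk (the helper name below is hypothetical, not from the repository):

#include <string>

// Sketch of the default-output-name logic shown in the hunk above: keep the input's
// directory (everything up to and including the last '/') and append
// "ggml-model-<ftype>.gguf".
static std::string default_output_name(const std::string & fname_inp, const std::string & ftype_str) {
    std::string fpath;
    const size_t pos = fname_inp.find_last_of('/');
    if (pos != std::string::npos) {
        fpath = fname_inp.substr(0, pos + 1); // keep the trailing '/'
    }
    // e.g. input "models/llama/ggml-model-f16.gguf" with ftype_str "q4_0"
    // yields "models/llama/ggml-model-q4_0.gguf"
    return fpath + "ggml-model-" + ftype_str + ".gguf";
}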

examples/save-load-state/save-load-state.cpp (0 additions & 1 deletion)
@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();

lparams.n_ctx = params.n_ctx;
- lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;

llama.cpp (9 additions & 7 deletions)
@@ -4774,32 +4774,34 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}

- int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) {
+ // does not write null-terminator to str
+ int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_n_vocab_from_model(model)) {
if (llama_is_normal_token(model->vocab, token)) {
std::string result = model->vocab.id_to_token[token].tok;
- if(llama_vocab_type(model->vocab) == "spm") {
+ if (llama_vocab_type(model->vocab) == "spm") {
result = llama_unescape_whitespace(result);
}
if (length < (int) result.length()) {
return -result.length();
}
- strncpy(str, result.c_str(), result.length());
+ memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
if (length < 3) {
return -3;
}
- strncpy(str, "\xe2\x96\x85", 4);
+ buf[0] = '\xe2';
+ buf[1] = '\x96';
+ buf[2] = '\x85';
return 3;
} else if (llama_is_control_token(model->vocab, token)) {
;
} else if (llama_is_byte_token(model->vocab, token)) {
if (length < 1) {
return -1;
}
- str[0] = llama_byte_to_char(model->vocab, token);
- str[1] = 0x00;
+ buf[0] = llama_byte_to_char(model->vocab, token);
return 1;
}
}
@@ -4830,7 +4832,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
if (length < (int) result.length()) {
return -result.length();
}
- strncpy(str, result.c_str(), result.length());
+ memcpy(str, result.c_str(), result.length());
return result.length();
}
return 0;
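
For context on the warning named in the commit title: copying exactly result.length() bytes with strncpy tells the compiler the destination may be left without a terminating null, which newer GCC versions flag (most likely as -Wstringop-truncation). Since these token-to-string routines deliberately do not null-terminate, memcpy states that intent without triggering the warning. A minimal sketch of the pattern, not code from the repository (the helper name is hypothetical):

#include <cstring>
#include <string>

// Mirrors the convention used above: return the negated required size if the buffer
// is too small, otherwise the number of bytes written; no '\0' is appended either way.
static int copy_token_text(const std::string & result, char * buf, int length) {
    if (length < (int) result.length()) {
        return -((int) result.length());
    }
    // strncpy(buf, result.c_str(), result.length());  // may warn: output not null-terminated
    memcpy(buf, result.c_str(), result.length());      // same bytes, intent made explicit
    return (int) result.length();
}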

llama.h (4 additions & 3 deletions)
@@ -355,22 +355,23 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

// Token Id -> String. Uses the vocabulary in the provided context
+ // Does not write null terminator to the buffer
LLAMA_API int llama_token_to_str(
const struct llama_context * ctx,
llama_token token,
- char * str,
+ char * buf,
int length);

LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
- char * str,
+ char * buf,
int length);

LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token,
- char * str,
+ char * buf,
int length);
// Special tokens
LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
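
Because none of these functions write a null terminator, callers must use the returned length rather than treating the buffer as a C string. A minimal caller-side sketch (not part of this commit; the wrapper name is hypothetical) that grows the buffer on a negative return and builds a std::string with an explicit length:

#include <string>
#include <vector>
#include "llama.h"

static std::string token_to_string(struct llama_context * ctx, llama_token token) {
    std::vector<char> buf(8);
    int n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
    if (n < 0) {
        // negative return: buffer too small, magnitude is the required size
        buf.resize((size_t) -n);
        n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
    }
    // construct with an explicit length; the buffer is not null-terminated
    return std::string(buf.data(), (size_t) n);
}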
