From fc49c3230aa0c224488f376e819b2532b4a646cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 3 Jan 2025 12:05:16 +0200 Subject: [PATCH 1/2] tokenize : escape the prompt --- examples/tokenize/tokenize.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c97e227242c42..76624c8b000ad 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -198,6 +198,7 @@ int main(int raw_argc, char ** raw_argv) { // variables where to put any arguments we see. bool printing_ids = false; bool no_bos = false; + bool no_escape = false; bool no_parse_special = false; bool disable_logging = false; bool show_token_count = false; @@ -233,6 +234,9 @@ int main(int raw_argc, char ** raw_argv) { else if (arg == "--no-bos") { no_bos = true; } + else if (arg == "--no-escape") { + no_escape = true; + } else if (arg == "--no-parse-special") { no_parse_special = true; } @@ -363,6 +367,11 @@ int main(int raw_argc, char ** raw_argv) { const bool model_wants_add_bos = llama_add_bos_token(model); const bool add_bos = model_wants_add_bos && !no_bos; const bool parse_special = !no_parse_special; + const bool escape = !no_escape; + + if (escape) { + string_process_escapes(prompt); + } std::vector tokens; tokens = common_tokenize(model, prompt, add_bos, parse_special); From c3a473d421f9ccbada569a6f728145f50269b131 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 6 Jan 2025 10:54:11 +0200 Subject: [PATCH 2/2] tokenize : update help --- examples/tokenize/tokenize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 76624c8b000ad..57d9d43124184 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -31,6 +31,7 @@ static void print_usage_information(const char * argv0) { printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); printf(" --stdin read prompt from standard input.\n"); printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-escape do not escape input (such as \\n, \\t, etc.).\n"); printf(" --no-parse-special do not parse control tokens.\n"); printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n"); printf(" --show-count print the total number of tokens.\n");