From 99e5322a7985696bd17405dfd18b069a557f0c75 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 11 Feb 2024 15:19:01 +0000 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Georgi Gerganov --- whisper.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index eb78a91ecce..729c0da2566 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2947,7 +2947,6 @@ static std::vector bpe_encode(const whisper_vocab & vocab, co int min_idx = -1; int min_rank = -1; - // iterate over all pairs and find the pair we want to merge the most for (int pos=0; pos < tokens.size() - 1; pos++) { auto query = vocab.id_to_token.at(tokens[pos]) + vocab.id_to_token.at(tokens[pos+1]); @@ -2975,8 +2974,6 @@ static std::vector bpe_encode(const whisper_vocab & vocab, co return tokens; } -// This is not perfect -// Occasionally, it produces different results compared to OpenAI's tiktoken static std::vector bpe_gpt2_preprocess(const std::string & text) { std::vector bpe_words; std::vector bpe_encoded_words; @@ -2995,15 +2992,17 @@ static std::vector bpe_gpt2_preprocess(const std::string & text) { bpe_encoded_words.reserve(text.size()); auto cps = codepoints_from_utf8(text); - for (size_t i = 0; i < cps.size(); ++i) + for (size_t i = 0; i < cps.size(); ++i) { text_utf.emplace_back(codepoint_to_utf8(cps[i])); + } for (int i = 0; i < (int)text_utf.size(); i++) { const std::string & utf_char = text_utf[i]; bool split_condition = false; int bytes_remain = text_utf.size() - i; + // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; + const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; // handling contractions @@ -3136,7 +3135,7 @@ static std::vector tokenize(const whisper_vocab & vocab, cons std::vector tokens; - for (const auto& word : words) { + for (const auto & word : words) { auto word_tokens = bpe_encode(vocab, word); tokens.insert(tokens.end(), word_tokens.begin(), word_tokens.end()); }