diff --git a/src/unicode.cpp b/src/unicode.cpp
index 89180da4152da..a32ae6d0824f2 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -618,7 +618,14 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }