SERVER-19944 Improve text index v3 performance

adamchel · Aug 14, 2015 · a0bbce2
1 parent 3211eea
commit a0bbce2
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 32 deletions.
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -62,7 +62,7 @@ UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
 void UnicodeFTSTokenizer::reset(StringData document, Options options) {
     _options = options;
     _pos = 0;
-    _document = unicode::String(document);
+    _document.resetData(document);
 
     // Skip any leading delimiters (and handle the case where the document is entirely delimiters).
     _skipDelimiters();
@@ -81,29 +81,30 @@ bool UnicodeFTSTokenizer::moveNext() {
                (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
             ++_pos;
         }
-        unicode::String token = _document.substr(start, _pos - start);
+        _document.substrToBuf(start, _pos - start, _tokenBuf);
 
         // Skip the delimiters before the next token.
         _skipDelimiters();
 
         // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
         // but with diacritics not removed to check against the stop word list.
-        unicode::String word = token.toLower(_caseFoldMode);
+        _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf);
 
-        if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) {
+        if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) {
             continue;
         }
 
         if (_options & kGenerateCaseSensitiveTokens) {
-            word = token;
+            _tokenBuf.copyToBuf(_wordBuf);
         }
 
         // The stemmer is diacritic sensitive, so stem the word before removing diacritics.
-        _stem = _stemmer.stem(word.toString());
+        _stem = _stemmer.stem(_wordBuf.toString());
 
         if (!(_options & kGenerateDiacriticSensitiveTokens)) {
-            token.resetData(_stem);
-            _stem = token.removeDiacritics().toString();
+            _tokenBuf.resetData(_stem);
+            _tokenBuf.removeDiacriticsToBuf(_wordBuf);
+            _stem = _wordBuf.toString();
         }
 
         return true;

diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -83,6 +83,9 @@ class UnicodeFTSTokenizer final : public FTSTokenizer {
     unicode::String _document;
     size_t _pos;
 
+    unicode::String _tokenBuf;
+    unicode::String _wordBuf;
+
     Options _options;
 
     std::string _stem;

diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -65,6 +65,13 @@ def generate(unicode_proplist_file, target):
         return false;
     }
 
+    // Most characters are latin letters, so filter those out first.
+    if (codepoint >= 'A' && codepoint <= 'Z') {
+        return false;
+    } else if (codepoint >= 'a' && codepoint <= 'z') {
+        return false;
+    }
+
     switch (codepoint) {\n""")
 
     for delim in sorted(delim_codepoints):

diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
@@ -42,10 +42,16 @@ using linenoise_utf8::copyString8to32;
 using std::u32string;
 
 String::String(const StringData utf8_src) {
+    // Pre-reserve both buffers at 4x the UTF-8 byte length to limit later reallocations.
+    _outputBuf.reserve(utf8_src.size() * 4);
+    _data.reserve(utf8_src.size() * 4);  // NOTE(review): counted in codepoints here -- confirm 4x.
+
+    // Convert the UTF-8 input into the UTF-32 _data via setData().
     setData(utf8_src);
 }
 
 void String::resetData(const StringData utf8_src) {
+    // Re-convert utf8_src to UTF-32 in _data; setData() also marks the cached UTF-8 output stale.
     setData(utf8_src);
 }
 
@@ -70,20 +76,28 @@ void String::setData(const StringData utf8_src) {
 
     // Resize _data so it is only as big as what it contains.
     _data.resize(resultSize);
+    _needsOutputConversion = true;
 }
 
-String::String(u32string&& src) : _data(std::move(src)) {}
-
-std::string String::toString() const {
-    // output is the target, resize it so that it's guaranteed to fit all of the input characters,
-    // plus a null character if there isn't one.
-    std::string output(_data.size() * 4 + 1, '\0');
-    size_t resultSize =
-        copyString32to8(reinterpret_cast<unsigned char*>(&output[0]), &_data[0], output.size());
+String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) {
+    // src has been moved from by now, so size both reserves from _data (4x, as elsewhere).
+    _outputBuf.reserve(_data.size() * 4);
+    _data.reserve(_data.size() * 4);
+}
 
-    // Resize output so it is only as large as what it contains.
-    output.resize(resultSize);
-    return output;
+std::string String::toString() {
+    // Re-encode to UTF-8 only when _data changed since the last call; otherwise reuse _outputBuf.
+    // The target is sized for 4 bytes per codepoint, plus a null character if there isn't one.
+    if (_needsOutputConversion) {
+        _outputBuf.resize(_data.size() * 4 + 1);
+        size_t resultSize = copyString32to8(
+            reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size());
+
+        // Shrink _outputBuf so it is only as large as what it contains.
+        _outputBuf.resize(resultSize);
+        _needsOutputConversion = false;
+    }
+    return _outputBuf;  // Returned by value, so each call still copies the cached string.
 }
 
 size_t String::size() const {
@@ -95,30 +109,61 @@ const char32_t& String::operator[](int i) const {
 }
 
 String String::substr(size_t pos, size_t len) const {
-    return String(_data.substr(pos, len));
+    unicode::String buf;
+    substrToBuf(pos, len, buf);
+    return buf;
 }
 
 String String::toLower(CaseFoldMode mode) const {
-    u32string newdata(_data.size(), 0);
+    unicode::String buf;
+    toLowerToBuf(mode, buf);
+    return buf;
+}
+
+String String::removeDiacritics() const {
+    unicode::String buf;  // Fresh result object, filled by the ToBuf variant and returned by value.
+    removeDiacriticsToBuf(buf);
+    return buf;
+}
+
+void String::copyToBuf(String& buffer) const {
+    buffer._data = _data;
+    buffer._data.resize(_data.size());
     auto index = 0;
     for (auto codepoint : _data) {
-        newdata[index++] = codepointToLower(codepoint, mode);
+        buffer._data[index++] = codepoint;
     }
+    buffer._needsOutputConversion = true;
+}
 
-    return String(std::move(newdata));
+void String::substrToBuf(size_t pos, size_t len, String& buffer) const {  // pos/len unchecked.
+    buffer._data.resize(len + 1);  // len codepoints plus one slot for the terminator below.
+    for (size_t index = 0, src_pos = pos; index < len;) {
+        buffer._data[index++] = _data[src_pos++];
+    }
+    buffer._data[len] = '\0';  // NOTE(review): stored in _data, so _data.size() is len + 1.
+    buffer._needsOutputConversion = true;  // Any UTF-8 cached in buffer is now stale.
 }
 
-String String::removeDiacritics() const {
-    u32string newdata(_data.size(), 0);
+void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
+    buffer._data.resize(_data.size());  // One output codepoint per input codepoint.
+    auto index = 0;
+    for (auto codepoint : _data) {
+        buffer._data[index++] = codepointToLower(codepoint, mode);
+    }
+    buffer._needsOutputConversion = true;  // Any UTF-8 cached in buffer is now stale.
+}
+
+void String::removeDiacriticsToBuf(String& buffer) const {
+    buffer._data.resize(_data.size());
     auto index = 0;
     for (auto codepoint : _data) {
         if (!codepointIsDiacritic(codepoint)) {
-            newdata[index++] = codepointRemoveDiacritics(codepoint);
+            buffer._data[index++] = codepointRemoveDiacritics(codepoint);
         }
     }
-
-    newdata.resize(index);
-    return String(std::move(newdata));
+    buffer._data.resize(index);
+    buffer._needsOutputConversion = true;
 }
 
 bool String::substrMatch(const String& str,

diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
@@ -67,7 +67,7 @@ class String {
     void resetData(const StringData utf8_src);
 
     /**
-     * Return a lowercased version of the String instance using the Unicode data in u_data.h.
+     * Return a lowercased version of the String instance.
      */
     String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
 
@@ -82,9 +82,30 @@ class String {
     String substr(size_t begin, size_t end) const;
 
     /**
-     * Returns a UTF-8 encoded std::string version of the String instance.
+     * Copies the current String to another String.
      */
-    std::string toString() const;
+    void copyToBuf(String& buffer) const;
+
+    /**
+     * Takes a substring of the current String and puts it in another String.
+     */
+    void substrToBuf(size_t pos, size_t len, String& buffer) const;
+
+    /**
+     * Lowercases the current String and stores the result in another String.
+     */
+    void toLowerToBuf(CaseFoldMode mode, String& buffer) const;
+
+    /**
+     * Removes diacritics from the current String and stores the result in another String.
+     */
+    void removeDiacriticsToBuf(String& buffer) const;
+
+    /**
+     * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion
+     * stored in the output buffer when possible.
+     */
+    std::string toString();
 
     /**
      * Returns the number of Unicode codepoints in the String.
@@ -143,6 +164,17 @@ class String {
      * The underlying UTF-32 data.
      */
     std::u32string _data;
+
+    /**
+     * A buffer for storing the result of the UTF-32 to UTF-8 conversion.
+     */
+    std::string _outputBuf;
+
+    /**
+     * A bool flag that is set to true when toString() will require that the UTF-32 to UTF-8
+     * conversion be applied again.
+     */
+    bool _needsOutputConversion;
 };
 
 }  // namespace unicode