Skip to content

Commit

Permalink
SERVER-19944 Improve text index v3 performance
Browse files Browse the repository at this point in the history
  • Loading branch information
adamchel committed Aug 14, 2015
1 parent 3211eea commit a0bbce2
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 32 deletions.
17 changes: 9 additions & 8 deletions src/mongo/db/fts/fts_unicode_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
void UnicodeFTSTokenizer::reset(StringData document, Options options) {
_options = options;
_pos = 0;
_document = unicode::String(document);
_document.resetData(document);

// Skip any leading delimiters (and handle the case where the document is entirely delimiters).
_skipDelimiters();
Expand All @@ -81,29 +81,30 @@ bool UnicodeFTSTokenizer::moveNext() {
(!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
++_pos;
}
unicode::String token = _document.substr(start, _pos - start);
_document.substrToBuf(start, _pos - start, _tokenBuf);

// Skip the delimiters before the next token.
_skipDelimiters();

// Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
// but with diacritics not removed to check against the stop word list.
unicode::String word = token.toLower(_caseFoldMode);
_tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf);

if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) {
if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) {
continue;
}

if (_options & kGenerateCaseSensitiveTokens) {
word = token;
_tokenBuf.copyToBuf(_wordBuf);
}

// The stemmer is diacritic sensitive, so stem the word before removing diacritics.
_stem = _stemmer.stem(word.toString());
_stem = _stemmer.stem(_wordBuf.toString());

if (!(_options & kGenerateDiacriticSensitiveTokens)) {
token.resetData(_stem);
_stem = token.removeDiacritics().toString();
_tokenBuf.resetData(_stem);
_tokenBuf.removeDiacriticsToBuf(_wordBuf);
_stem = _wordBuf.toString();
}

return true;
Expand Down
3 changes: 3 additions & 0 deletions src/mongo/db/fts/fts_unicode_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ class UnicodeFTSTokenizer final : public FTSTokenizer {
unicode::String _document;
size_t _pos;

unicode::String _tokenBuf;
unicode::String _wordBuf;

Options _options;

std::string _stem;
Expand Down
7 changes: 7 additions & 0 deletions src/mongo/db/fts/unicode/gen_delimiter_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ def generate(unicode_proplist_file, target):
return false;
}
// Most characters are latin letters, so filter those out first.
if (codepoint >= 'A' && codepoint <= 'Z') {
return false;
} else if (codepoint >= 'a' && codepoint <= 'z') {
return false;
}
switch (codepoint) {\n""")

for delim in sorted(delim_codepoints):
Expand Down
87 changes: 66 additions & 21 deletions src/mongo/db/fts/unicode/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,16 @@ using linenoise_utf8::copyString8to32;
using std::u32string;

/**
 * Builds a String from UTF-8 encoded input.
 *
 * Capacity for both the UTF-32 data and the UTF-8 output buffer is requested up front, before
 * the input is converted by setData().
 */
String::String(const StringData utf8_src) {
    // Four slots per input byte for each buffer, requested once so that later growth does not
    // cause repeated reallocations.
    const auto reservedCapacity = utf8_src.size() * 4;
    _data.reserve(reservedCapacity);
    _outputBuf.reserve(reservedCapacity);

    // Convert the UTF-8 input into the UTF-32 representation.
    setData(utf8_src);
}

/**
 * Replaces the contents of this String with the given UTF-8 input by delegating to setData(),
 * so an existing String object can be reused instead of constructing a new one.
 *
 * @param utf8_src the UTF-8 encoded text to load into this String.
 */
void String::resetData(const StringData utf8_src) {
// Convert UTF-8 input to UTF-32 data.
setData(utf8_src);
}

Expand All @@ -70,20 +76,28 @@ void String::setData(const StringData utf8_src) {

// Resize _data so it is only as big as what it contains.
_data.resize(resultSize);
_needsOutputConversion = true;
}

String::String(u32string&& src) : _data(std::move(src)) {}

std::string String::toString() const {
// output is the target, resize it so that it's guaranteed to fit all of the input characters,
// plus a null character if there isn't one.
std::string output(_data.size() * 4 + 1, '\0');
size_t resultSize =
copyString32to8(reinterpret_cast<unsigned char*>(&output[0]), &_data[0], output.size());
/**
 * Builds a String by taking ownership of already-decoded UTF-32 data.
 *
 * The cached UTF-8 output starts out stale, so the first toString() call performs the
 * UTF-32 to UTF-8 conversion.
 *
 * @param src the UTF-32 data to adopt; it is moved from and must not be relied on afterwards.
 */
String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) {
    // 'src' has already been moved into '_data' by the member initializer, so its size is no
    // longer meaningful. Size the reservations from '_data', which now owns the characters.
    const auto reservedCapacity = _data.size() * 4;

    // Reserve space for underlying buffers to prevent excessive reallocations.
    _outputBuf.reserve(reservedCapacity);
    _data.reserve(reservedCapacity);
}

// Resize output so it is only as large as what it contains.
output.resize(resultSize);
return output;
/**
 * Returns the UTF-8 encoding of this String.
 *
 * The conversion result is kept in _outputBuf and is only recomputed when
 * _needsOutputConversion is set.
 */
std::string String::toString() {
    // Fast path: the buffered conversion is still current.
    if (!_needsOutputConversion) {
        return _outputBuf;
    }

    // Grow the target so that it is guaranteed to fit all of the input characters, plus a null
    // character if there isn't one.
    const size_t maxBytes = _data.size() * 4 + 1;
    _outputBuf.resize(maxBytes);

    auto* const dst = reinterpret_cast<unsigned char*>(&_outputBuf[0]);
    const size_t bytesWritten = copyString32to8(dst, &_data[0], _outputBuf.size());

    // Shrink the buffer so it is only as large as what was actually written.
    _outputBuf.resize(bytesWritten);
    _needsOutputConversion = false;
    return _outputBuf;
}

size_t String::size() const {
Expand All @@ -95,30 +109,61 @@ const char32_t& String::operator[](int i) const {
}

String String::substr(size_t pos, size_t len) const {
return String(_data.substr(pos, len));
unicode::String buf;
substrToBuf(pos, len, buf);
return buf;
}

String String::toLower(CaseFoldMode mode) const {
u32string newdata(_data.size(), 0);
unicode::String buf;
toLowerToBuf(mode, buf);
return buf;
}

/**
 * Returns a new String holding the result of removeDiacriticsToBuf() applied to this String.
 * This String itself is left unmodified.
 */
String String::removeDiacritics() const {
    String stripped;
    removeDiacriticsToBuf(stripped);
    return stripped;
}

void String::copyToBuf(String& buffer) const {
buffer._data = _data;
buffer._data.resize(_data.size());
auto index = 0;
for (auto codepoint : _data) {
newdata[index++] = codepointToLower(codepoint, mode);
buffer._data[index++] = codepoint;
}
buffer._needsOutputConversion = true;
}

return String(std::move(newdata));
/**
 * Stores the substring of this String that starts at codepoint index 'pos' and spans at most
 * 'len' codepoints into 'buffer', replacing whatever 'buffer' held before.
 *
 * @param pos index of the first codepoint to copy.
 * @param len maximum number of codepoints to copy; clamped to what is available after 'pos'.
 * @param buffer destination String; its cached UTF-8 output is marked stale.
 * @throws std::out_of_range if 'pos' is greater than the number of codepoints in this String.
 */
void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
    // assign() has the same range semantics as std::u32string::substr(): it clamps 'len' and
    // rejects an out-of-range 'pos' instead of reading past the end of '_data'. It also leaves
    // 'buffer' with exactly the copied codepoints; no explicit trailing '\0' element is stored,
    // because the string's own terminator already follows the data.
    buffer._data.assign(_data, pos, len);
    buffer._needsOutputConversion = true;
}

String String::removeDiacritics() const {
u32string newdata(_data.size(), 0);
/**
 * Writes a lowercased copy of this String into 'buffer'.
 *
 * Each codepoint is mapped through codepointToLower() using the given case folding mode, and
 * 'buffer' is flagged so that its UTF-8 output is regenerated on the next toString() call.
 */
void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
    const size_t count = _data.size();
    buffer._data.resize(count);

    // One output codepoint per input codepoint, written at the same position.
    for (size_t i = 0; i < count; ++i) {
        buffer._data[i] = codepointToLower(_data[i], mode);
    }

    buffer._needsOutputConversion = true;
}

void String::removeDiacriticsToBuf(String& buffer) const {
buffer._data.resize(_data.size());
auto index = 0;
for (auto codepoint : _data) {
if (!codepointIsDiacritic(codepoint)) {
newdata[index++] = codepointRemoveDiacritics(codepoint);
buffer._data[index++] = codepointRemoveDiacritics(codepoint);
}
}

newdata.resize(index);
return String(std::move(newdata));
buffer._data.resize(index);
buffer._needsOutputConversion = true;
}

bool String::substrMatch(const String& str,
Expand Down
38 changes: 35 additions & 3 deletions src/mongo/db/fts/unicode/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class String {
void resetData(const StringData utf8_src);

/**
* Return a lowercased version of the String instance using the Unicode data in u_data.h.
* Return a lowercased version of the String instance.
*/
String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;

Expand All @@ -82,9 +82,30 @@ class String {
String substr(size_t begin, size_t end) const;

/**
* Returns a UTF-8 encoded std::string version of the String instance.
* Copies the current String to another String.
*/
std::string toString() const;
void copyToBuf(String& buffer) const;

/**
* Takes a substring of the current String and puts it in another String.
*/
void substrToBuf(size_t pos, size_t len, String& buffer) const;

/**
* Lowercases the current String and stores the result in another String.
*/
void toLowerToBuf(CaseFoldMode mode, String& buffer) const;

/**
* Removes diacritics from the current String and stores the result in another String.
*/
void removeDiacriticsToBuf(String& buffer) const;

/**
* Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion
* stored in the output buffer when possible.
*/
std::string toString();

/**
* Returns the number of Unicode codepoints in the String.
Expand Down Expand Up @@ -143,6 +164,17 @@ class String {
* The underlying UTF-32 data.
*/
std::u32string _data;

/**
* A buffer for storing the result of the UTF-32 to UTF-8 conversion.
*/
std::string _outputBuf;

/**
* A bool flag that is set to true when toString() will require that the UTF-32 to UTF-8
* conversion be applied again.
*/
bool _needsOutputConversion;
};

} // namespace unicode
Expand Down

0 comments on commit a0bbce2

Please sign in to comment.