Allow custom characters in token_chars of ngram tokenizers (#49250)
Currently the `token_chars` setting in both `edgeNGram` and `ngram` tokenizers
only allows a list of predefined character classes, which might not fit
every use case. For example, including the underscore "_" in a token currently
requires the `punctuation` class, which pulls in many other characters.
This change adds an additional "custom" option to the `token_chars` setting.
It requires the additional `custom_token_chars` setting to be present; its
value is interpreted as the set of characters to include in a token.
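
As a quick illustration, a minimal sketch of the new option (the setting names are the real ones from this change; the builder calls mirror the tests added below):

import org.elasticsearch.common.settings.Settings;

// Keep letters plus '_' and '-' inside tokens; any other character splits tokens.
// Listing "custom" in token_chars requires custom_token_chars to be present.
Settings settings = Settings.builder()
    .put("min_gram", 2)
    .put("max_gram", 3)
    .putList("token_chars", "letter", "custom")
    .put("custom_token_chars", "_-")
    .build();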

Closes #25894
Christoph Büscher authored Nov 20, 2019
1 parent 87bb19a commit ed86750
Showing 6 changed files with 91 additions and 16 deletions.
@@ -96,6 +96,14 @@ Character classes may be any of the following:
* `whitespace` -- for example `" "` or `"\n"`
* `punctuation` -- for example `!` or `"`
* `symbol` -- for example `$` or `√`
* `custom` -- custom characters which need to be set using the
`custom_token_chars` setting.

`custom_token_chars`::

Custom characters that should be treated as part of a token. For example,
setting this to `+-_` will make the tokenizer treat the plus, minus, and
underscore signs as part of a token.
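
For example, a sketch mirroring the `edge_ngram` test added in this commit
(`min_gram`: 2, `max_gram`: 3, `token_chars`: `["letter", "custom"]`,
`custom_token_chars`: `"_-"`; `tokenizer` is assumed to be built from those
settings): `-` and `_` are kept inside tokens, while `=` remains a separator.

tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
// edge n-grams produced: Ab, Abc, -g, -gh, _j, _jk, lm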

[[max-gram-limits]]
=== Limitations of the `max_gram` parameter
8 changes: 8 additions & 0 deletions docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
@@ -190,6 +190,14 @@ Character classes may be any of the following:
* `whitespace` -- for example `" "` or `"\n"`
* `punctuation` -- for example `!` or `"`
* `symbol` -- for example `$` or `√`
* `custom` -- custom characters which need to be set using the
`custom_token_chars` setting.

`custom_token_chars`::

Custom characters that should be treated as part of a token. For example,
setting this to `+-_` will make the tokenizer treat the plus, minus, and
underscore signs as part of a token.
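
For example, a sketch mirroring the `ngram` test added in this commit
(`min_gram`: 2, `max_gram`: 3, `token_chars`: `["letter", "custom"]`,
`custom_token_chars`: `"_-"`; `tokenizer` is assumed to be built from those
settings). Unlike `edge_ngram`, all n-grams are produced, not only those
anchored at the start of a token:

tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
// n-grams produced: Ab, Abc, bc, -g, -gh, gh, _j, _jk, jk, lm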

TIP: It usually makes sense to set `min_gram` and `max_gram` to the same
value. The smaller the length, the more documents will match but the lower
@@ -39,7 +39,7 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
super(indexSettings, settings, name);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
this.matcher = parseTokenChars(settings);
}

@Override
@@ -28,10 +28,14 @@

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.util.Collections.unmodifiableMap;

@@ -67,7 +71,8 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
MATCHERS = unmodifiableMap(matchers);
}

static CharMatcher parseTokenChars(List<String> characterClasses) {
static CharMatcher parseTokenChars(Settings settings) {
List<String> characterClasses = settings.getAsList("token_chars");
if (characterClasses == null || characterClasses.isEmpty()) {
return null;
}
@@ -76,7 +81,23 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
CharMatcher matcher = MATCHERS.get(characterClass);
if (matcher == null) {
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
if (characterClass.equals("custom") == false) {
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + Stream
.of(MATCHERS.keySet(), Collections.singleton("custom")).flatMap(x -> x.stream()).collect(Collectors.toSet()));
}
String customCharacters = settings.get("custom_token_chars");
if (customCharacters == null) {
throw new IllegalArgumentException("Token type: 'custom' requires setting `custom_token_chars`");
}
final Set<Integer> customCharSet = customCharacters.chars().boxed().collect(Collectors.toSet());
matcher = new CharMatcher() {

@Override
public boolean isTokenChar(int c) {
return customCharSet.contains(c);
}

};
}
builder.or(matcher);
}
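
In isolation, the new "custom" branch reduces to a code-point membership test; a minimal sketch, assuming `custom_token_chars` is set to `"_-"`:

import java.util.Set;
import java.util.stream.Collectors;

Set<Integer> customCharSet = "_-".chars().boxed().collect(Collectors.toSet());
boolean keepsUnderscore = customCharSet.contains((int) '_'); // true  -> '_' stays inside a token
boolean keepsEquals = customCharSet.contains((int) '=');     // false -> '=' splits tokens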
@@ -95,7 +116,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
}
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
this.matcher = parseTokenChars(settings);
}

@Override
@@ -19,11 +19,13 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -33,6 +35,7 @@
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;

public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
@@ -95,4 +98,17 @@ public void testPreConfiguredTokenizer() throws IOException {

}

public void testCustomTokenChars() throws IOException {
final Index index = new Index("test", "_na_");
final String name = "engr";
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();

final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name,
settings).create();
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm"});
}

}
@@ -46,25 +46,34 @@ public void testParseTokenChars() {
final Index index = new Index("test", "_na_");
final String name = "ngr";
final Settings indexSettings = newAnalysisSettingsBuilder().build();
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
final IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
.put("token_chars", tokenChars).build();
try {
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
fail();
} catch (IllegalArgumentException expected) {
// OK
}
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
// no exception
}
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
{
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
.put("token_chars", tokenChars).build();
indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);

.put("token_chars", "DIRECTIONALITY_UNDEFINED").build();
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
assertEquals("Unknown token type: 'directionality_undefined'", ex.getMessage().substring(0, 46));
assertTrue(ex.getMessage().contains("custom"));
}
{
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
.put("custom_token_chars", "_-").build();
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
// no exception
}
{
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
.build();
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
assertEquals("Token type: 'custom' requires setting `custom_token_chars`", ex.getMessage());
}
}

public void testNoTokenChars() throws IOException {
@@ -80,6 +89,19 @@ public void testNoTokenChars() throws IOException {
assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
}

public void testCustomTokenChars() throws IOException {
final Index index = new Index("test", "_na_");
final String name = "ngr";
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();

final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
.create();
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm"});
}

public void testPreTokenization() throws IOException {
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
final Index index = new Index("test", "_na_");