From fd4a36a9d07391fb75444baf084818348061ce23 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 30 Nov 2018 18:00:08 +0100 Subject: [PATCH 1/9] Add support for inlined user dictionary in Nori This change adds a new option called `user_dictionary_rules` to the Nori tokenizer. It can be used to set additional tokenization rules for the Korean tokenizer directly in the settings (instead of using a file). Closes #35842 --- docs/plugins/analysis-nori.asciidoc | 33 +++++++++++++++++ .../index/analysis/NoriTokenizerFactory.java | 36 +++++++++++++++---- .../index/analysis/NoriAnalysisTests.java | 17 +++++++++ 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 7a6ab21595881..c312e11812973 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -154,6 +154,39 @@ The above `analyze` request returns the following: <1> This is a compound token that spans two positions (`mixed` mode). +`user_dictionary_rules`:: ++ +-- + +You can also inline the rules directly in the tokenizer definition using +the `user_dictionary_rules` option: + +[source,js] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "tokenizer": { + "nori_user_dict": { + "type": "nori_tokenizer", + "decompound_mode": "mixed", + "user_dictionary_rules": ["c++", "C샤프", "세종", "세종시", "세종", "시"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "nori_user_dict" + } + } + } + } + } +} +-------------------------------------------------- + The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters to modify the stream. You can view all these additional attributes with the following request: diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index 9295ed95c3fb8..a575062377d78 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -29,10 +29,14 @@ import java.io.IOException; import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.List; import java.util.Locale; public class NoriTokenizerFactory extends AbstractTokenizerFactory { - private static final String USER_DICT_OPTION = "user_dictionary"; + private static final String USER_DICT_PATH_OPTION = "user_dictionary"; + private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules"; private final UserDictionary userDictionary; private final KoreanTokenizer.DecompoundMode decompoundMode; @@ -44,14 +48,32 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String } public static UserDictionary getUserDictionary(Environment env, Settings settings) { - try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) { - if (reader == null) { + if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) { + throw new ElasticsearchException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + + " with [" + USER_DICT_RULES_OPTION + "]"); + + } + String path = settings.get(USER_DICT_PATH_OPTION); + if (path != null) { + try (Reader rulesReader = Analysis.getReaderFromFile(env, 
settings, USER_DICT_PATH_OPTION)) { + return rulesReader == null ? null : UserDictionary.open(rulesReader); + } catch (IOException e) { + throw new ElasticsearchException("failed to load nori user dictionary", e); + } + } else { + List rulesList = settings.getAsList(USER_DICT_RULES_OPTION, Collections.emptyList(), false); + if (rulesList == null || rulesList.size() == 0) { return null; - } else { - return UserDictionary.open(reader); } - } catch (IOException e) { - throw new ElasticsearchException("failed to load nori user dictionary", e); + StringBuilder sb = new StringBuilder(); + for (String line : rulesList) { + sb.append(line).append(System.lineSeparator()); + } + try (Reader rulesReader = new StringReader(sb.toString())) { + return UserDictionary.open(rulesReader); + } catch (IOException e) { + throw new ElasticsearchException("failed to load nori user dictionary", e); + } } } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index fa5858a7bbbb8..b4f791aed2b53 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -37,6 +37,7 @@ import java.io.StringReader; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; import static org.hamcrest.Matchers.instanceOf; @@ -76,6 +77,22 @@ public void testNoriAnalyzer() throws Exception { } public void testNoriAnalyzerUserDict() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "nori") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시") + .build(); + TestAnalysis analysis = createTestAnalysis(settings); + Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); + try (TokenStream stream = analyzer.tokenStream("", "세종시" )) { + assertTokenStreamContents(stream, new String[] {"세종", "시"}); + } + + try (TokenStream stream = analyzer.tokenStream("", "c++world")) { + assertTokenStreamContents(stream, new String[] {"c++", "world"}); + } + } + + public void testNoriAnalyzerUserDictPath() throws Exception { Settings settings = Settings.builder() .put("index.analysis.analyzer.my_analyzer.type", "nori") .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") From 41babe7b189631e8f1c4b0c47a821cf06545c7ee Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 30 Nov 2018 18:08:27 +0100 Subject: [PATCH 2/9] checkstyle --- .../java/org/elasticsearch/index/analysis/NoriAnalysisTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index b4f791aed2b53..9c2d3d4efdb1a 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -37,7 +37,6 @@ import java.io.StringReader; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; import static org.hamcrest.Matchers.instanceOf; From 450a578c05392f5f37ab7fe9c3a309c5bab5407a Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 30 Nov 2018 18:28:11 +0100 Subject: [PATCH 3/9] add missing CONSOLE in docs --- 
docs/plugins/analysis-nori.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index c312e11812973..289e6c158dd56 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -186,6 +186,7 @@ PUT nori_sample } } -------------------------------------------------- +// CONSOLE The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters to modify the stream. From adcee291b47a37ca124f5d252cb6f287afe0536c Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 3 Dec 2018 11:35:13 +0100 Subject: [PATCH 4/9] fix docs section --- docs/plugins/analysis-nori.asciidoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 289e6c158dd56..d88024c623e97 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -153,6 +153,7 @@ The above `analyze` request returns the following: // TESTRESPONSE <1> This is a compound token that spans two positions (`mixed` mode). +-- `user_dictionary_rules`:: + @@ -187,6 +188,7 @@ PUT nori_sample } -------------------------------------------------- // CONSOLE +-- The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters to modify the stream. From e6181986d24a65d9fd17a3ed3288ec90b3a0d828 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 3 Dec 2018 13:03:45 +0100 Subject: [PATCH 5/9] fix redundant end of section --- docs/plugins/analysis-nori.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index d88024c623e97..34c3ac8384cb3 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added the dictionary. For compound nouns the custom segmentation can be provided after the first token (`[ ... ]`). The segmentation of the custom compound nouns is controlled by the `decompound_mode` setting. 
--- + As a demonstration of how the user dictionary can be used, save the following dictionary to `$ES_HOME/config/userdict_ko.txt`: From 3a97e16fa843da027094c876c9dea195f5fd4fb6 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 4 Dec 2018 09:26:42 +0100 Subject: [PATCH 6/9] Add a helper in Analysis to load the word list and check for duplicates in the user rules --- docs/plugins/analysis-nori.asciidoc | 2 +- .../index/analysis/NoriTokenizerFactory.java | 42 ++++++++----------- .../index/analysis/NoriAnalysisTests.java | 17 ++++++-- .../index/analysis/Analysis.java | 19 +++++++-- 4 files changed, 48 insertions(+), 32 deletions(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 34c3ac8384cb3..68ec943533aa9 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -173,7 +173,7 @@ PUT nori_sample "nori_user_dict": { "type": "nori_tokenizer", "decompound_mode": "mixed", - "user_dictionary_rules": ["c++", "C샤프", "세종", "세종시", "세종", "시"] + "user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"] } }, "analyzer": { diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index a575062377d78..5a7ab3dbfc554 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -31,8 +31,10 @@ import java.io.Reader; import java.io.StringReader; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; public class NoriTokenizerFactory extends AbstractTokenizerFactory { private static final String USER_DICT_PATH_OPTION = "user_dictionary"; @@ -48,32 +50,24 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String } public static UserDictionary getUserDictionary(Environment env, Settings settings) { - if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) { - throw new ElasticsearchException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + - " with [" + USER_DICT_RULES_OPTION + "]"); - + List ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION); + StringBuilder sb = new StringBuilder(); + if (ruleList == null || ruleList.isEmpty()) { + return null; } - String path = settings.get(USER_DICT_PATH_OPTION); - if (path != null) { - try (Reader rulesReader = Analysis.getReaderFromFile(env, settings, USER_DICT_PATH_OPTION)) { - return rulesReader == null ? 
null : UserDictionary.open(rulesReader); - } catch (IOException e) { - throw new ElasticsearchException("failed to load nori user dictionary", e); - } - } else { - List rulesList = settings.getAsList(USER_DICT_RULES_OPTION, Collections.emptyList(), false); - if (rulesList == null || rulesList.size() == 0) { - return null; - } - StringBuilder sb = new StringBuilder(); - for (String line : rulesList) { - sb.append(line).append(System.lineSeparator()); - } - try (Reader rulesReader = new StringReader(sb.toString())) { - return UserDictionary.open(rulesReader); - } catch (IOException e) { - throw new ElasticsearchException("failed to load nori user dictionary", e); + // check for duplicate terms + Set terms = new HashSet<>(); + for (String line : ruleList) { + String[] split = line.split("\\s+"); + if (terms.add(split[0]) == false) { + throw new IllegalArgumentException("Found duplicate term: [" + split[0] + "] in user dictionary. "); } + sb.append(line).append(System.lineSeparator()); + } + try (Reader rulesReader = new StringReader(sb.toString())) { + return UserDictionary.open(rulesReader); + } catch (IOException e) { + throw new ElasticsearchException("failed to load nori user dictionary", e); } } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index 9c2d3d4efdb1a..0201a12238f25 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -38,6 +38,7 @@ import java.nio.file.Files; import java.nio.file.Path; +import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.instanceOf; public class NoriAnalysisTests extends ESTokenStreamTestCase { @@ -82,15 +83,25 @@ public void testNoriAnalyzerUserDict() throws Exception { .build(); TestAnalysis analysis = createTestAnalysis(settings); Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); - try (TokenStream stream = analyzer.tokenStream("", "세종시" )) { - assertTokenStreamContents(stream, new String[] {"세종", "시"}); + try (TokenStream stream = analyzer.tokenStream("", "세종시")) { + assertTokenStreamContents(stream, new String[]{"세종", "시"}); } try (TokenStream stream = analyzer.tokenStream("", "c++world")) { - assertTokenStreamContents(stream, new String[] {"c++", "world"}); + assertTokenStreamContents(stream, new String[]{"c++", "world"}); } } + public void testNoriAnalyzerUserDictWithDuplicates() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "nori") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "세종", "C샤프", "세종", "세종 세 종") + .build(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("Found duplicate term: [세종]")); + + } + public void testNoriAnalyzerUserDictPath() throws Exception { Settings settings = Settings.builder() .put("index.analysis.analyzer.my_analyzer.type", "nori") diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index d56b8820e9b1c..bfbd77e4f01ea 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -221,10 
+221,21 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String * If the word list cannot be found at either key. */ public static List getWordList(Environment env, Settings settings, String settingPrefix) { - String wordListPath = settings.get(settingPrefix + "_path", null); + return getWordList(env, settings, settingPrefix + "_path", settingPrefix); + } + + /** + * Fetches a list of words from the specified settings file. The list should either be available at the key + * specified by settingList or in a file specified by settingPath. + * + * @throws IllegalArgumentException + * If the word list cannot be found at either key. + */ + public static List getWordList(Environment env, Settings settings, String settingPath, String settingList) { + String wordListPath = settings.get(settingPath, null); if (wordListPath == null) { - List explicitWordList = settings.getAsList(settingPrefix, null); + List explicitWordList = settings.getAsList(settingList, null); if (explicitWordList == null) { return null; } else { @@ -239,10 +250,10 @@ public static List getWordList(Environment env, Settings settings, Strin } catch (CharacterCodingException ex) { String message = String.format(Locale.ROOT, "Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded", - settingPrefix, path.toString()); + settingPath, path.toString()); throw new IllegalArgumentException(message, ex); } catch (IOException ioe) { - String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString()); + String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPath, path.toString()); throw new IllegalArgumentException(message, ioe); } } From cb3ca2a20972318e58ec55f04267dbfc39e5472a Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 4 Dec 2018 09:27:21 +0100 Subject: [PATCH 7/9] unused import --- .../org/elasticsearch/index/analysis/NoriTokenizerFactory.java | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index 5a7ab3dbfc554..8b1afdd921dcb 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -30,7 +30,6 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Locale; From 39db1d964ecf8717be26dbe0a2f3bc5cee9802cb Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Tue, 4 Dec 2018 13:56:14 +0100 Subject: [PATCH 8/9] fix error message --- .../main/java/org/elasticsearch/index/analysis/Analysis.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index bfbd77e4f01ea..09a87124110b3 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -249,11 +249,11 @@ public static List getWordList(Environment env, Settings settings, Strin return loadWordList(path, "#"); } catch (CharacterCodingException ex) { String message = String.format(Locale.ROOT, - "Unsupported character 
encoding detected while reading %s_path: %s - files must be UTF-8 encoded", + "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded", settingPath, path.toString()); throw new IllegalArgumentException(message, ex); } catch (IOException ioe) { - String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPath, path.toString()); + String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString()); throw new IllegalArgumentException(message, ioe); } } From 5fcfad43b9bf29ef209457379b8db36211a2c32f Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 7 Dec 2018 11:52:20 +0100 Subject: [PATCH 9/9] address review --- .../index/analysis/NoriTokenizerFactory.java | 12 ++++------- .../index/analysis/NoriAnalysisTests.java | 21 ++++++++++--------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index 8b1afdd921dcb..aa96da807c80f 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -30,10 +30,8 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.util.HashSet; import java.util.List; import java.util.Locale; -import java.util.Set; public class NoriTokenizerFactory extends AbstractTokenizerFactory { private static final String USER_DICT_PATH_OPTION = "user_dictionary"; @@ -49,18 +47,16 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String } public static UserDictionary getUserDictionary(Environment env, Settings settings) { + if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) { + throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + + " with [" + USER_DICT_RULES_OPTION + "]"); + } List ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION); StringBuilder sb = new StringBuilder(); if (ruleList == null || ruleList.isEmpty()) { return null; } - // check for duplicate terms - Set terms = new HashSet<>(); for (String line : ruleList) { - String[] split = line.split("\\s+"); - if (terms.add(split[0]) == false) { - throw new IllegalArgumentException("Found duplicate term: [" + split[0] + "] in user dictionary. 
"); - } sb.append(line).append(System.lineSeparator()); } try (Reader rulesReader = new StringReader(sb.toString())) { diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index 0201a12238f25..051a2f3e4dc32 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -92,16 +92,6 @@ public void testNoriAnalyzerUserDict() throws Exception { } } - public void testNoriAnalyzerUserDictWithDuplicates() throws Exception { - Settings settings = Settings.builder() - .put("index.analysis.analyzer.my_analyzer.type", "nori") - .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "세종", "C샤프", "세종", "세종 세 종") - .build(); - IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); - assertThat(exc.getMessage(), containsString("Found duplicate term: [세종]")); - - } - public void testNoriAnalyzerUserDictPath() throws Exception { Settings settings = Settings.builder() .put("index.analysis.analyzer.my_analyzer.type", "nori") @@ -118,6 +108,17 @@ public void testNoriAnalyzerUserDictPath() throws Exception { } } + public void testNoriAnalyzerInvalidUserDictOption() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "nori") + .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시") + .build(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " + + "with [user_dictionary_rules]")); + } + public void testNoriTokenizer() throws Exception { Settings settings = Settings.builder() .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")