Skip to content

Commit

Permalink
Add User Dictionary Rules to NoriTokenizer (#3634)
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed Apr 1, 2019
1 parent 76f2822 commit 6abba25
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 2 deletions.
25 changes: 23 additions & 2 deletions src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Runtime.Serialization;
using System.Collections.Generic;
using System.Runtime.Serialization;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

Expand Down Expand Up @@ -32,10 +33,20 @@ public interface INoriTokenizer : ITokenizer

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to
/// the default dictionary. This property allows you to specify a path to this file on disk.
/// Serialized as <c>user_dictionary</c>.
/// </summary>
[JsonProperty("user_dictionary")]
string UserDictionary { get; set; }

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG)
/// can be specified inline with this property, one rule per element, rather than as a path to a file
/// (see <see cref="UserDictionary" />). Serialized as <c>user_dictionary_rules</c>.
/// </summary>
/// <remarks>
/// Valid for Elasticsearch 6.6.0+
/// </remarks>
[JsonProperty("user_dictionary_rules")]
IEnumerable<string> UserDictionaryRules { get; set; }
}

/// <inheritdoc cref="INoriTokenizer" />
Expand All @@ -48,6 +59,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer

/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
public string UserDictionary { get; set; }

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
public IEnumerable<string> UserDictionaryRules { get; set; }
}

/// <inheritdoc cref="INoriTokenizer" />
Expand All @@ -58,11 +72,18 @@ public class NoriTokenizerDescriptor

NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
string INoriTokenizer.UserDictionary { get; set; }
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }

/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
/// <param name="mode">The decompound mode to assign; may be <c>null</c>.</param>
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode)
{
	return Assign(mode, (tokenizer, value) => tokenizer.DecompoundMode = value);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
/// <param name="path">The path to the user dictionary file.</param>
public NoriTokenizerDescriptor UserDictionary(string path)
{
	return Assign(path, (tokenizer, value) => tokenizer.UserDictionary = value);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
/// <param name="rules">The inline user dictionary rules, one rule per string.</param>
public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules)
{
	return Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
/// <param name="rules">The inline user dictionary rules, one rule per element.</param>
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules)
{
	return Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);
}
}
}
23 changes: 23 additions & 0 deletions src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,29 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
public override string Name => "nori";
}

/// <summary>
/// Verifies that a nori tokenizer configured with inline user dictionary rules produces the same
/// JSON from the fluent descriptor and from the object initializer.
/// </summary>
[SkipVersion("<6.6.0", "inline user dictionary rules introduced in 6.6.0")]
public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
{
	// Single source for the rules used by the fluent, initializer and expected-JSON forms below.
	private static readonly string[] InlineDictionaryRules = { "c++", "C샤프", "세종", "세종시 세종 시" };

	public override string Name => "nori_userdictionary";

	public override object Json => new
	{
		type = "nori_tokenizer",
		decompound_mode = "mixed",
		user_dictionary_rules = InlineDictionaryRules
	};

	public override ITokenizer Initializer => new NoriTokenizer
	{
		DecompoundMode = NoriDecompoundMode.Mixed,
		UserDictionaryRules = InlineDictionaryRules
	};

	public override FuncTokenizer Fluent => (name, tokenizers) => tokenizers.Nori(name, nori => nori
		.DecompoundMode(NoriDecompoundMode.Mixed)
		.UserDictionaryRules(InlineDictionaryRules)
	);
}

[SkipVersion("<6.4.0", "char_group introduced in 6.4.0")]
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
{
Expand Down

0 comments on commit 6abba25

Please sign in to comment.