From 9661e8db62e8897cb53b0bc1860ead43b28dfa21 Mon Sep 17 00:00:00 2001 From: Matthew Ridehalgh <9155962+mridehalgh@users.noreply.github.com> Date: Fri, 16 Aug 2024 20:11:13 +0100 Subject: [PATCH] fix: MinHash token filter parameters not working (#15233) * fix: minhash configuration Signed-off-by: Matt Ridehalgh * style: linting Signed-off-by: Matt Ridehalgh * chore: update CHANGELOG.md Signed-off-by: Matt Ridehalgh --------- Signed-off-by: Matt Ridehalgh --- CHANGELOG.md | 3 +- .../common/MinHashTokenFilterFactory.java | 4 +- .../common/MinHashFilterFactoryTests.java | 66 +++++++++++++++---- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fe700d7190fa..5103d978b361a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,7 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111)) - Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207)) - Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206)) -- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262)) +- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262)) - Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262)) ### Changed @@ -53,6 +53,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126)) - Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620)) - Fix range aggregation optimization ignoring top level queries ([#15194](https://github.com/opensearch-project/OpenSearch/pull/15194)) +- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233)) ### Security diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java index e76354ae3a765..40655b84794d5 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java @@ -65,10 +65,10 @@ private Map convertSettings(Settings settings) { if (settings.hasValue("hash_count")) { settingMap.put("hashCount", settings.get("hash_count")); } - if (settings.hasValue("bucketCount")) { + if (settings.hasValue("bucket_count")) { settingMap.put("bucketCount", settings.get("bucket_count")); } - if (settings.hasValue("hashSetSize")) { + if (settings.hasValue("hash_set_size")) { settingMap.put("hashSetSize", settings.get("hash_set_size")); } if (settings.hasValue("with_rotation")) { diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java index e86a939dc857b..b9f09033f49f3 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java @@ -50,14 +50,10 @@ public void testDefault() throws IOException { int default_bucket_size = 512; int default_hash_set_size = 1; Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); - OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( - settings, - new CommonAnalysisModulePlugin() - ); + OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash"); String source = "the quick brown fox"; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); + Tokenizer tokenizer = getTokenizer(source); // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't // have enough tokens to fill all the buckets, we still expect 512 tokens. @@ -73,17 +69,63 @@ public void testSettings() throws IOException { .put("index.analysis.filter.test_min_hash.with_rotation", false) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( - settings, - new CommonAnalysisModulePlugin() - ); + OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash"); String source = "sushi"; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); + Tokenizer tokenizer = getTokenizer(source); // despite the fact that bucket_count is 2 and hash_set_size is 1, // because with_rotation is false, we only expect 1 token here. assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1); } + + public void testBucketCountSetting() throws IOException { + // Correct case with "bucket_count" + Settings settingsWithBucketCount = Settings.builder() + .put("index.analysis.filter.test_min_hash.type", "min_hash") + .put("index.analysis.filter.test_min_hash.hash_count", "1") + .put("index.analysis.filter.test_min_hash.bucket_count", "3") + .put("index.analysis.filter.test_min_hash.hash_set_size", "1") + .put("index.analysis.filter.test_min_hash.with_rotation", false) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + OpenSearchTestCase.TestAnalysis analysisWithBucketCount = getTestAnalysisFromSettings(settingsWithBucketCount); + + TokenFilterFactory tokenFilterWithBucketCount = analysisWithBucketCount.tokenFilter.get("test_min_hash"); + String sourceWithBucketCount = "salmon avocado roll uramaki"; + Tokenizer tokenizerWithBucketCount = getTokenizer(sourceWithBucketCount); + // Expect 3 tokens due to bucket_count being set to 3 + assertStreamHasNumberOfTokens(tokenFilterWithBucketCount.create(tokenizerWithBucketCount), 3); + } + + public void testHashSetSizeSetting() throws IOException { + // Correct case with "hash_set_size" + Settings settingsWithHashSetSize = Settings.builder() + .put("index.analysis.filter.test_min_hash.type", "min_hash") + .put("index.analysis.filter.test_min_hash.hash_count", "1") + .put("index.analysis.filter.test_min_hash.bucket_count", "1") + .put("index.analysis.filter.test_min_hash.hash_set_size", "2") + .put("index.analysis.filter.test_min_hash.with_rotation", false) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + OpenSearchTestCase.TestAnalysis analysisWithHashSetSize = getTestAnalysisFromSettings(settingsWithHashSetSize); + + TokenFilterFactory tokenFilterWithHashSetSize = analysisWithHashSetSize.tokenFilter.get("test_min_hash"); + String sourceWithHashSetSize = "salmon avocado roll uramaki"; + Tokenizer tokenizerWithHashSetSize = getTokenizer(sourceWithHashSetSize); + // Expect 2 tokens due to hash_set_size being set to 2 and bucket_count being 1 + assertStreamHasNumberOfTokens(tokenFilterWithHashSetSize.create(tokenizerWithHashSetSize), 2); + } + + private static OpenSearchTestCase.TestAnalysis getTestAnalysisFromSettings(Settings settingsWithBucketCount) throws IOException { + return AnalysisTestsHelper.createTestAnalysisFromSettings(settingsWithBucketCount, new CommonAnalysisModulePlugin()); + } + + private static Tokenizer getTokenizer(String sourceWithBucketCount) { + Tokenizer tokenizerWithBucketCount = new WhitespaceTokenizer(); + tokenizerWithBucketCount.setReader(new StringReader(sourceWithBucketCount)); + return tokenizerWithBucketCount; + } }