From fabf9bd596386dd745685a23b6a1dc52d0f84b7b Mon Sep 17 00:00:00 2001
From: Matthew Ridehalgh <9155962+mridehalgh@users.noreply.github.com>
Date: Tue, 20 Aug 2024 00:13:11 +0100
Subject: [PATCH] fix: MinHash token filter parameters not working (#15293)

Signed-off-by: Matt Ridehalgh
---
 CHANGELOG.md                                  |  3 +-
 .../common/MinHashTokenFilterFactory.java     |  4 +-
 .../common/MinHashFilterFactoryTests.java     | 56 +++++++++++++++++--
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eab8dd23c16b9..d8bb22ec55021 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,7 +32,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111))
 - Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207))
 - Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206))
-- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
+- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
 - Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
 - Bump `org.apache.kerby:kerb-admin` from 2.0.3 to 2.1.0 ([#15301](https://github.com/opensearch-project/OpenSearch/pull/15301))
 
@@ -50,6 +50,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
 - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
 - Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
+- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233))
 
 ### Security
 
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java
index e76354ae3a765..40655b84794d5 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java
@@ -65,10 +65,10 @@ private Map<String, String> convertSettings(Settings settings) {
         if (settings.hasValue("hash_count")) {
             settingMap.put("hashCount", settings.get("hash_count"));
         }
-        if (settings.hasValue("bucketCount")) {
+        if (settings.hasValue("bucket_count")) {
             settingMap.put("bucketCount", settings.get("bucket_count"));
         }
-        if (settings.hasValue("hashSetSize")) {
+        if (settings.hasValue("hash_set_size")) {
             settingMap.put("hashSetSize", settings.get("hash_set_size"));
         }
         if (settings.hasValue("with_rotation")) {
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java
index 514c53f17456c..7d21dcab4e951 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java
@@ -53,8 +53,7 @@ public void testDefault() throws IOException {
         OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
         String source = "the quick brown fox";
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
+        Tokenizer tokenizer = getTokenizer(source);
 
         // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
         // have enough tokens to fill all the buckets, we still expect 512 tokens.
@@ -73,11 +72,60 @@ public void testSettings() throws IOException {
         OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
         String source = "sushi";
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
+        Tokenizer tokenizer = getTokenizer(source);
 
         // despite the fact that bucket_count is 2 and hash_set_size is 1,
         // because with_rotation is false, we only expect 1 token here.
         assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
     }
+
+    public void testBucketCountSetting() throws IOException {
+        // Correct case with "bucket_count"
+        Settings settingsWithBucketCount = Settings.builder()
+            .put("index.analysis.filter.test_min_hash.type", "min_hash")
+            .put("index.analysis.filter.test_min_hash.hash_count", "1")
+            .put("index.analysis.filter.test_min_hash.bucket_count", "3")
+            .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
+            .put("index.analysis.filter.test_min_hash.with_rotation", false)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+
+        OpenSearchTestCase.TestAnalysis analysisWithBucketCount = getTestAnalysisFromSettings(settingsWithBucketCount);
+
+        TokenFilterFactory tokenFilterWithBucketCount = analysisWithBucketCount.tokenFilter.get("test_min_hash");
+        String sourceWithBucketCount = "salmon avocado roll uramaki";
+        Tokenizer tokenizerWithBucketCount = getTokenizer(sourceWithBucketCount);
+        // Expect 3 tokens due to bucket_count being set to 3
+        assertStreamHasNumberOfTokens(tokenFilterWithBucketCount.create(tokenizerWithBucketCount), 3);
+    }
+
+    public void testHashSetSizeSetting() throws IOException {
+        // Correct case with "hash_set_size"
+        Settings settingsWithHashSetSize = Settings.builder()
+            .put("index.analysis.filter.test_min_hash.type", "min_hash")
+            .put("index.analysis.filter.test_min_hash.hash_count", "1")
+            .put("index.analysis.filter.test_min_hash.bucket_count", "1")
+            .put("index.analysis.filter.test_min_hash.hash_set_size", "2")
+            .put("index.analysis.filter.test_min_hash.with_rotation", false)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+
+        OpenSearchTestCase.TestAnalysis analysisWithHashSetSize = getTestAnalysisFromSettings(settingsWithHashSetSize);
+
+        TokenFilterFactory tokenFilterWithHashSetSize = analysisWithHashSetSize.tokenFilter.get("test_min_hash");
+        String sourceWithHashSetSize = "salmon avocado roll uramaki";
+        Tokenizer tokenizerWithHashSetSize = getTokenizer(sourceWithHashSetSize);
+        // Expect 2 tokens due to hash_set_size being set to 2 and bucket_count being 1
+        assertStreamHasNumberOfTokens(tokenFilterWithHashSetSize.create(tokenizerWithHashSetSize), 2);
+    }
+
+    private static OpenSearchTestCase.TestAnalysis getTestAnalysisFromSettings(Settings settingsWithBucketCount) throws IOException {
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(settingsWithBucketCount, new CommonAnalysisPlugin());
+    }
+
+    private static Tokenizer getTokenizer(String sourceWithBucketCount) {
+        Tokenizer tokenizerWithBucketCount = new WhitespaceTokenizer();
+        tokenizerWithBucketCount.setReader(new StringReader(sourceWithBucketCount));
+        return tokenizerWithBucketCount;
+    }
 }
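
Context for the fix, not part of the patch: the OpenSearch factory delegates to Lucene's org.apache.lucene.analysis.minhash.MinHashFilterFactory, which reads camelCase argument keys (hashCount, bucketCount, hashSetSize, withRotation). Before this patch, convertSettings tested for the camelCase names in the user-facing settings, so a user-supplied bucket_count or hash_set_size never reached the Lucene factory and its defaults applied instead (bucketCount defaults to 512, matching the "512 tokens" comment in testDefault). The sketch below is a minimal standalone illustration against the Lucene API alone; the class name MinHashParamsDemo and the luceneArgs map are illustrative, and the expected count of 3 mirrors testBucketCountSetting above.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.minhash.MinHashFilterFactory;

public class MinHashParamsDemo {
    public static void main(String[] args) throws IOException {
        // Lucene's factory expects camelCase keys; OpenSearch's convertSettings
        // (fixed above) is what maps the snake_case index settings
        // bucket_count and hash_set_size onto these before delegating.
        Map<String, String> luceneArgs = new HashMap<>();
        luceneArgs.put("hashCount", "1");
        luceneArgs.put("bucketCount", "3");
        luceneArgs.put("hashSetSize", "1");
        luceneArgs.put("withRotation", "false");
        MinHashFilterFactory factory = new MinHashFilterFactory(luceneArgs);

        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("salmon avocado roll uramaki"));
        TokenStream stream = factory.create(tokenizer);

        // Count the emitted MinHash tokens.
        stream.reset();
        int tokens = 0;
        while (stream.incrementToken()) {
            tokens++;
        }
        stream.end();
        stream.close();

        // Mirrors testBucketCountSetting: bucket_count=3, hash_set_size=1,
        // with_rotation=false on a four-token input yields 3 MinHash tokens.
        System.out.println(tokens); // 3
    }
}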