From 0e95bb9dff976a9c7f9cdac63a92040043d029e2 Mon Sep 17 00:00:00 2001
From: QUANG MAU BACH <14192818+quangmaubach@users.noreply.github.com>
Date: Sat, 19 Feb 2022 05:05:44 +1100
Subject: [PATCH] Add Factory to enable Lucene ConcatenateGraphFilter (#1278)
 (#2152)

Lucene has a ConcatenateGraphFilter that can concatenate tokens from a TokenStream
to create a single token (or several tokens that have the same position if
input TokenStream is a graph).

The change is to enable that ConcatenateGraphFilter by adding a Factory.

Signed-off-by: Mau Bach Quang <quangmaubach@gmail.com>
---
 .../analysis/common/CommonAnalysisPlugin.java |   1 +
 .../ConcatenateGraphTokenFilterFactory.java   |  81 ++++++
 ...ncatenateGraphTokenFilterFactoryTests.java | 260 ++++++++++++++++++
 3 files changed, 342 insertions(+)
 create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
 create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java
index 98956a62edb..47a144311c0 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java
@@ -257,6 +257,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
         filters.put("classic", ClassicFilterFactory::new);
         filters.put("czech_stem", CzechStemTokenFilterFactory::new);
         filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
+        filters.put("concatenate_graph", ConcatenateGraphTokenFilterFactory::new);
         filters.put(
             "condition",
             requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get()))
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
new file mode 100644
index 00000000000..0d1a2b185d1
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
@@ -0,0 +1,81 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+import org.opensearch.LegacyESVersion;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Factory for {@link ConcatenateGraphFilter}.
+ * Adapted from {@link org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory}, with some changes to
+ * default values: token_separator is a single space, preserve_position_increments is false to avoid duplicated separators,
+ * and max_graph_expansions is 100, as the default value of 10_000 seems unnecessarily large. preserve_separator keeps Lucene's default.
+ *
+ * <ul>
+ *   <li>preserve_separator:
+ *       Only read for indices created before {@link LegacyESVersion#V_7_6_0}, i.e. Lucene versions earlier
+ *       than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}; newer indices read token_separator instead.
+ *       Whether {@link ConcatenateGraphFilter#SEP_LABEL} should separate the input tokens in the concatenated token.
+ *       </li>
+ *   <li>token_separator:
+ *       Separator to use for concatenation. Must be a String with a single character or empty.
+ *       If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_TOKEN_SEPARATOR} will be used.
+ *       If empty i.e. "", tokens will be concatenated without any separators.
+ *       </li>
+ *   <li>preserve_position_increments:
+ *       Whether to add an empty token for missing positions.
+ *       If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_PRESERVE_POSITION_INCREMENTS} will be used.
+ *       </li>
+ *   <li>max_graph_expansions:
+ *       If the tokenStream graph has more than this many possible paths through, then we'll throw
+ *       {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
+ *       machine.
+ *       If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_MAX_GRAPH_EXPANSIONS} will be used.
+ *       </li>
+ * </ul>
+ * @see ConcatenateGraphFilter
+ */
+public class ConcatenateGraphTokenFilterFactory extends AbstractTokenFilterFactory {
+    public static final String DEFAULT_TOKEN_SEPARATOR = " ";
+    public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = 100;
+    public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = false;
+
+    private final Character tokenSeparator;
+    private final int maxGraphExpansions;
+    private final boolean preservePositionIncrements;
+
+    ConcatenateGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0)) { // i.e. Lucene 8.4.0
+            String separator = settings.get("token_separator", DEFAULT_TOKEN_SEPARATOR);
+            if (separator.length() > 1) {
+                throw new IllegalArgumentException("token_separator must be either empty or a single character");
+            }
+            tokenSeparator = separator.length() == 0 ? null : separator.charAt(0); // null means no separator while concatenating
+        } else {
+            boolean preserveSep = settings.getAsBoolean("preserve_separator", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
+            tokenSeparator = preserveSep ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null;
+        }
+
+        maxGraphExpansions = settings.getAsInt("max_graph_expansions", DEFAULT_MAX_GRAPH_EXPANSIONS);
+        preservePositionIncrements = settings.getAsBoolean("preserve_position_increments", DEFAULT_PRESERVE_POSITION_INCREMENTS);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new ConcatenateGraphFilter(tokenStream, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
+    }
+}
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
new file mode 100644
index 00000000000..ef4146b6587
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
@@ -0,0 +1,260 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+import org.opensearch.LegacyESVersion;
+import org.opensearch.cluster.metadata.IndexMetadata;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.analysis.AnalysisTestsHelper;
+import org.opensearch.index.analysis.NamedAnalyzer;
+import org.opensearch.index.analysis.TokenFilterFactory;
+import org.opensearch.test.OpenSearchTestCase;
+import org.opensearch.test.OpenSearchTokenStreamTestCase;
+import org.opensearch.test.VersionUtils;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class ConcatenateGraphTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
+    public void testSimpleTokenizerAndConcatenate() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("concatenate_graph");
+        String source = "PowerShot Is AweSome";
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+
+        assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot Is AweSome" });
+    }
+
+    public void testTokenizerCustomizedSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+        String source = "PowerShot Is AweSome";
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+
+        assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot+Is+AweSome" });
+    }
+
+    public void testOldLuceneVersionSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(
+                    IndexMetadata.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2)
+                )
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+        String source = "PowerShot Is AweSome";
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+
+        // earlier Lucene version will only use Lucene's default separator
+        assertTokenStreamContents(
+            tokenFilter.create(tokenizer),
+            new String[] {
+                "PowerShot"
+                    + ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
+                    + "Is"
+                    + ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
+                    + "AweSome" }
+        );
+    }
+
+    public void testOldLuceneVersionNoSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(
+                    IndexMetadata.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2)
+                )
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
+                .put("index.analysis.filter.my_concatenate_graph.preserve_separator", "false")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+        String source = "PowerShot Is AweSome";
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+
+        // earlier Lucene version will not add separator if preserve_separator is false
+        assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" });
+    }
+
+    public void testTokenizerEmptySeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+        String source = "PowerShot Is AweSome";
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+
+        assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" });
+    }
+
+    public void testPreservePositionIncrementsDefault() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        CannedTokenStream cannedTokenStream = new CannedTokenStream(
+            new Token("a", 1, 0, 1),
+            new Token("b", 2, 2, 3), // there is a gap, posInc is 2
+            new Token("d", 1, 4, 5)
+        );
+
+        // the gap between a and b is not preserved
+        assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a+b+d" });
+    }
+
+    public void testPreservePositionIncrementsTrue() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+                .put("index.analysis.filter.my_concatenate_graph.preserve_position_increments", "true")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        CannedTokenStream cannedTokenStream = new CannedTokenStream(
+            new Token("a", 1, 0, 1),
+            new Token("b", 2, 2, 3), // there is a gap, posInc is 2
+            new Token("d", 1, 4, 5)
+        );
+
+        // the gap between a and b is preserved
+        assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a++b+d" });
+    }
+
+    public void testGraph() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.analyzer.my_analyzer.type", "custom")
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        String source = "PowerShot Is AweSome";
+
+        // Expected output from Whitespace Tokenizer is: "PowerShot" --> "Is" --> "AweSome"
+        // Expected output from word_delimiter_graph is a graph:
+        // <start> ---> "Power" --> "Shot" ---> "Is" ---> "Awe" ---> "Some" --- <end>
+        // with "PowerShot" as an alternative path spanning "Power" --> "Shot"
+        // and "AweSome" as an alternative path spanning "Awe" --> "Some",
+        // and this filter will traverse through all possible paths to produce concatenated tokens
+        String[] expected = new String[] {
+            "Power Shot Is Awe Some",
+            "Power Shot Is AweSome",
+            "PowerShot Is Awe Some",
+            "PowerShot Is AweSome" };
+
+        // all tokens will be in the same position
+        int[] expectedPosIncrements = new int[] { 1, 0, 0, 0 };
+        int[] expectedPosLengths = new int[] { 1, 1, 1, 1 };
+
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        assertAnalyzesToPositions(analyzer, source, expected, expectedPosIncrements, expectedPosLengths);
+    }
+
+    public void testInvalidSeparator() {
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                    .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                    .put("index.analysis.filter.my_concatenate_graph.token_separator", "11")
+                    .build(),
+                new CommonAnalysisPlugin()
+            )
+        );
+    }
+
+    /**
+     * Similar to the {@link #testGraph()} case, there will be 4 paths generated by word_delimiter_graph.
+     * By setting max_graph_expansions to 3, we expect an exception.
+     */
+    public void testMaxGraphExpansion() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
+                .put("index.analysis.filter.my_concatenate_graph.max_graph_expansions", "3")
+                .put("index.analysis.analyzer.my_analyzer.type", "custom")
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph")
+                .build(),
+            new CommonAnalysisPlugin()
+        );
+
+        String source = "PowerShot Is AweSome";
+
+        TokenStream tokenStream = analysis.indexAnalyzers.get("my_analyzer").tokenStream("dummy", source);
+
+        tokenStream.reset();
+
+        expectThrows(TooComplexToDeterminizeException.class, tokenStream::incrementToken);
+    }
+}