From 0e95bb9dff976a9c7f9cdac63a92040043d029e2 Mon Sep 17 00:00:00 2001 From: QUANG MAU BACH <14192818+quangmaubach@users.noreply.github.com> Date: Sat, 19 Feb 2022 05:05:44 +1100 Subject: [PATCH] Add Factory to enable Lucene ConcatenateGraphFilter (#1278) (#2152) Lucene has a ConcatenateGraphFilter that can concatenate tokens from a TokenStream to create a single token (or several tokens that have the same position if input TokenStream is a graph). The change is to enable that ConcatenateGraphFilter by adding a Factory. Signed-off-by: Mau Bach Quang --- .../analysis/common/CommonAnalysisPlugin.java | 1 + .../ConcatenateGraphTokenFilterFactory.java | 81 ++++++ ...ncatenateGraphTokenFilterFactoryTests.java | 260 ++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java index 98956a62edb..47a144311c0 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java @@ -257,6 +257,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.put("classic", ClassicFilterFactory::new); filters.put("czech_stem", CzechStemTokenFilterFactory::new); filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new)); + filters.put("concatenate_graph", ConcatenateGraphTokenFilterFactory::new); filters.put( "condition", requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get())) 
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
new file mode 100644
index 00000000000..0d1a2b185d1
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
@@ -0,0 +1,81 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+import org.opensearch.LegacyESVersion;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Builds Lucene {@link ConcatenateGraphFilter} instances for the {@code concatenate_graph} token filter.
+ * Adapted from {@link org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory}, but with its own defaults.
+ *
+ * <ul>
+ *   <li>{@code token_separator}: a single character put between concatenated tokens, or empty for none. Defaults
+ *   to a space. Only read for indices created on or after {@link LegacyESVersion#V_7_6_0}.</li>
+ *   <li>{@code preserve_separator}: only read for older indices, where {@code token_separator} is ignored. Selects
+ *   between Lucene's {@link ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} and no separator at all, and defaults
+ *   to Lucene's {@link ConcatenateGraphFilter#DEFAULT_PRESERVE_SEP}.</li>
+ *   <li>{@code max_graph_expansions}: defaults to 100, as Lucene's default of 10_000 seems unnecessarily large.
+ *   A graph that is too complex for this limit makes the filter fail with {@link TooComplexToDeterminizeException}.</li>
+ *   <li>{@code preserve_position_increments}: defaults to false to avoid duplicated separators.</li>
+ * </ul>
+ *
+ * @see ConcatenateGraphFilter
+ */
+public class ConcatenateGraphTokenFilterFactory extends AbstractTokenFilterFactory {
+    public static final String DEFAULT_TOKEN_SEPARATOR = " ";
+    public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = 100;
+    public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = false;
+
+    private final Character tokenSeparator;
+    private final int maxGraphExpansions;
+    private final boolean preservePositionIncrements;
+
+    ConcatenateGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.tokenSeparator = resolveTokenSeparator(indexSettings, settings);
+        this.maxGraphExpansions = settings.getAsInt("max_graph_expansions", DEFAULT_MAX_GRAPH_EXPANSIONS);
+        this.preservePositionIncrements = settings.getAsBoolean(
+            "preserve_position_increments",
+            DEFAULT_PRESERVE_POSITION_INCREMENTS
+        );
+    }
+
+    /**
+     * Picks the separator handed to the filter; {@code null} means tokens are concatenated without any separator.
+     *
+     * @throws IllegalArgumentException if {@code token_separator} holds more than one character
+     */
+    private static Character resolveTokenSeparator(IndexSettings indexSettings, Settings settings) {
+        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0) == false) {
+            // Indices older than 7.6.0 (i.e. before Lucene 8.4.0) only get an on/off switch for Lucene's own separator.
+            if (settings.getAsBoolean("preserve_separator", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP)) {
+                return ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR;
+            }
+            return null;
+        }
+
+        final String configured = settings.get("token_separator", DEFAULT_TOKEN_SEPARATOR);
+        if (configured.length() > 1) {
+            throw new IllegalArgumentException("token_separator must be either empty or a single character");
+        }
+        return configured.isEmpty() ? null : configured.charAt(0);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new ConcatenateGraphFilter(tokenStream, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
+    }
+}
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
new file mode 100644
index 00000000000..ef4146b6587
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
@@ -0,0 +1,260 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+import org.opensearch.LegacyESVersion;
+import org.opensearch.cluster.metadata.IndexMetadata;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.analysis.AnalysisTestsHelper;
+import org.opensearch.index.analysis.NamedAnalyzer;
+import org.opensearch.index.analysis.TokenFilterFactory;
+import org.opensearch.test.OpenSearchTestCase;
+import org.opensearch.test.OpenSearchTokenStreamTestCase;
+import org.opensearch.test.VersionUtils;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Tests for the {@code concatenate_graph} token filter created by {@link ConcatenateGraphTokenFilterFactory}:
+ * separator handling on current and legacy indices, position increments, graph input and the expansion limit.
+ */
+public class ConcatenateGraphTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
+    private static final String SOURCE = "PowerShot Is AweSome";
+
+    /**
+     * Builds the test analysis for {@code settings}, with {@link Environment#PATH_HOME_SETTING} pointing at a fresh
+     * temporary directory and {@link CommonAnalysisPlugin} as the only plugin.
+     */
+    private OpenSearchTestCase.TestAnalysis createAnalysis(Settings.Builder settings) throws IOException {
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(
+            settings.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(),
+            new CommonAnalysisPlugin()
+        );
+    }
+
+    /**
+     * Returns settings declaring {@code my_concatenate_graph} as a {@code concatenate_graph} filter.
+     */
+    private Settings.Builder customFilterSettings() {
+        return Settings.builder().put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph");
+    }
+
+    /**
+     * Like {@link #customFilterSettings()}, but for an index created by a random version from 7.0.0 to 7.5.2.
+     */
+    private Settings.Builder legacyFilterSettings() {
+        return customFilterSettings().put(
+            IndexMetadata.SETTING_VERSION_CREATED,
+            VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2)
+        );
+    }
+
+    /**
+     * Returns settings for {@code my_analyzer}: a whitespace tokenizer followed by a {@code word_delimiter_graph}
+     * filter with {@code catenate_words} enabled and then {@code my_concatenate_graph}.
+     */
+    private Settings.Builder graphAnalyzerSettings() {
+        return customFilterSettings()
+            .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
+            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+            .put("index.analysis.analyzer.my_analyzer.type", "custom")
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph");
+    }
+
+    /**
+     * Returns a whitespace tokenizer reading {@link #SOURCE}.
+     */
+    private Tokenizer sourceTokenizer() {
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(SOURCE));
+        return tokenizer;
+    }
+
+    /**
+     * Returns the tokens "a", "b" and "d", with a position gap between "a" and "b".
+     */
+    private CannedTokenStream tokensWithGap() {
+        return new CannedTokenStream(
+            new Token("a", 1, 0, 1),
+            new Token("b", 2, 2, 3), // there is a gap, posInc is 2
+            new Token("d", 1, 4, 5)
+        );
+    }
+
+    /**
+     * Without any configuration the filter is available under its registered name and joins the tokens with a
+     * single space.
+     */
+    public void testSimpleTokenizerAndConcatenate() throws IOException {
+        TokenFilterFactory tokenFilter = createAnalysis(Settings.builder()).tokenFilter.get("concatenate_graph");
+
+        assertTokenStreamContents(tokenFilter.create(sourceTokenizer()), new String[] { "PowerShot Is AweSome" });
+    }
+
+    /**
+     * A single-character {@code token_separator} is used to join the tokens instead of the default space.
+     */
+    public void testTokenizerCustomizedSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            customFilterSettings().put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        assertTokenStreamContents(tokenFilter.create(sourceTokenizer()), new String[] { "PowerShot+Is+AweSome" });
+    }
+
+    /**
+     * On an index created before 7.6.0, {@code token_separator} is ignored and, as {@code preserve_separator} is not
+     * set, the tokens are joined with Lucene's default separator.
+     */
+    public void testOldLuceneVersionSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            legacyFilterSettings().put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        // earlier Lucene version will only use Lucene's default separator
+        assertTokenStreamContents(
+            tokenFilter.create(sourceTokenizer()),
+            new String[] {
+                "PowerShot"
+                    + ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
+                    + "Is"
+                    + ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
+                    + "AweSome" }
+        );
+    }
+
+    /**
+     * On an index created before 7.6.0, {@code token_separator} is ignored and {@code preserve_separator: false}
+     * removes the separator altogether.
+     */
+    public void testOldLuceneVersionNoSeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            legacyFilterSettings()
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
+                .put("index.analysis.filter.my_concatenate_graph.preserve_separator", "false")
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        // earlier Lucene version will not add separator if preserve_separator is false
+        assertTokenStreamContents(tokenFilter.create(sourceTokenizer()), new String[] { "PowerShotIsAweSome" });
+    }
+
+    /**
+     * An empty {@code token_separator} joins the tokens without any separator.
+     */
+    public void testTokenizerEmptySeparator() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            customFilterSettings().put("index.analysis.filter.my_concatenate_graph.token_separator", "")
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        assertTokenStreamContents(tokenFilter.create(sourceTokenizer()), new String[] { "PowerShotIsAweSome" });
+    }
+
+    /**
+     * By default position increments are not preserved, so a gap in the input does not add an extra separator.
+     */
+    public void testPreservePositionIncrementsDefault() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            customFilterSettings().put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        // the gap between a and b is not preserved
+        assertTokenStreamContents(tokenFilter.create(tokensWithGap()), new String[] { "a+b+d" });
+    }
+
+    /**
+     * With {@code preserve_position_increments} enabled, a gap in the input adds an extra separator.
+     */
+    public void testPreservePositionIncrementsTrue() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            customFilterSettings()
+                .put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
+                .put("index.analysis.filter.my_concatenate_graph.preserve_position_increments", "true")
+        );
+
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
+
+        // the gap between a and b is preserved
+        assertTokenStreamContents(tokenFilter.create(tokensWithGap()), new String[] { "a++b+d" });
+    }
+
+    /**
+     * For a graph token stream, every path through the graph becomes one concatenated token, and all of these
+     * tokens share the same position.
+     */
+    public void testGraph() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(graphAnalyzerSettings());
+
+        // Expected output from Whitespace Tokenizer is: "PowerShot" --> "Is" --> "AweSome"
+        // Expected output from word_delimiter_graph is a graph:
+        // ---> "Power" --> "Shot" ---> "Is" ---> "Awe" ---> "Some" ---
+        // |                        |         |                       |
+        // --> "PowerShot" ----------         --> "AweSome" -----------
+        // and this filter will traverse through all possible paths to produce concatenated tokens
+        String[] expected = new String[] {
+            "Power Shot Is Awe Some",
+            "Power Shot Is AweSome",
+            "PowerShot Is Awe Some",
+            "PowerShot Is AweSome" };
+
+        // all tokens will be in the same position
+        int[] expectedPosIncrements = new int[] { 1, 0, 0, 0 };
+        int[] expectedPosLengths = new int[] { 1, 1, 1, 1 };
+
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        assertAnalyzesToPositions(analyzer, SOURCE, expected, expectedPosIncrements, expectedPosLengths);
+    }
+
+    /**
+     * A {@code token_separator} longer than one character is rejected while the analysis is being built.
+     */
+    public void testInvalidSeparator() {
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> createAnalysis(
+                customFilterSettings().put("index.analysis.filter.my_concatenate_graph.token_separator", "11")
+            )
+        );
+    }
+
+    /**
+     * Similar to the {@link #testGraph()} case, there will be 4 paths generated by word_delimiter_graph.
+     * By setting max_graph_expansions to 3, we expect an exception.
+     */
+    public void testMaxGraphExpansion() throws IOException {
+        OpenSearchTestCase.TestAnalysis analysis = createAnalysis(
+            graphAnalyzerSettings().put("index.analysis.filter.my_concatenate_graph.max_graph_expansions", "3")
+        );
+
+        // try-with-resources: the stream has to be closed even though consuming it is expected to fail
+        try (TokenStream tokenStream = analysis.indexAnalyzers.get("my_analyzer").tokenStream("dummy", SOURCE)) {
+            tokenStream.reset();
+            expectThrows(TooComplexToDeterminizeException.class, tokenStream::incrementToken);
+        }
+    }
+}