Lucene provides a ConcatenateGraphFilter that can concatenate the tokens of a TokenStream into a single token (or into several tokens at the same position, when the input TokenStream is a graph). This change enables ConcatenateGraphFilter by adding a corresponding factory. Signed-off-by: Mau Bach Quang <quangmaubach@gmail.com>
This commit is contained in:
parent
8ae0db5285
commit
0e95bb9dff
|
@ -257,6 +257,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
|
||||||
filters.put("classic", ClassicFilterFactory::new);
|
filters.put("classic", ClassicFilterFactory::new);
|
||||||
filters.put("czech_stem", CzechStemTokenFilterFactory::new);
|
filters.put("czech_stem", CzechStemTokenFilterFactory::new);
|
||||||
filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
|
filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
|
||||||
|
filters.put("concatenate_graph", ConcatenateGraphTokenFilterFactory::new);
|
||||||
filters.put(
|
filters.put(
|
||||||
"condition",
|
"condition",
|
||||||
requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get()))
|
requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get()))
|
||||||
|
|
|
@ -0,0 +1,81 @@
|
||||||
|
/*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*
|
||||||
|
* The OpenSearch Contributors require contributions made to
|
||||||
|
* this file be licensed under the Apache-2.0 license or a
|
||||||
|
* compatible open source license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.opensearch.analysis.common;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
|
||||||
|
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
|
||||||
|
import org.opensearch.LegacyESVersion;
|
||||||
|
import org.opensearch.common.settings.Settings;
|
||||||
|
import org.opensearch.env.Environment;
|
||||||
|
import org.opensearch.index.IndexSettings;
|
||||||
|
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link ConcatenateGraphFilter}.
|
||||||
|
* Adopted from {@link org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory}, with some changes to
|
||||||
|
* default values: token_separator is a "space", preserve_position_increments is false to avoid duplicated separators,
|
||||||
|
* max_graph_expansions is 100 as the default value of 10_000 seems to be unnecessarily large and preserve_separator is false.
|
||||||
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li>preserve_separator:
|
||||||
|
* For LegacyESVersion lesser than {@link LegacyESVersion#V_7_6_0} i.e. lucene versions lesser
|
||||||
|
* than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}
|
||||||
|
* Whether {@link ConcatenateGraphFilter#SEP_LABEL} should separate the input tokens in the concatenated token.
|
||||||
|
* </li>
|
||||||
|
* <li>token_separator:
|
||||||
|
* Separator to use for concatenation. Must be a String with a single character or empty.
|
||||||
|
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_TOKEN_SEPARATOR} will be used.
|
||||||
|
* If empty i.e. "", tokens will be concatenated without any separators.
|
||||||
|
* </li>
|
||||||
|
* <li>preserve_position_increments:
|
||||||
|
* Whether to add an empty token for missing positions.
|
||||||
|
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_PRESERVE_POSITION_INCREMENTS} will be used.
|
||||||
|
* </li>
|
||||||
|
* <li>max_graph_expansions:
|
||||||
|
* If the tokenStream graph has more than this many possible paths through, then we'll throw
|
||||||
|
* {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
|
||||||
|
* machine.
|
||||||
|
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_MAX_GRAPH_EXPANSIONS} will be used.
|
||||||
|
* </li>
|
||||||
|
* </ul>
|
||||||
|
* @see ConcatenateGraphFilter
|
||||||
|
*/
|
||||||
|
public class ConcatenateGraphTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
public static final String DEFAULT_TOKEN_SEPARATOR = " ";
|
||||||
|
public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = 100;
|
||||||
|
public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = false;
|
||||||
|
|
||||||
|
private final Character tokenSeparator;
|
||||||
|
private final int maxGraphExpansions;
|
||||||
|
private final boolean preservePositionIncrements;
|
||||||
|
|
||||||
|
ConcatenateGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
|
super(indexSettings, name, settings);
|
||||||
|
|
||||||
|
if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0)) { // i.e. Lucene 8.4.0
|
||||||
|
String separator = settings.get("token_separator", DEFAULT_TOKEN_SEPARATOR);
|
||||||
|
if (separator.length() > 1) {
|
||||||
|
throw new IllegalArgumentException("token_separator must be either empty or a single character");
|
||||||
|
}
|
||||||
|
tokenSeparator = separator.length() == 0 ? null : separator.charAt(0); // null means no separator while concatenating
|
||||||
|
} else {
|
||||||
|
boolean preserveSep = settings.getAsBoolean("preserve_separator", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
|
||||||
|
tokenSeparator = preserveSep ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
maxGraphExpansions = settings.getAsInt("max_graph_expansions", DEFAULT_MAX_GRAPH_EXPANSIONS);
|
||||||
|
preservePositionIncrements = settings.getAsBoolean("preserve_position_increments", DEFAULT_PRESERVE_POSITION_INCREMENTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return new ConcatenateGraphFilter(tokenStream, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,260 @@
|
||||||
|
/*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*
|
||||||
|
* The OpenSearch Contributors require contributions made to
|
||||||
|
* this file be licensed under the Apache-2.0 license or a
|
||||||
|
* compatible open source license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.opensearch.analysis.common;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
|
||||||
|
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
|
||||||
|
import org.opensearch.LegacyESVersion;
|
||||||
|
import org.opensearch.cluster.metadata.IndexMetadata;
|
||||||
|
import org.opensearch.common.settings.Settings;
|
||||||
|
import org.opensearch.env.Environment;
|
||||||
|
import org.opensearch.index.analysis.AnalysisTestsHelper;
|
||||||
|
import org.opensearch.index.analysis.NamedAnalyzer;
|
||||||
|
import org.opensearch.index.analysis.TokenFilterFactory;
|
||||||
|
import org.opensearch.test.OpenSearchTestCase;
|
||||||
|
import org.opensearch.test.OpenSearchTokenStreamTestCase;
|
||||||
|
import org.opensearch.test.VersionUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
public class ConcatenateGraphTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
||||||
|
public void testSimpleTokenizerAndConcatenate() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("concatenate_graph");
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot Is AweSome" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTokenizerCustomizedSeparator() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot+Is+AweSome" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOldLuceneVersionSeparator() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(
|
||||||
|
IndexMetadata.SETTING_VERSION_CREATED,
|
||||||
|
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2)
|
||||||
|
)
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
// earlier Lucene version will only use Lucene's default separator
|
||||||
|
assertTokenStreamContents(
|
||||||
|
tokenFilter.create(tokenizer),
|
||||||
|
new String[] {
|
||||||
|
"PowerShot"
|
||||||
|
+ ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
|
||||||
|
+ "Is"
|
||||||
|
+ ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR
|
||||||
|
+ "AweSome" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOldLuceneVersionNoSeparator() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(
|
||||||
|
IndexMetadata.SETTING_VERSION_CREATED,
|
||||||
|
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2)
|
||||||
|
)
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.preserve_separator", "false")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
// earlier Lucene version will not add separator if preserve_separator is false
|
||||||
|
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTokenizerEmptySeparator() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPreservePositionIncrementsDefault() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
|
||||||
|
CannedTokenStream cannedTokenStream = new CannedTokenStream(
|
||||||
|
new Token("a", 1, 0, 1),
|
||||||
|
new Token("b", 2, 2, 3), // there is a gap, posInc is 2
|
||||||
|
new Token("d", 1, 4, 5)
|
||||||
|
);
|
||||||
|
|
||||||
|
// the gap between a and b is not preserved
|
||||||
|
assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a+b+d" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPreservePositionIncrementsTrue() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.preserve_position_increments", "true")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph");
|
||||||
|
|
||||||
|
CannedTokenStream cannedTokenStream = new CannedTokenStream(
|
||||||
|
new Token("a", 1, 0, 1),
|
||||||
|
new Token("b", 2, 2, 3), // there is a gap, posInc is 2
|
||||||
|
new Token("d", 1, 4, 5)
|
||||||
|
);
|
||||||
|
|
||||||
|
// the gap between a and b is preserved
|
||||||
|
assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a++b+d" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testGraph() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
|
||||||
|
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.type", "custom")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
|
||||||
|
// Expected output from Whitespace Tokenizer is: "PowerShot" --> "Is" --> "Awe" --> "Some"
|
||||||
|
// Expected output from word_delimiter_graph is a graph:
|
||||||
|
// <start> ---> "Power" --> "Shot" ---> "Is" ---> "Awe" ---> "Some" --- <end>
|
||||||
|
// | | | |
|
||||||
|
// --> "PowerShot" -------- --> "AweSome" ---------
|
||||||
|
// and this filter will traverse through all possible paths to produce concatenated tokens
|
||||||
|
String[] expected = new String[] {
|
||||||
|
"Power Shot Is Awe Some",
|
||||||
|
"Power Shot Is AweSome",
|
||||||
|
"PowerShot Is Awe Some",
|
||||||
|
"PowerShot Is AweSome" };
|
||||||
|
|
||||||
|
// all tokens will be in the same position
|
||||||
|
int[] expectedPosIncrements = new int[] { 1, 0, 0, 0 };
|
||||||
|
int[] expectedPosLengths = new int[] { 1, 1, 1, 1 };
|
||||||
|
|
||||||
|
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||||
|
assertAnalyzesToPositions(analyzer, source, expected, expectedPosIncrements, expectedPosLengths);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testInvalidSeparator() {
|
||||||
|
expectThrows(
|
||||||
|
IllegalArgumentException.class,
|
||||||
|
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.token_separator", "11")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Similar to the {@link #testGraph()} case, there will be 4 paths generated by word_delimiter_graph.
|
||||||
|
* By setting max_graph_expansions to 3, we expect an exception.
|
||||||
|
*/
|
||||||
|
public void testMaxGraphExpansion() throws IOException {
|
||||||
|
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||||
|
Settings.builder()
|
||||||
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
|
||||||
|
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph")
|
||||||
|
.put("index.analysis.filter.my_concatenate_graph.max_graph_expansions", "3")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.type", "custom")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph")
|
||||||
|
.build(),
|
||||||
|
new CommonAnalysisPlugin()
|
||||||
|
);
|
||||||
|
|
||||||
|
String source = "PowerShot Is AweSome";
|
||||||
|
|
||||||
|
TokenStream tokenStream = analysis.indexAnalyzers.get("my_analyzer").tokenStream("dummy", source);
|
||||||
|
|
||||||
|
tokenStream.reset();
|
||||||
|
|
||||||
|
expectThrows(TooComplexToDeterminizeException.class, tokenStream::incrementToken);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue