LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option

This commit is contained in:
Paul Pazderski 2020-03-07 21:32:40 +09:00 committed by Tomoko Uchida
parent 0c261f4215
commit 320578274b
4 changed files with 74 additions and 2 deletions

View File

@ -122,7 +122,7 @@ Optimizations
Bug Fixes
---------------------
(No changes)
* LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option (Paul Pazderski)
Other
---------------------

View File

@ -50,7 +50,11 @@ public class NGramFilterFactory extends TokenFilterFactory {
super(args);
minGramSize = requireInt(args, "minGramSize");
maxGramSize = requireInt(args, "maxGramSize");
preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
// First check for the accidentally used old option name. It was the only way to configure preserve original
// for NGramFilter, and ignoring it would unnecessarily break existing configs.
boolean preserve = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
preserveOriginal = getBoolean(args, "preserveOriginal", preserve);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}

View File

@ -81,6 +81,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
new String[] { "te", "tes", "es", "est", "st" });
}
/**
* Test the NGramFilterFactory with the preserveOriginal option
*/
/** Verifies that an original term longer than maxGramSize is kept when preserveOriginal=true. */
public void testNGramFilter3() throws Exception {
  Reader input = new StringReader("test");
  TokenStream tokens = whitespaceMockTokenizer(input);
  // preserveOriginal=true should append the untouched term after its 2-3 grams
  tokens = tokenFilterFactory("NGram",
      "minGramSize", "2",
      "maxGramSize", "3",
      "preserveOriginal", "true").create(tokens);
  assertTokenStreamContents(tokens,
      new String[] { "te", "tes", "es", "est", "st", "test" });
}
/**
* Test NGramFilterFactory on tokens with payloads
*/
@ -152,6 +166,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
new String[] { "t", "te" });
}
/**
* Test EdgeNGramFilterFactory with the preserveOriginal option
*/
/** Verifies that an original term longer than maxGramSize is kept when preserveOriginal=true. */
public void testEdgeNGramFilter3() throws Exception {
  Reader input = new StringReader("test");
  TokenStream tokens = whitespaceMockTokenizer(input);
  // preserveOriginal=true should append the untouched term after its 1-2 edge grams
  tokens = tokenFilterFactory("EdgeNGram",
      "minGramSize", "1",
      "maxGramSize", "2",
      "preserveOriginal", "true").create(tokens);
  assertTokenStreamContents(tokens,
      new String[] { "t", "te", "test" });
}
/**
* Test EdgeNGramFilterFactory on tokens with payloads
*/

View File

@ -476,6 +476,8 @@ This filter generates edge n-gram tokens of sizes within the given range.
`maxGramSize`:: (integer, default 1) The maximum gram size.
`preserveOriginal`:: (boolean, default false) If true keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
*Example:*
Default behavior.
@ -548,6 +550,24 @@ A range of 4 to 6.
*Out:* "four", "scor", "score", "twen", "twent", "twenty"
*Example:*
Preserve original term.
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="edgeNGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
</analyzer>
----
*In:* "four score"
*Tokenizer to Filter:* "four", "score"
*Out:* "fo", "fou", "four", "sc", "sco", "score"
== English Minimal Stem Filter
This filter stems plural English words to their singular form.
@ -1441,6 +1461,8 @@ Generates n-gram tokens of sizes in the given range. Note that tokens are ordere
`maxGramSize`:: (integer, default 2) The maximum gram size.
`preserveOriginal`:: (boolean, default false) If true keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
*Example:*
Default behavior.
@ -1513,6 +1535,24 @@ A range of 3 to 5.
*Out:* "fou", "four", "our", "sco", "scor", "score", "cor", "core", "ore"
*Example:*
Preserve original term.
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="nGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
</analyzer>
----
*In:* "four score"
*Tokenizer to Filter:* "four", "score"
*Out:* "fo", "fou", "ou", "our", "ur", "four", "sc", "sco", "co", "cor", "or", "ore", "re", "score"
== Numeric Payload Token Filter
This filter adds a numeric floating point payload value to tokens that match a given type. Refer to the Javadoc for the `org.apache.lucene.analysis.Token` class for more information about token types and payloads.