mirror of https://github.com/apache/lucene.git
LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option
This commit is contained in:
parent
0c261f4215
commit
320578274b
|
@ -122,7 +122,7 @@ Optimizations
|
|||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
(No changes)
|
||||
* LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option (Paul Pazderski)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
|
|
@ -50,7 +50,11 @@ public class NGramFilterFactory extends TokenFilterFactory {
|
|||
super(args);
|
||||
minGramSize = requireInt(args, "minGramSize");
|
||||
maxGramSize = requireInt(args, "maxGramSize");
|
||||
preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
|
||||
|
||||
// First check for the old, accidentally used option name. It was the only way to configure preserve original
|
||||
// for NGramFilter and ignoring it would unnecessarily break existing configs.
|
||||
boolean preserve = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
|
||||
preserveOriginal = getBoolean(args, "preserveOriginal", preserve);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
|
|
@ -81,6 +81,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
|||
new String[] { "te", "tes", "es", "est", "st" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the NGramFilterFactory with preserve option
|
||||
*/
|
||||
public void testNGramFilter3() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("NGram",
|
||||
"minGramSize", "2",
|
||||
"maxGramSize", "3",
|
||||
"preserveOriginal", "true").create(stream);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "te", "tes", "es", "est", "st", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test NGramFilterFactory on tokens with payloads
|
||||
*/
|
||||
|
@ -152,6 +166,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
|||
new String[] { "t", "te" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test EdgeNGramFilterFactory with preserve option
|
||||
*/
|
||||
public void testEdgeNGramFilter3() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("EdgeNGram",
|
||||
"minGramSize", "1",
|
||||
"maxGramSize", "2",
|
||||
"preserveOriginal", "true").create(stream);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t", "te", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test EdgeNGramFilterFactory on tokens with payloads
|
||||
*/
|
||||
|
|
|
@ -476,6 +476,8 @@ This filter generates edge n-gram tokens of sizes within the given range.
|
|||
|
||||
`maxGramSize`:: (integer, default 1) The maximum gram size.
|
||||
|
||||
`preserveOriginal`:: (boolean, default false) If true, keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
|
||||
|
||||
*Example:*
|
||||
|
||||
Default behavior.
|
||||
|
@ -548,6 +550,24 @@ A range of 4 to 6.
|
|||
|
||||
*Out:* "four", "scor", "score", "twen", "twent", "twenty"
|
||||
|
||||
*Example:*
|
||||
|
||||
Preserve original term.
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer name="standard"/>
|
||||
<filter name="edgeNGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
*In:* "four score"
|
||||
|
||||
*Tokenizer to Filter:* "four", "score"
|
||||
|
||||
*Out:* "fo", "fou", "four", "sc", "sco", "score"
|
||||
|
||||
== English Minimal Stem Filter
|
||||
|
||||
This filter stems plural English words to their singular form.
|
||||
|
@ -1441,6 +1461,8 @@ Generates n-gram tokens of sizes in the given range. Note that tokens are ordere
|
|||
|
||||
`maxGramSize`:: (integer, default 2) The maximum gram size.
|
||||
|
||||
`preserveOriginal`:: (boolean, default false) If true, keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
|
||||
|
||||
*Example:*
|
||||
|
||||
Default behavior.
|
||||
|
@ -1513,6 +1535,24 @@ A range of 3 to 5.
|
|||
|
||||
*Out:* "fou", "four", "our", "sco", "scor", "score", "cor", "core", "ore"
|
||||
|
||||
*Example:*
|
||||
|
||||
Preserve original term.
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer name="standard"/>
|
||||
<filter name="nGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
*In:* "four score"
|
||||
|
||||
*Tokenizer to Filter:* "four", "score"
|
||||
|
||||
*Out:* "fo", "fou", "ou", "our", "ur", "four", "sc", "sco", "co", "cor", "or", "ore", "re", "score"
|
||||
|
||||
== Numeric Payload Token Filter
|
||||
|
||||
This filter adds a numeric floating point payload value to tokens that match a given type. Refer to the Javadoc for the `org.apache.lucene.analysis.Token` class for more information about token types and payloads.
|
||||
|
|
Loading…
Reference in New Issue