LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option

This commit is contained in:
Paul Pazderski 2020-03-07 21:32:40 +09:00 committed by Tomoko Uchida
parent 0c261f4215
commit 320578274b
4 changed files with 74 additions and 2 deletions

View File

@ -122,7 +122,7 @@ Optimizations
Bug Fixes
---------------------
(No changes)
* LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option (Paul Pazderski)
Other
---------------------

View File

@ -50,7 +50,11 @@ public class NGramFilterFactory extends TokenFilterFactory {
super(args);
minGramSize = requireInt(args, "minGramSize");
maxGramSize = requireInt(args, "maxGramSize");
preserveOriginal = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
// First check for the accidentally used old option name. It was the only way to configure preserve original
// for NGramFilter, and ignoring it would unnecessarily break existing configs.
boolean preserve = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
preserveOriginal = getBoolean(args, "preserveOriginal", preserve);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}

View File

@ -81,6 +81,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
new String[] { "te", "tes", "es", "est", "st" });
}
/**
* Test the NGramFilterFactory with the preserveOriginal option
*/
/** Verifies that an original term longer than maxGramSize is kept when preserveOriginal=true. */
public void testNGramFilter3() throws Exception {
  Reader input = new StringReader("test");
  TokenStream tokens = whitespaceMockTokenizer(input);
  // preserveOriginal=true should append the untouched term after its 2-3 grams
  tokens = tokenFilterFactory("NGram",
      "minGramSize", "2",
      "maxGramSize", "3",
      "preserveOriginal", "true").create(tokens);
  assertTokenStreamContents(tokens,
      new String[] { "te", "tes", "es", "est", "st", "test" });
}
/**
* Test NGramFilterFactory on tokens with payloads
*/
@ -152,6 +166,20 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
new String[] { "t", "te" });
}
/**
* Test EdgeNGramFilterFactory with the preserveOriginal option
*/
/** Verifies that an original term longer than maxGramSize is kept when preserveOriginal=true. */
public void testEdgeNGramFilter3() throws Exception {
  Reader input = new StringReader("test");
  TokenStream tokens = whitespaceMockTokenizer(input);
  // preserveOriginal=true should append the untouched term after its 1-2 edge grams
  tokens = tokenFilterFactory("EdgeNGram",
      "minGramSize", "1",
      "maxGramSize", "2",
      "preserveOriginal", "true").create(tokens);
  assertTokenStreamContents(tokens,
      new String[] { "t", "te", "test" });
}
/**
* Test EdgeNGramFilterFactory on tokens with payloads
*/

View File

@ -476,6 +476,8 @@ This filter generates edge n-gram tokens of sizes within the given range.
`maxGramSize`:: (integer, default 1) The maximum gram size.
`preserveOriginal`:: (boolean, default false) If true keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
*Example:*
Default behavior.
@ -548,6 +550,24 @@ A range of 4 to 6.
*Out:* "four", "scor", "score", "twen", "twent", "twenty"
*Example:*
Preserve original term.
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="edgeNGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
</analyzer>
----
*In:* "four score"
*Tokenizer to Filter:* "four", "score"
*Out:* "fo", "fou", "four", "sc", "sco", "score"
== English Minimal Stem Filter
This filter stems plural English words to their singular form.
@ -1441,6 +1461,8 @@ Generates n-gram tokens of sizes in the given range. Note that tokens are ordere
`maxGramSize`:: (integer, default 2) The maximum gram size.
`preserveOriginal`:: (boolean, default false) If true keep the original term even if it is shorter than `minGramSize` or longer than `maxGramSize`.
*Example:*
Default behavior.
@ -1513,6 +1535,24 @@ A range of 3 to 5.
*Out:* "fou", "four", "our", "sco", "scor", "score", "cor", "core", "ore"
*Example:*
Preserve original term.
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="nGram" minGramSize="2" maxGramSize="3" preserveOriginal="true"/>
</analyzer>
----
*In:* "four score"
*Tokenizer to Filter:* "four", "score"
*Out:* "fo", "fou", "ou", "our", "ur", "four", "sc", "sco", "co", "cor", "or", "ore", "re", "score"
== Numeric Payload Token Filter
This filter adds a numeric floating point payload value to tokens that match a given type. Refer to the Javadoc for the `org.apache.lucene.analysis.Token` class for more information about token types and payloads.