LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces (#272)

This change ensures that we don't skip consecutive whitespaces without checking the maximum backtrace gap.
2021-09-06 08:46:39 +02:00 · 2021-09-06 08:46:39 +02:00 · 4df8d641ac
parent 34f37d0d43
commit 4df8d641ac
2 changed files with 9 additions and 12 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -484,6 +484,9 @@ Bug Fixes

 * LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen)

+* LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces.
+  (Jim Ferenczi)
+
 Other
 ---------------------
 (No changes)
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -746,21 +746,15 @@ public final class KoreanTokenizer extends Tokenizer {
        System.out.println("    " + posData.count + " arcs in");
      }

-      // Move to the first character that is not a whitespace.
-      // The whitespaces are added as a prefix for the term that we extract,
-      // this information is then used when computing the cost for the term using
-      // the space penalty factor.
-      // They are removed when the final tokens are generated.
+      // We add a single space separator as a prefix of each term that we extract.
+      // This information is needed to compute the space penalty factor of each term.
+      // These whitespace prefixes are removed when the final tokens are generated, or
+      // added as separate tokens when discardPunctuation is unset.
      if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
-        int nextChar = buffer.get(++pos);
-        while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
-          pos++;
-          nextChar = buffer.get(pos);
+        if (buffer.get(++pos) == -1) {
+          pos = posData.pos;
        }
      }
-      if (buffer.get(pos) == -1) {
-        pos = posData.pos;
-      }

      boolean anyMatches = false;