diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ed58c9df3e7..949d2b7f5ca 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -484,6 +484,9 @@ Bug Fixes * LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen) +* LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces. + (Jim Ferenczi) + Other --------------------- (No changes) diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java index b89b0b24210..0765b801c4d 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java @@ -746,21 +746,15 @@ public final class KoreanTokenizer extends Tokenizer { System.out.println(" " + posData.count + " arcs in"); } - // Move to the first character that is not a whitespace. - // The whitespaces are added as a prefix for the term that we extract, - // this information is then used when computing the cost for the term using - // the space penalty factor. - // They are removed when the final tokens are generated. + // We add a single space separator as a prefix of the terms that we extract. + // This information is needed to compute the space penalty factor of each term. + // These whitespace prefixes are removed when the final tokens are generated, or + // added as separate tokens when discardPunctuation is unset. if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) { - int nextChar = buffer.get(++pos); - while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) { - pos++; - nextChar = buffer.get(pos); + if (buffer.get(++pos) == -1) { + pos = posData.pos; } } - if (buffer.get(pos) == -1) { - pos = posData.pos; - } boolean anyMatches = false;