LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces (#272)

This change ensures that we don't skip consecutive whitespaces without checking the maximum backtrace gap.
This commit is contained in:
Jim Ferenczi 2021-09-06 08:46:39 +02:00 committed by GitHub
parent 34f37d0d43
commit 4df8d641ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 12 deletions

View File

@ -484,6 +484,9 @@ Bug Fixes
* LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen)
* LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces.
(Jim Ferenczi)
Other
---------------------
(No changes)

View File

@ -746,21 +746,15 @@ public final class KoreanTokenizer extends Tokenizer {
System.out.println(" " + posData.count + " arcs in"); System.out.println(" " + posData.count + " arcs in");
} }
// Move to the first character that is not a whitespace. // We add single space separator as prefixes of the terms that we extract.
// The whitespaces are added as a prefix for the term that we extract, // This information is needed to compute the space penalty factor of each term.
// this information is then used when computing the cost for the term using // These whitespace prefixes are removed when the final tokens are generated, or
// the space penalty factor. // added as separated tokens when discardPunctuation is unset.
// They are removed when the final tokens are generated.
if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) { if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
int nextChar = buffer.get(++pos); if (buffer.get(++pos) == -1) {
while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) { pos = posData.pos;
pos++;
nextChar = buffer.get(pos);
} }
} }
if (buffer.get(pos) == -1) {
pos = posData.pos;
}
boolean anyMatches = false; boolean anyMatches = false;