mirror of https://github.com/apache/lucene.git
LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces (#272)
This change ensures that we don't skip consecutive whitespaces without checking the maximum backtrace gap.
This commit is contained in:
parent
34f37d0d43
commit
4df8d641ac
|
@ -484,6 +484,9 @@ Bug Fixes
|
||||||
|
|
||||||
* LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen)
|
* LUCENE-10060: Ensure DrillSidewaysQuery instances never get cached. (Greg Miller, Zachary Chen)
|
||||||
|
|
||||||
|
* LUCENE-10081: KoreanTokenizer should check the max backtrace gap on whitespaces.
|
||||||
|
(Jim Ferenczi)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
---------------------
|
---------------------
|
||||||
(No changes)
|
(No changes)
|
||||||
|
|
|
@ -746,21 +746,15 @@ public final class KoreanTokenizer extends Tokenizer {
|
||||||
System.out.println(" " + posData.count + " arcs in");
|
System.out.println(" " + posData.count + " arcs in");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Move to the first character that is not a whitespace.
|
// We add a single space separator as a prefix of each term that we extract.
|
||||||
// The whitespaces are added as a prefix for the term that we extract,
|
// This information is needed to compute the space penalty factor of each term.
|
||||||
// this information is then used when computing the cost for the term using
|
// These whitespace prefixes are removed when the final tokens are generated, or
|
||||||
// the space penalty factor.
|
// added as separate tokens when discardPunctuation is unset.
|
||||||
// They are removed when the final tokens are generated.
|
|
||||||
if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
|
if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
|
||||||
int nextChar = buffer.get(++pos);
|
if (buffer.get(++pos) == -1) {
|
||||||
while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
|
|
||||||
pos++;
|
|
||||||
nextChar = buffer.get(pos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (buffer.get(pos) == -1) {
|
|
||||||
pos = posData.pos;
|
pos = posData.pos;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
boolean anyMatches = false;
|
boolean anyMatches = false;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue