mirror of https://github.com/apache/lucene.git
Fix UTF32ToUTF8 producing invalid transitions (#12472)
This commit is contained in:
parent
4174b521dd
commit
ec1367862d
|
@ -193,6 +193,10 @@ Bug Fixes
|
|||
* GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
|
||||
(Greg Miller).
|
||||
|
||||
* GITHUB#12472: UTF32ToUTF8 would sometimes accept extra invalid UTF-8 binary sequences. This should not have any
|
||||
impact on the user, unless you explicitly invoke the convert function of UTF32ToUTF8, and in the extremely rare
|
||||
scenario of searching a non-UTF-8 inverted field with Unicode search terms (Tang Donghai).
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -35,12 +35,12 @@ public final class UTF32ToUTF8 {
|
|||
private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
|
||||
private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};
|
||||
|
||||
static int[] MASKS = new int[32];
|
||||
static int[] MASKS = new int[8];
|
||||
|
||||
static {
|
||||
int v = 2;
|
||||
for (int i = 0; i < 32; i++) {
|
||||
MASKS[i] = v - 1;
|
||||
for (int i = 0; i < 7; i++) {
|
||||
MASKS[i + 1] = v - 1;
|
||||
v *= 2;
|
||||
}
|
||||
}
|
||||
|
@ -103,7 +103,7 @@ public final class UTF32ToUTF8 {
|
|||
|
||||
private void setRest(int code, int numBytes) {
|
||||
for (int i = 0; i < numBytes; i++) {
|
||||
bytes[numBytes - i].value = 128 | (code & MASKS[5]);
|
||||
bytes[numBytes - i].value = 128 | (code & MASKS[6]);
|
||||
bytes[numBytes - i].bits = 6;
|
||||
code = code >> 6;
|
||||
}
|
||||
|
@ -154,15 +154,12 @@ public final class UTF32ToUTF8 {
|
|||
|
||||
// Single value leading edge
|
||||
utf8.addTransition(start, n, startUTF8.byteAt(upto));
|
||||
// start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single
|
||||
|
||||
// Recurse for the rest
|
||||
build(n, end, startUTF8, endUTF8, 1 + upto);
|
||||
}
|
||||
} else if (startUTF8.len == endUTF8.len) {
|
||||
if (upto == startUTF8.len - 1) {
|
||||
// start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));
|
||||
// // type=startend
|
||||
utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
|
||||
} else {
|
||||
start(start, end, startUTF8, upto, false);
|
||||
|
@ -206,15 +203,12 @@ public final class UTF32ToUTF8 {
|
|||
start,
|
||||
end,
|
||||
startUTF8.byteAt(upto),
|
||||
startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1]); // type=start
|
||||
// start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) |
|
||||
// MASKS[startUTF8.numBits(upto)-1], end)); // type=start
|
||||
startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)]); // type=start
|
||||
} else {
|
||||
int n = utf8.createState();
|
||||
utf8.addTransition(start, n, startUTF8.byteAt(upto));
|
||||
// start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start
|
||||
start(n, end, startUTF8, 1 + upto, true);
|
||||
int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1];
|
||||
int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)];
|
||||
if (doAll && startUTF8.byteAt(upto) != endCode) {
|
||||
all(start, end, startUTF8.byteAt(upto) + 1, endCode, startUTF8.len - upto - 1);
|
||||
}
|
||||
|
@ -224,28 +218,32 @@ public final class UTF32ToUTF8 {
|
|||
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
|
||||
if (upto == endUTF8.len - 1) {
|
||||
// Done recursing
|
||||
// start.addTransition(new Transition(endUTF8.byteAt(upto) &
|
||||
// (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end
|
||||
utf8.addTransition(
|
||||
start,
|
||||
end,
|
||||
endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]),
|
||||
endUTF8.byteAt(upto));
|
||||
start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]), endUTF8.byteAt(upto));
|
||||
} else {
|
||||
final int startCode;
|
||||
if (endUTF8.numBits(upto) == 5) {
|
||||
// special case -- avoid created unused edges (endUTF8
|
||||
// doesn't accept certain byte sequences) -- there
|
||||
// are other cases we could optimize too:
|
||||
startCode = 194;
|
||||
// GH-ISSUE#12472: UTF-8 special case for the different start byte of the different
|
||||
// length=2,3,4
|
||||
if (endUTF8.len == 2) {
|
||||
assert upto == 0; // the upto==1 case will be handled by the first if above
|
||||
// the first length=2 UTF8 Unicode character is C2 80,
|
||||
// so we must special case 0xC2 as the 1st byte.
|
||||
startCode = 0xC2;
|
||||
} else if (endUTF8.len == 3 && upto == 1 && endUTF8.byteAt(0) == 0xE0) {
|
||||
// the first length=3 UTF8 Unicode character is E0 A0 80,
|
||||
// so we must special case 0xA0 as the 2nd byte when E0 was the first byte of endUTF8.
|
||||
startCode = 0xA0;
|
||||
} else if (endUTF8.len == 4 && upto == 1 && endUTF8.byteAt(0) == 0xF0) {
|
||||
// the first length=4 UTF8 Unicode character is F0 90 80 80,
|
||||
// so we must special case 0x90 as the 2nd byte when F0 was the first byte of endUTF8.
|
||||
startCode = 0x90;
|
||||
} else {
|
||||
startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]);
|
||||
startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]);
|
||||
}
|
||||
if (doAll && endUTF8.byteAt(upto) != startCode) {
|
||||
all(start, end, startCode, endUTF8.byteAt(upto) - 1, endUTF8.len - upto - 1);
|
||||
}
|
||||
int n = utf8.createState();
|
||||
// start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end
|
||||
utf8.addTransition(start, n, endUTF8.byteAt(upto));
|
||||
end(n, end, endUTF8, 1 + upto, true);
|
||||
}
|
||||
|
@ -253,20 +251,16 @@ public final class UTF32ToUTF8 {
|
|||
|
||||
private void all(int start, int end, int startCode, int endCode, int left) {
|
||||
if (left == 0) {
|
||||
// start.addTransition(new Transition(startCode, endCode, end)); // type=all
|
||||
utf8.addTransition(start, end, startCode, endCode);
|
||||
} else {
|
||||
int lastN = utf8.createState();
|
||||
// start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
|
||||
utf8.addTransition(start, lastN, startCode, endCode);
|
||||
while (left > 1) {
|
||||
int n = utf8.createState();
|
||||
// lastN.addTransition(new Transition(128, 191, n)); // type=all*
|
||||
utf8.addTransition(lastN, n, 128, 191); // type=all*
|
||||
left--;
|
||||
lastN = n;
|
||||
}
|
||||
// lastN.addTransition(new Transition(128, 191, end)); // type = all*
|
||||
utf8.addTransition(lastN, end, 128, 191); // type = all*
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,9 @@ package org.apache.lucene.util;
|
|||
import java.util.Arrays;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.FiniteStringsIterator;
|
||||
|
||||
/*
|
||||
* Some of this code came from the excellent Unicode
|
||||
|
@ -188,6 +191,33 @@ public class TestUnicodeUtil extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testUTF8SpanMultipleBytes() throws Exception {
|
||||
Automaton.Builder b = new Automaton.Builder();
|
||||
// start state:
|
||||
int s1 = b.createState();
|
||||
|
||||
// single end accept state:
|
||||
int s2 = b.createState();
|
||||
b.setAccept(s2, true);
|
||||
|
||||
// utf8 codepoint length range from [1,2]
|
||||
b.addTransition(s1, s2, 0x7F, 0x80);
|
||||
// utf8 codepoint length range from [2,3]
|
||||
b.addTransition(s1, s2, 0x7FF, 0x800);
|
||||
// utf8 codepoint length range from [3,4]
|
||||
b.addTransition(s1, s2, 0xFFFF, 0x10000);
|
||||
|
||||
Automaton a = b.finish();
|
||||
|
||||
CompiledAutomaton c = new CompiledAutomaton(a);
|
||||
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
|
||||
int termCount = 0;
|
||||
for (IntsRef r = it.next(); r != null; r = it.next()) {
|
||||
termCount++;
|
||||
}
|
||||
assertEquals(6, termCount);
|
||||
}
|
||||
|
||||
public void testNewString() {
|
||||
final int[] codePoints = {
|
||||
Character.toCodePoint(Character.MIN_HIGH_SURROGATE, Character.MAX_LOW_SURROGATE),
|
||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
public class TestStringsToAutomaton extends LuceneTestCase {
|
||||
|
@ -142,22 +141,11 @@ public class TestStringsToAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
// Make sure every term produced by the automaton is expected
|
||||
FiniteStringsIterator it = new FiniteStringsIterator(a);
|
||||
if (isBinary) {
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
for (IntsRef r = it.next(); r != null; r = it.next()) {
|
||||
BytesRef t = Util.toBytesRef(r, scratch);
|
||||
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
|
||||
}
|
||||
} else {
|
||||
// Note that we validate against the original automaton, not the compiled one as the compiled
|
||||
// automaton can incorrectly produce invalid/overlong utf8 terms (see: GH#12458). This means
|
||||
// we need slightly different logic here since the automaton "speaks" code points and not
|
||||
// utf8 bytes.
|
||||
for (IntsRef r = it.next(); r != null; r = it.next()) {
|
||||
BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
|
||||
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
|
||||
}
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
|
||||
for (IntsRef r = it.next(); r != null; r = it.next()) {
|
||||
BytesRef t = Util.toBytesRef(r, scratch);
|
||||
assertTrue(expected.contains(t));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue