Fix UTF32ToUTF8 producing invalid transitions (#12472)

This commit is contained in:
tang donghai 2023-08-17 04:59:07 +08:00 committed by GitHub
parent 4174b521dd
commit ec1367862d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 62 additions and 46 deletions

View File

@ -193,6 +193,10 @@ Bug Fixes
* GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
(Greg Miller).
* GITHUB#12472: UTF32ToUTF8 would sometimes accept extra invalid UTF-8 binary sequences. This should not have any
impact on the user, unless you explicitly invoke the convert function of UTF32ToUTF8, and in the extremely rare
scenario of searching a non-UTF-8 inverted field with Unicode search terms (Tang Donghai).
Other
---------------------

View File

@ -35,12 +35,12 @@ public final class UTF32ToUTF8 {
private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};
static int[] MASKS = new int[32];
static int[] MASKS = new int[8];
static {
int v = 2;
for (int i = 0; i < 32; i++) {
MASKS[i] = v - 1;
for (int i = 0; i < 7; i++) {
MASKS[i + 1] = v - 1;
v *= 2;
}
}
@ -103,7 +103,7 @@ public final class UTF32ToUTF8 {
private void setRest(int code, int numBytes) {
for (int i = 0; i < numBytes; i++) {
bytes[numBytes - i].value = 128 | (code & MASKS[5]);
bytes[numBytes - i].value = 128 | (code & MASKS[6]);
bytes[numBytes - i].bits = 6;
code = code >> 6;
}
@ -154,15 +154,12 @@ public final class UTF32ToUTF8 {
// Single value leading edge
utf8.addTransition(start, n, startUTF8.byteAt(upto));
// start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single
// Recurse for the rest
build(n, end, startUTF8, endUTF8, 1 + upto);
}
} else if (startUTF8.len == endUTF8.len) {
if (upto == startUTF8.len - 1) {
// start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));
// // type=startend
utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
} else {
start(start, end, startUTF8, upto, false);
@ -206,15 +203,12 @@ public final class UTF32ToUTF8 {
start,
end,
startUTF8.byteAt(upto),
startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1]); // type=start
// start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) |
// MASKS[startUTF8.numBits(upto)-1], end)); // type=start
startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)]); // type=start
} else {
int n = utf8.createState();
utf8.addTransition(start, n, startUTF8.byteAt(upto));
// start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start
start(n, end, startUTF8, 1 + upto, true);
int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1];
int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)];
if (doAll && startUTF8.byteAt(upto) != endCode) {
all(start, end, startUTF8.byteAt(upto) + 1, endCode, startUTF8.len - upto - 1);
}
@ -224,28 +218,32 @@ public final class UTF32ToUTF8 {
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
if (upto == endUTF8.len - 1) {
// Done recursing
// start.addTransition(new Transition(endUTF8.byteAt(upto) &
// (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end
utf8.addTransition(
start,
end,
endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]),
endUTF8.byteAt(upto));
start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]), endUTF8.byteAt(upto));
} else {
final int startCode;
if (endUTF8.numBits(upto) == 5) {
// special case -- avoid creating unused edges (endUTF8
// doesn't accept certain byte sequences) -- there
// are other cases we could optimize too:
startCode = 194;
// GH-ISSUE#12472: UTF-8 special case for the different start byte of the different
// length=2,3,4
if (endUTF8.len == 2) {
assert upto == 0; // the upto==1 case will be handled by the first if above
// the first length=2 UTF8 Unicode character is C2 80,
// so we must special case 0xC2 as the 1st byte.
startCode = 0xC2;
} else if (endUTF8.len == 3 && upto == 1 && endUTF8.byteAt(0) == 0xE0) {
// the first length=3 UTF8 Unicode character is E0 A0 80,
// so we must special case 0xA0 as the 2nd byte when E0 was the first byte of endUTF8.
startCode = 0xA0;
} else if (endUTF8.len == 4 && upto == 1 && endUTF8.byteAt(0) == 0xF0) {
// the first length=4 UTF8 Unicode character is F0 90 80 80,
// so we must special case 0x90 as the 2nd byte when F0 was the first byte of endUTF8.
startCode = 0x90;
} else {
startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]);
startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]);
}
if (doAll && endUTF8.byteAt(upto) != startCode) {
all(start, end, startCode, endUTF8.byteAt(upto) - 1, endUTF8.len - upto - 1);
}
int n = utf8.createState();
// start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end
utf8.addTransition(start, n, endUTF8.byteAt(upto));
end(n, end, endUTF8, 1 + upto, true);
}
@ -253,20 +251,16 @@ public final class UTF32ToUTF8 {
/**
 * Connects {@code start} to {@code end} with a path that accepts one byte in the range
 * {@code [startCode, endCode]} followed by exactly {@code left} bytes in the range
 * {@code [128, 191]} (0x80-0xBF, the UTF-8 continuation-byte range).
 *
 * @param start state the path leaves from
 * @param end state the path arrives at
 * @param startCode lowest accepted value for the first byte (inclusive)
 * @param endCode highest accepted value for the first byte (inclusive)
 * @param left number of trailing [128, 191] bytes to accept after the first byte
 */
private void all(int start, int end, int startCode, int endCode, int left) {
if (left == 0) {
// No trailing bytes: a single ranged transition goes straight to the end state.
// start.addTransition(new Transition(startCode, endCode, end)); // type=all
utf8.addTransition(start, end, startCode, endCode);
} else {
// The first byte leads to a fresh intermediate state.
int lastN = utf8.createState();
// start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
utf8.addTransition(start, lastN, startCode, endCode);
// Chain one new state per trailing byte, except the last one, which must land on `end`.
while (left > 1) {
int n = utf8.createState();
// lastN.addTransition(new Transition(128, 191, n)); // type=all*
utf8.addTransition(lastN, n, 128, 191); // type=all*
left--;
lastN = n;
}
// The final trailing byte closes the chain at the end state.
// lastN.addTransition(new Transition(128, 191, end)); // type = all*
utf8.addTransition(lastN, end, 128, 191); // type = all*
}
}

View File

@ -19,6 +19,9 @@ package org.apache.lucene.util;
import java.util.Arrays;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
/*
* Some of this code came from the excellent Unicode
@ -188,6 +191,33 @@ public class TestUnicodeUtil extends LuceneTestCase {
}
}
/**
 * Builds an automaton whose transitions each cover two adjacent code points that straddle a
 * UTF-8 encoded-length boundary (1/2, 2/3 and 3/4 bytes), compiles it, and checks that exactly
 * six strings are enumerated from the compiled automaton: two code points for each of the three
 * ranges.
 *
 * <p>NOTE(review): presumably {@code CompiledAutomaton} converts the code-point automaton to its
 * UTF-8 byte form, so extra accepted strings would indicate invalid byte sequences -- confirm
 * against CompiledAutomaton.
 */
public void testUTF8SpanMultipleBytes() throws Exception {
Automaton.Builder b = new Automaton.Builder();
// start state:
int s1 = b.createState();
// single end accept state:
int s2 = b.createState();
b.setAccept(s2, true);
// 0x7F encodes to 1 UTF-8 byte, 0x80 to 2 bytes
b.addTransition(s1, s2, 0x7F, 0x80);
// 0x7FF encodes to 2 UTF-8 bytes, 0x800 to 3 bytes
b.addTransition(s1, s2, 0x7FF, 0x800);
// 0xFFFF encodes to 3 UTF-8 bytes, 0x10000 to 4 bytes
b.addTransition(s1, s2, 0xFFFF, 0x10000);
Automaton a = b.finish();
CompiledAutomaton c = new CompiledAutomaton(a);
// Enumerate every string accepted by the compiled automaton and count them.
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
int termCount = 0;
for (IntsRef r = it.next(); r != null; r = it.next()) {
termCount++;
}
// 3 ranges x 2 code points each = 6 accepted strings; any more would be unexpected extras.
assertEquals(6, termCount);
}
public void testNewString() {
final int[] codePoints = {
Character.toCodePoint(Character.MIN_HIGH_SURROGATE, Character.MAX_LOW_SURROGATE),

View File

@ -33,7 +33,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util;
public class TestStringsToAutomaton extends LuceneTestCase {
@ -142,22 +141,11 @@ public class TestStringsToAutomaton extends LuceneTestCase {
}
// Make sure every term produced by the automaton is expected
FiniteStringsIterator it = new FiniteStringsIterator(a);
if (isBinary) {
BytesRefBuilder scratch = new BytesRefBuilder();
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = Util.toBytesRef(r, scratch);
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
} else {
// Note that we validate against the original automaton, not the compiled one as the compiled
// automaton can incorrectly produce invalid/overlong utf8 terms (see: GH#12458). This means
// we need slightly different logic here since the automaton "speaks" code points and not
// utf8 bytes.
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
}
BytesRefBuilder scratch = new BytesRefBuilder();
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
for (IntsRef r = it.next(); r != null; r = it.next()) {
BytesRef t = Util.toBytesRef(r, scratch);
assertTrue(expected.contains(t));
}
}