Fix UTF32toUTF8 will produce invalid transition (#12472)

2023-08-17 04:59:07 +08:00 · 2023-08-17 04:59:07 +08:00 · ec1367862d
parent 4174b521dd
commit ec1367862d
4 changed files with 62 additions and 46 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -193,6 +193,10 @@ Bug Fixes
 * GITHUB#12451: Change TestStringsToAutomaton validation to avoid automaton conversion bug discovered in GH#12458
  (Greg Miller).

+* GITHUB#2472: UTF32ToUTF8 would sometimes accept extra invalid UTF-8 binary sequences.  This should not have any
+  impact on the user, unless you explicitly invoke the convert function of UTF32ToUTF8, and in the extremely rare
+  scenario of searching a non-UTF-8 inverted field with Unicode search terms (Tang Donghai).
+
 Other
 ---------------------

--- a/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
@ -35,12 +35,12 @@ public final class UTF32ToUTF8 {
  private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
  private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};

-  static int[] MASKS = new int[32];
+  static int[] MASKS = new int[8];

  static {
    int v = 2;
-    for (int i = 0; i < 32; i++) {
-      MASKS[i] = v - 1;
+    for (int i = 0; i < 7; i++) {
+      MASKS[i + 1] = v - 1;
      v *= 2;
    }
  }
@ -103,7 +103,7 @@ public final class UTF32ToUTF8 {

    private void setRest(int code, int numBytes) {
      for (int i = 0; i < numBytes; i++) {
-        bytes[numBytes - i].value = 128 | (code & MASKS[5]);
+        bytes[numBytes - i].value = 128 | (code & MASKS[6]);
        bytes[numBytes - i].bits = 6;
        code = code >> 6;
      }
@ -154,15 +154,12 @@ public final class UTF32ToUTF8 {

        // Single value leading edge
        utf8.addTransition(start, n, startUTF8.byteAt(upto));
-        // start.addTransition(new Transition(startUTF8.byteAt(upto), n));  // type=single

        // Recurse for the rest
        build(n, end, startUTF8, endUTF8, 1 + upto);
      }
    } else if (startUTF8.len == endUTF8.len) {
      if (upto == startUTF8.len - 1) {
-        // start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));
-        //      // type=startend
        utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
      } else {
        start(start, end, startUTF8, upto, false);
@ -206,15 +203,12 @@ public final class UTF32ToUTF8 {
          start,
          end,
          startUTF8.byteAt(upto),
-          startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1]); // type=start
-      // start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) |
-      // MASKS[startUTF8.numBits(upto)-1], end));  // type=start
+          startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)]); // type=start
    } else {
      int n = utf8.createState();
      utf8.addTransition(start, n, startUTF8.byteAt(upto));
-      // start.addTransition(new Transition(startUTF8.byteAt(upto), n));  // type=start
      start(n, end, startUTF8, 1 + upto, true);
-      int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto) - 1];
+      int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)];
      if (doAll && startUTF8.byteAt(upto) != endCode) {
        all(start, end, startUTF8.byteAt(upto) + 1, endCode, startUTF8.len - upto - 1);
      }
@ -224,28 +218,32 @@ public final class UTF32ToUTF8 {
  private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
    if (upto == endUTF8.len - 1) {
      // Done recursing
-      // start.addTransition(new Transition(endUTF8.byteAt(upto) &
-      // (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end));   // type=end
      utf8.addTransition(
-          start,
-          end,
-          endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]),
-          endUTF8.byteAt(upto));
+          start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]), endUTF8.byteAt(upto));
    } else {
      final int startCode;
-      if (endUTF8.numBits(upto) == 5) {
-        // special case -- avoid created unused edges (endUTF8
-        // doesn't accept certain byte sequences) -- there
-        // are other cases we could optimize too:
-        startCode = 194;
+      // GH-ISSUE#12472: UTF-8 special case for the different start byte of the different
+      // length=2,3,4
+      if (endUTF8.len == 2) {
+        assert upto == 0; // the upto==1 case will be handled by the first if above
+        // the first length=2 UTF8 Unicode character is C2 80,
+        // so we must special case 0xC2 as the 1st byte.
+        startCode = 0xC2;
+      } else if (endUTF8.len == 3 && upto == 1 && endUTF8.byteAt(0) == 0xE0) {
+        // the first length=3 UTF8 Unicode character is E0 A0 80,
+        // so we must special case 0xA0 as the 2nd byte when E0 was the first byte of endUTF8.
+        startCode = 0xA0;
+      } else if (endUTF8.len == 4 && upto == 1 && endUTF8.byteAt(0) == 0xF0) {
+        // the first length=4 UTF8 Unicode character is F0 90 80 80,
+        // so we must special case 0x90 as the 2nd byte when F0 was the first byte of endUTF8.
+        startCode = 0x90;
      } else {
-        startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto) - 1]);
+        startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)]);
      }
      if (doAll && endUTF8.byteAt(upto) != startCode) {
        all(start, end, startCode, endUTF8.byteAt(upto) - 1, endUTF8.len - upto - 1);
      }
      int n = utf8.createState();
-      // start.addTransition(new Transition(endUTF8.byteAt(upto), n));  // type=end
      utf8.addTransition(start, n, endUTF8.byteAt(upto));
      end(n, end, endUTF8, 1 + upto, true);
    }
@ -253,20 +251,16 @@ public final class UTF32ToUTF8 {

  private void all(int start, int end, int startCode, int endCode, int left) {
    if (left == 0) {
-      // start.addTransition(new Transition(startCode, endCode, end));  // type=all
      utf8.addTransition(start, end, startCode, endCode);
    } else {
      int lastN = utf8.createState();
-      // start.addTransition(new Transition(startCode, endCode, lastN));  // type=all
      utf8.addTransition(start, lastN, startCode, endCode);
      while (left > 1) {
        int n = utf8.createState();
-        // lastN.addTransition(new Transition(128, 191, n));  // type=all*
        utf8.addTransition(lastN, n, 128, 191); // type=all*
        left--;
        lastN = n;
      }
-      // lastN.addTransition(new Transition(128, 191, end)); // type = all*
      utf8.addTransition(lastN, end, 128, 191); // type = all*
    }
  }
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@ -19,6 +19,9 @@ package org.apache.lucene.util;
 import java.util.Arrays;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.FiniteStringsIterator;

 /*
 * Some of this code came from the excellent Unicode
@ -188,6 +191,33 @@ public class TestUnicodeUtil extends LuceneTestCase {
    }
  }

+  public void testUTF8SpanMultipleBytes() throws Exception {
+    Automaton.Builder b = new Automaton.Builder();
+    // start state:
+    int s1 = b.createState();
+
+    // single end accept state:
+    int s2 = b.createState();
+    b.setAccept(s2, true);
+
+    // utf8 codepoint length range from [1,2]
+    b.addTransition(s1, s2, 0x7F, 0x80);
+    // utf8 codepoint length range from [2,3]
+    b.addTransition(s1, s2, 0x7FF, 0x800);
+    // utf8 codepoint length range from [3,4]
+    b.addTransition(s1, s2, 0xFFFF, 0x10000);
+
+    Automaton a = b.finish();
+
+    CompiledAutomaton c = new CompiledAutomaton(a);
+    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+    int termCount = 0;
+    for (IntsRef r = it.next(); r != null; r = it.next()) {
+      termCount++;
+    }
+    assertEquals(6, termCount);
+  }
+
  public void testNewString() {
    final int[] codePoints = {
      Character.toCodePoint(Character.MIN_HIGH_SURROGATE, Character.MAX_LOW_SURROGATE),
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@ -33,7 +33,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.Util;

 public class TestStringsToAutomaton extends LuceneTestCase {
@ -142,22 +141,11 @@ public class TestStringsToAutomaton extends LuceneTestCase {
    }

    // Make sure every term produced by the automaton is expected
-    FiniteStringsIterator it = new FiniteStringsIterator(a);
-    if (isBinary) {
-      BytesRefBuilder scratch = new BytesRefBuilder();
-      for (IntsRef r = it.next(); r != null; r = it.next()) {
-        BytesRef t = Util.toBytesRef(r, scratch);
-        assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
-      }
-    } else {
-      // Note that we validate against the original automaton, not the compiled one as the compiled
-      // automaton can incorrectly produce invalid/overlong utf8 terms (see: GH#12458). This means
-      // we need slightly different logic here since the automaton "speaks" code points and not
-      // utf8 bytes.
-      for (IntsRef r = it.next(); r != null; r = it.next()) {
-        BytesRef t = newBytesRef(UnicodeUtil.newString(r.ints, r.offset, r.length));
-        assertTrue(t + " unexpectedly produced by automaton", expected.contains(t));
-      }
+    BytesRefBuilder scratch = new BytesRefBuilder();
+    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+    for (IntsRef r = it.next(); r != null; r = it.next()) {
+      BytesRef t = Util.toBytesRef(r, scratch);
+      assertTrue(expected.contains(t));
    }
  }