Output well-formed UTF-8 bytes in SimpleTextCodec's segmentinfos (#12897)

The SimpleTextSegmentInfoFormat was writing the random byte array used as a segment's ID directly, without first converting it to a plain-text representation. As a result, the written segment info files often contained bytes that were not well-formed UTF-8.
2024-01-11 14:45:48 +00:00 · 2024-01-11 14:45:48 +00:00 · b7728c5657
parent 75e1a0b96c
commit b7728c5657
2 changed files with 48 additions and 3 deletions
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
@ -36,7 +36,6 @@ import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CollectionUtil;
@ -164,7 +163,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
      SimpleTextUtil.readLine(input, scratch);
      assert StringHelper.startsWith(scratch.get(), SI_ID);
-      final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length());
+      final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes;
      if (!Arrays.equals(segmentID, id)) {
        throw new CorruptIndexException(
@ -307,7 +306,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
      }
      SimpleTextUtil.write(output, SI_ID);
-      SimpleTextUtil.write(output, new BytesRef(si.getId()));
+      SimpleTextUtil.write(output, new BytesRef(si.getId()).toString(), scratch);
      SimpleTextUtil.writeNewline(output);
      Sort indexSort = si.getIndexSort();
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
@ -16,8 +16,18 @@
 */
 package org.apache.lucene.codecs.simpletext;
 import java.io.IOException;
 import java.util.Collections;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
 /** Tests SimpleTextSegmentInfoFormat */
@ -33,4 +43,40 @@ public class TestSimpleTextSegmentInfoFormat extends BaseSegmentInfoFormatTestCa
  protected Codec getCodec() {
    // Hands back the codec held in this test class's `codec` field (declared outside this hunk).
    return codec;
  }
  /**
   * Writes a segment info with a random segment ID through the codec under test, then reads the
   * resulting file back in full and checks that its bytes decode as well-formed UTF-8.
   *
   * @throws IOException if writing or reading the segment info file fails
   */
  public void testFileIsUTF8() throws IOException {
    Directory dir = newDirectory();
    Codec codec = getCodec();
    byte[] id = StringHelper.randomId();
    SegmentInfo info =
        new SegmentInfo(
            dir,
            getVersions()[0],
            getVersions()[0],
            "_123",
            1,
            false,
            false,
            codec,
            Collections.<String, String>emptyMap(),
            id,
            Collections.emptyMap(),
            null);
    info.setFiles(Collections.<String>emptySet());
    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
    String segFileName =
        IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
    try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) {
      long length = input.length();
      if (length > 5_000) {
        // Avoid allocating a huge array if the length is wrong
        fail("SegmentInfos should not be this large");
      }
      byte[] bytes = new byte[(int) length];
      // Actually load the file contents. Without this read the array stays zero-filled, and a
      // run of NUL bytes is valid UTF-8, so the assertion below would pass no matter what the
      // codec wrote.
      input.readBytes(bytes, 0, bytes.length);
      BytesRef bytesRef = new BytesRef(bytes);
      // If the following are equal, it means the bytes were not well-formed UTF8.
      assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
    }
    dir.close();
  }
 }