diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index accdb184df8..5480c0fec0c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -36,7 +36,6 @@ import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.CollectionUtil; @@ -164,7 +163,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_ID); - final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length()); + final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes; if (!Arrays.equals(segmentID, id)) { throw new CorruptIndexException( @@ -307,7 +306,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { } SimpleTextUtil.write(output, SI_ID); - SimpleTextUtil.write(output, new BytesRef(si.getId())); + SimpleTextUtil.write(output, new BytesRef(si.getId()).toString(), scratch); SimpleTextUtil.writeNewline(output); Sort indexSort = si.getIndexSort(); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java index 610f5a2d756..ee55977a896 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java +++ 
b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
@@ -16,8 +16,18 @@
  */
 package org.apache.lucene.codecs.simpletext;
 
+import java.io.IOException;
+import java.util.Collections;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
 
 /** Tests SimpleTextSegmentInfoFormat */
@@ -33,4 +43,48 @@ public class TestSimpleTextSegmentInfoFormat extends BaseSegmentInfoFormatTestCa
   protected Codec getCodec() {
     return codec;
   }
+
+  /**
+   * Writes a segment info with a random id through the codec, reads the resulting file back and
+   * checks that its bytes decode as well-formed UTF-8.
+   */
+  public void testFileIsUTF8() throws IOException {
+    // try-with-resources so the directory is closed even when an assertion below fails
+    try (Directory dir = newDirectory()) {
+      Codec codec = getCodec();
+      byte[] id = StringHelper.randomId();
+      SegmentInfo info =
+          new SegmentInfo(
+              dir,
+              getVersions()[0],
+              getVersions()[0],
+              "_123",
+              1,
+              false,
+              false,
+              codec,
+              Collections.emptyMap(),
+              id,
+              Collections.emptyMap(),
+              null);
+      info.setFiles(Collections.emptySet());
+      codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
+      String segFileName =
+          IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
+      try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) {
+        long length = input.length();
+        if (length > 5_000) {
+          // Avoid allocating a huge array if the length is wrong
+          fail("SegmentInfos should not be this large");
+        }
+        byte[] bytes = new byte[(int) length];
+        // Load the file contents into the buffer; without this read the check below would
+        // inspect a zero-filled array instead of what was actually written.
+        input.readBytes(bytes, 0, bytes.length);
+        BytesRef bytesRef = new BytesRef(bytes);
+        // If the following are equal, it means the bytes were not well-formed UTF8.
+        assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
+      }
+    }
+  }
 }