Output well-formed UTF-8 bytes in SimpleTextCodec's segmentinfos (#12897)

SimpleTextSegmentInfoFormat was writing the random byte array used as a
segment's ID directly to the file, instead of converting it to a simple
text representation of the byte array. As a result, the segment info
files often contained bytes that were not well-formed UTF-8.
This commit is contained in:
Michael Froh 2024-01-11 14:45:48 +00:00 committed by GitHub
parent 75e1a0b96c
commit b7728c5657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 3 deletions

View File

@ -36,7 +36,6 @@ import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CollectionUtil;
@ -164,7 +163,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length());
final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes;
if (!Arrays.equals(segmentID, id)) {
throw new CorruptIndexException(
@ -307,7 +306,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
}
SimpleTextUtil.write(output, SI_ID);
SimpleTextUtil.write(output, new BytesRef(si.getId()));
SimpleTextUtil.write(output, new BytesRef(si.getId()).toString(), scratch);
SimpleTextUtil.writeNewline(output);
Sort indexSort = si.getIndexSort();

View File

@ -16,8 +16,18 @@
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.Collections;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
/** Tests SimpleTextSegmentInfoFormat */
@ -33,4 +43,40 @@ public class TestSimpleTextSegmentInfoFormat extends BaseSegmentInfoFormatTestCa
protected Codec getCodec() {
// Hands back this test class's `codec` field (declared outside the lines shown here);
// presumably the SimpleText codec under test -- confirm against the field declaration.
return codec;
}
/**
 * Verifies that the segment info file written by the codec's segment info format consists of
 * well-formed UTF-8 bytes.
 *
 * <p>Writes a segment info carrying a random ID, reads the whole resulting file back into
 * memory, and compares the raw rendering of those bytes with {@code Term.toString}.
 *
 * @throws IOException if writing or reading the segment info file fails
 */
public void testFileIsUTF8() throws IOException {
  // try-with-resources so the directory is closed even when an assertion below fails.
  try (Directory dir = newDirectory()) {
    Codec codec = getCodec();
    byte[] id = StringHelper.randomId();
    SegmentInfo info =
        new SegmentInfo(
            dir,
            getVersions()[0],
            getVersions()[0],
            "_123",
            1,
            false,
            false,
            codec,
            Collections.<String, String>emptyMap(),
            id,
            Collections.emptyMap(),
            null);
    info.setFiles(Collections.<String>emptySet());
    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);

    String segFileName =
        IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
    try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) {
      long length = input.length();
      if (length > 5_000) {
        // Avoid allocating a huge array if the length is wrong
        fail("SegmentInfos should not be this large");
      }
      byte[] bytes = new byte[(int) length];
      // Actually load the file contents; without this read the check below would only ever
      // see a freshly allocated all-zero buffer and could never detect malformed output.
      input.readBytes(bytes, 0, bytes.length);
      BytesRef bytesRef = new BytesRef(bytes);
      // If the following are equal, it means the bytes were not well-formed UTF8.
      assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
    }
  }
}
}