mirror of https://github.com/apache/lucene.git
Output well-formed UTF-8 bytes in SimpleTextCodec's segmentinfos (#12897)
SimpleTextSegmentInfoFormat was writing the random byte array used as a segment's ID directly, without converting it to a text representation of the bytes. As a result, the segment info files were often not well-formed UTF-8.
This commit is contained in:
parent
75e1a0b96c
commit
b7728c5657
|
@ -36,7 +36,6 @@ import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.CollectionUtil;
|
import org.apache.lucene.util.CollectionUtil;
|
||||||
|
@ -164,7 +163,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
||||||
|
|
||||||
SimpleTextUtil.readLine(input, scratch);
|
SimpleTextUtil.readLine(input, scratch);
|
||||||
assert StringHelper.startsWith(scratch.get(), SI_ID);
|
assert StringHelper.startsWith(scratch.get(), SI_ID);
|
||||||
final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length());
|
final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes;
|
||||||
|
|
||||||
if (!Arrays.equals(segmentID, id)) {
|
if (!Arrays.equals(segmentID, id)) {
|
||||||
throw new CorruptIndexException(
|
throw new CorruptIndexException(
|
||||||
|
@ -307,7 +306,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
||||||
}
|
}
|
||||||
|
|
||||||
SimpleTextUtil.write(output, SI_ID);
|
SimpleTextUtil.write(output, SI_ID);
|
||||||
SimpleTextUtil.write(output, new BytesRef(si.getId()));
|
SimpleTextUtil.write(output, new BytesRef(si.getId()).toString(), scratch);
|
||||||
SimpleTextUtil.writeNewline(output);
|
SimpleTextUtil.writeNewline(output);
|
||||||
|
|
||||||
Sort indexSort = si.getIndexSort();
|
Sort indexSort = si.getIndexSort();
|
||||||
|
|
|
@ -16,8 +16,18 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.simpletext;
|
package org.apache.lucene.codecs.simpletext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.SegmentInfo;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
|
import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/** Tests SimpleTextSegmentInfoFormat */
|
/** Tests SimpleTextSegmentInfoFormat */
|
||||||
|
@ -33,4 +43,40 @@ public class TestSimpleTextSegmentInfoFormat extends BaseSegmentInfoFormatTestCa
|
||||||
protected Codec getCodec() {
|
protected Codec getCodec() {
|
||||||
return codec;
|
return codec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testFileIsUTF8() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
Codec codec = getCodec();
|
||||||
|
byte[] id = StringHelper.randomId();
|
||||||
|
SegmentInfo info =
|
||||||
|
new SegmentInfo(
|
||||||
|
dir,
|
||||||
|
getVersions()[0],
|
||||||
|
getVersions()[0],
|
||||||
|
"_123",
|
||||||
|
1,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
codec,
|
||||||
|
Collections.<String, String>emptyMap(),
|
||||||
|
id,
|
||||||
|
Collections.emptyMap(),
|
||||||
|
null);
|
||||||
|
info.setFiles(Collections.<String>emptySet());
|
||||||
|
codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
|
||||||
|
String segFileName =
|
||||||
|
IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
|
||||||
|
try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) {
|
||||||
|
long length = input.length();
|
||||||
|
if (length > 5_000) {
|
||||||
|
// Avoid allocating a huge array if the length is wrong
|
||||||
|
fail("SegmentInfos should not be this large");
|
||||||
|
}
|
||||||
|
byte[] bytes = new byte[(int) length];
|
||||||
|
BytesRef bytesRef = new BytesRef(bytes);
|
||||||
|
// If the following are equal, it means the bytes were not well-formed UTF8.
|
||||||
|
assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
|
||||||
|
}
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue