LUCENE-5969, LUCENE-5895: fix sign bit bugs in segment/commit IDs, use byte[] representation

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5969@1627714 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-09-26 05:20:43 +00:00
parent afee9af13f
commit 88648b3a9c
13 changed files with 126 additions and 71 deletions

View File

@ -31,7 +31,7 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <p>
* Files:
* <ul>
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Id, Footer
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer
* </ul>
* </p>
* Data types:
@ -44,7 +44,6 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <li>Diagnostics --&gt; {@link DataOutput#writeStringStringMap Map&lt;String,String&gt;}</li>
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* <li>Id --&gt; {@link DataOutput#writeString String}</li>
* </ul>
* </p>
* Field Descriptions:
@ -88,6 +87,5 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
static final String CODEC_NAME = "Lucene46SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_CHECKSUM = 1;
static final int VERSION_ID = 2;
static final int VERSION_CURRENT = VERSION_ID;
static final int VERSION_CURRENT = VERSION_CHECKSUM;
}

View File

@ -65,13 +65,6 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
final Map<String,String> diagnostics = input.readStringStringMap();
final Set<String> files = input.readStringSet();
String id;
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_ID) {
id = input.readString();
} else {
id = null;
}
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_CHECKSUM) {
CodecUtil.checkFooter(input);
@ -79,7 +72,7 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
CodecUtil.checkEOF(input);
}
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, null);
si.setFiles(files);
return si;

View File

@ -64,7 +64,6 @@ public class Lucene46SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
output.writeString(si.getId());
CodecUtil.writeFooter(output);
success = true;
} finally {

View File

@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@ -109,7 +110,7 @@ public class SimpleTextSegmentInfoReader extends SegmentInfoReader {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
final String id = readString(SI_ID.length, scratch);
final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length());
SimpleTextUtil.checkFooter(input);

View File

@ -107,7 +107,7 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter {
}
SimpleTextUtil.write(output, SI_ID);
SimpleTextUtil.write(output, si.getId(), scratch);
SimpleTextUtil.write(output, new BytesRef(si.getId()));
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.writeChecksum(output, scratch);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
@ -31,6 +32,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
/**
* Utility class for reading and writing versioned headers.
@ -94,12 +96,12 @@ public final class CodecUtil {
* Writes a codec header for a per-segment file, which records a string to
* identify the file, a version number, and the unique ID of the segment.
* This header can be parsed and validated with
* {@link #checkSegmentHeader(DataInput, String, int, int, String) checkSegmentHeader()}.
* {@link #checkSegmentHeader(DataInput, String, int, int, byte[]) checkSegmentHeader()}.
* <p>
* CodecSegmentHeader --&gt; CodecHeader,SegmentID
* <ul>
* <li>CodecHeader --&gt; {@link #writeHeader}
* <li>SegmentID --&gt; {@link DataOutput#writeString String}.
* <li>SegmentID --&gt; {@link DataOutput#writeByte byte}<sup>16</sup>.
* Unique identifier for the segment.
* </ul>
* <p>
@ -113,13 +115,15 @@ public final class CodecUtil {
* @param segmentID Unique identifier for the segment
* @param version Version number
* @throws IOException If there is an I/O error writing to the underlying medium.
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
* @throws IllegalArgumentException If the codec name is not simple ASCII, or
* is more than 127 characters in length, or if segmentID is invalid.
*/
// nocommit: fix javadocs, add segmentLength()
public static void writeSegmentHeader(DataOutput out, String codec, int version, String segmentID) throws IOException {
public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID) throws IOException {
if (segmentID.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID));
}
writeHeader(out, codec, version);
// nocommit: improve encoding of this ID
out.writeString(segmentID);
out.writeBytes(segmentID, 0, segmentID.length);
}
/**
@ -132,6 +136,17 @@ public final class CodecUtil {
public static int headerLength(String codec) {
// Fixed overhead added on top of the codec name's length
// (presumably magic + name-length prefix + version -- confirm against writeHeader).
final int fixedOverhead = 9;
final int nameLength = codec.length();
return fixedOverhead + nameLength;
}
/**
* Returns the total size of a segment header for the given codec name:
* the length of the plain codec header plus the fixed-length segment ID.
*
* @param codec Codec name.
* @return length of the entire segment header.
* @see #writeSegmentHeader(DataOutput, String, int, byte[])
*/
public static int segmentHeaderLength(String codec) {
final int codecHeaderLength = headerLength(codec);
return codecHeaderLength + StringHelper.ID_LENGTH;
}
/**
* Reads and validates a header previously written with
@ -192,7 +207,7 @@ public final class CodecUtil {
/**
* Reads and validates a header previously written with
* {@link #writeSegmentHeader(DataOutput, String, int, String)}.
* {@link #writeSegmentHeader(DataOutput, String, int, byte[])}.
* <p>
* When reading a file, supply the expected <code>codec</code>,
* expected version range (<code>minVersion to maxVersion</code>),
@ -219,13 +234,15 @@ public final class CodecUtil {
* @throws IndexFormatTooNewException If the actual version is greater
* than <code>maxVersion</code>.
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeSegmentHeader(DataOutput, String, int, String)
* @see #writeSegmentHeader(DataOutput, String, int, byte[])
*/
public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, String segmentID) throws IOException {
public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID) throws IOException {
int version = checkHeader(in, codec, minVersion, maxVersion);
String id = in.readString();
if (!id.equals(segmentID)) {
throw new CorruptIndexException("file mismatch, expected segment id=" + segmentID + ", got=" + id, in);
byte id[] = new byte[StringHelper.ID_LENGTH];
in.readBytes(id, 0, id.length);
if (!Arrays.equals(id, segmentID)) {
throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID)
+ ", got=" + StringHelper.idToString(id), in);
}
return version;
}

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
/**
@ -69,7 +70,8 @@ public class Lucene50SegmentInfoReader extends SegmentInfoReader {
final Map<String,String> diagnostics = input.readStringStringMap();
final Set<String> files = input.readStringSet();
String id = input.readString();
byte[] id = new byte[StringHelper.ID_LENGTH];
input.readBytes(id, 0, id.length);
si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
si.setFiles(files);

View File

@ -64,7 +64,8 @@ public class Lucene50SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
output.writeString(si.getId());
byte[] id = si.getId();
output.writeBytes(id, 0, id.length);
CodecUtil.writeFooter(output);
success = true;
} finally {

View File

@ -47,6 +47,7 @@ import org.apache.lucene.util.CommandLineUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
@ -514,7 +515,7 @@ public class CheckIndex {
}
msg(infoStream, "Segments file=" + segmentsFileName + " numSegments=" + numSegments
+ " " + versionString + " id=" + sis.getId() + " format=" + sFormat + userDataString);
+ " " + versionString + " id=" + StringHelper.idToString(sis.getId()) + " format=" + sFormat + userDataString);
if (onlySegments != null) {
result.partial = true;
@ -565,7 +566,7 @@ public class CheckIndex {
try {
msg(infoStream, " version=" + (version == null ? "3.0" : version));
msg(infoStream, " id=" + info.info.getId());
msg(infoStream, " id=" + StringHelper.idToString(info.info.getId()));
final Codec codec = info.info.getCodec();
msg(infoStream, " codec=" + codec);
segInfoStat.codec = codec;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index;
*/
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@ -59,7 +60,7 @@ public final class SegmentInfo {
private boolean isCompoundFile;
/** Id that uniquely identifies this segment. */
private final String id;
private final byte[] id;
private Codec codec;
@ -89,7 +90,7 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics,
String id) {
byte[] id) {
assert !(dir instanceof TrackingDirectoryWrapper);
this.dir = dir;
this.version = version;
@ -99,6 +100,9 @@ public final class SegmentInfo {
this.codec = codec;
this.diagnostics = diagnostics;
this.id = id;
if (id != null && id.length != StringHelper.ID_LENGTH) {
throw new IllegalArgumentException("invalid id: " + Arrays.toString(id));
}
}
/**
@ -218,8 +222,8 @@ public final class SegmentInfo {
}
/** Return the id that uniquely identifies this segment. */
public String getId() {
return id;
public byte[] getId() {
return id == null ? null : id.clone();
}
private Set<String> setFiles;

View File

@ -125,8 +125,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
/** The file format version for the segments_N codec header, since 4.9+ */
public static final int VERSION_49 = 3;
/** The file format version for the segments_N codec header, since 4.11+ */
public static final int VERSION_411 = 4;
/** The file format version for the segments_N codec header, since 5.0+ */
public static final int VERSION_50 = 4;
/** Used to name new segments. */
// TODO: should this be a long ...?
@ -151,8 +151,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
*/
private static PrintStream infoStream = null;
/** Id for this commit; only written starting with Lucene 4.11 */
private String id;
/** Id for this commit; only written starting with Lucene 5.0 */
private byte[] id;
/** Sole constructor. Typically you call this and then
* use {@link #read(Directory) or
@ -262,10 +262,10 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
nextGeneration);
}
/** Since Lucene 4.11, every commit (segments_N) writes a unique id. This will
* return that id, or null if this commit was pre-4.11. */
public String getId() {
return id;
/** Since Lucene 5.0, every commit (segments_N) writes a unique id. This will
* return that id, or null if this commit was pre-5.0. */
public byte[] getId() {
return id == null ? null : id.clone();
}
/**
@ -296,7 +296,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
}
// 4.0+
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_411);
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_50);
version = input.readLong();
counter = input.readInt();
int numSegments = input.readInt();
@ -361,8 +361,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
add(siPerCommit);
}
userData = input.readStringStringMap();
if (format >= VERSION_411) {
id = input.readString();
if (format >= VERSION_50) {
id = new byte[StringHelper.ID_LENGTH];
input.readBytes(id, 0, id.length);
}
if (format >= VERSION_48) {
@ -425,7 +426,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
try {
segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_411);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_50);
segnOutput.writeLong(version);
segnOutput.writeInt(counter); // write counter
segnOutput.writeInt(size()); // write infos
@ -451,7 +452,8 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
assert si.dir == directory;
}
segnOutput.writeStringStringMap(userData);
segnOutput.writeString(StringHelper.randomId());
byte[] id = StringHelper.randomId();
segnOutput.writeBytes(id, 0, id.length);
CodecUtil.writeFooter(segnOutput);
segnOutput.close();
directory.sync(Collections.singleton(segmentFileName));

View File

@ -232,21 +232,21 @@ public abstract class StringHelper {
// Holds 128 bit unsigned value:
private static BigInteger nextId;
private static final BigInteger idMask;
private static final BigInteger mask128;
private static final Object idLock = new Object();
private static final String idPad = "00000000000000000000000000000000";
static {
byte[] maskBytes = new byte[16];
Arrays.fill(maskBytes, (byte) 0xff);
idMask = new BigInteger(maskBytes);
// 128 bit unsigned mask
byte[] maskBytes128 = new byte[16];
Arrays.fill(maskBytes128, (byte) 0xff);
mask128 = new BigInteger(1, maskBytes128);
String prop = System.getProperty("tests.seed");
// State for xorshift128:
long x0;
long x1;
long seed;
if (prop != null) {
// So if there is a test failure that somehow relied on this id,
// we remain reproducible based on the test seed:
@ -280,17 +280,25 @@ public abstract class StringHelper {
s1 ^= s1 << 23; // a
x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
}
// 64-bit unsigned mask
byte[] maskBytes64 = new byte[8];
Arrays.fill(maskBytes64, (byte) 0xff);
BigInteger mask64 = new BigInteger(1, maskBytes64);
// First make unsigned versions of x0, x1:
BigInteger unsignedX0 = new BigInteger(1, BigInteger.valueOf(x0).toByteArray());
BigInteger unsignedX1 = new BigInteger(1, BigInteger.valueOf(x1).toByteArray());
BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);
// Concatenate bits of x0 and x1, as unsigned 128 bit integer:
nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
}
/** length in bytes of an ID */
public static final int ID_LENGTH = 16;
/** Generates a non-cryptographic globally unique id. */
public static String randomId() {
public static byte[] randomId() {
// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
//
@ -306,15 +314,42 @@ public abstract class StringHelper {
// what impact that has on the period, whereas the simple ++ (mod 2^128)
// we use here is guaranteed to have the full period.
String id;
byte bits[];
synchronized(idLock) {
id = nextId.toString(16);
nextId = nextId.add(BigInteger.ONE).and(idMask);
bits = nextId.toByteArray();
nextId = nextId.add(BigInteger.ONE).and(mask128);
}
// toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
if (bits.length > ID_LENGTH) {
assert bits.length == ID_LENGTH + 1;
assert bits[0] == 0;
return Arrays.copyOfRange(bits, 1, bits.length);
} else {
byte[] result = new byte[ID_LENGTH];
System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
return result;
}
}
/**
* Helper method to render an ID as a string, for debugging
* <p>
* Returns the string {@code (null)} if the id is null.
* Otherwise, returns a string representation for debugging.
* Never throws an exception. The returned string may
* indicate if the id is definitely invalid.
*/
public static String idToString(byte id[]) {
if (id == null) {
return "(null)";
} else {
StringBuilder sb = new StringBuilder();
sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
if (id.length != ID_LENGTH) {
sb.append(" (INVALID FORMAT)");
}
return sb.toString();
}
assert id.length() <= 32: "id=" + id;
id = idPad.substring(id.length()) + id;
return id;
}
}

View File

@ -2767,11 +2767,13 @@ public class TestIndexWriter extends LuceneTestCase {
SegmentInfos sis = new SegmentInfos();
sis.read(d);
String id1 = sis.getId();
byte[] id1 = sis.getId();
assertNotNull(id1);
assertEquals(StringHelper.ID_LENGTH, id1.length);
String id2 = sis.info(0).info.getId();
byte[] id2 = sis.info(0).info.getId();
assertNotNull(id2);
assertEquals(StringHelper.ID_LENGTH, id2.length);
// Make sure CheckIndex includes id output:
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
@ -2784,14 +2786,14 @@ public class TestIndexWriter extends LuceneTestCase {
assertTrue(s, indexStatus != null && indexStatus.clean);
// Commit id is always stored:
assertTrue("missing id=" + id1 + " in:\n" + s, s.contains("id=" + id1));
assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1)));
assertTrue("missing id=" + id2 + " in:\n" + s, s.contains("id=" + id2));
// Also verify the per-segment id (id2); checking id1 a second time would leave it untested:
assertTrue("missing id=" + StringHelper.idToString(id2) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id2)));
d.close();
Set<String> ids = new HashSet<>();
for(int i=0;i<100000;i++) {
String id = StringHelper.randomId();
String id = StringHelper.idToString(StringHelper.randomId());
assertFalse("id=" + id + " i=" + i, ids.contains(id));
ids.add(id);
}