files = input.readStringSet();
-
- String id;
- if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_ID) {
- id = input.readString();
- } else {
- id = null;
- }
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_CHECKSUM) {
CodecUtil.checkFooter(input);
@@ -79,7 +72,7 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
CodecUtil.checkEOF(input);
}
- final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
+ final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, null);
si.setFiles(files);
return si;
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java
index 10b2d5f4f15..a8db471154d 100755
--- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java
@@ -64,7 +64,6 @@ public class Lucene46SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
- output.writeString(si.getId());
CodecUtil.writeFooter(output);
success = true;
} finally {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java
index 6e1708420a1..4e0f7db3947 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java
@@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -109,7 +110,7 @@ public class SimpleTextSegmentInfoReader extends SegmentInfoReader {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
- final String id = readString(SI_ID.length, scratch);
+ final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length());
SimpleTextUtil.checkFooter(input);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java
index 9daf4cbe8b1..0350acc8ac2 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java
@@ -107,7 +107,7 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter {
}
SimpleTextUtil.write(output, SI_ID);
- SimpleTextUtil.write(output, si.getId(), scratch);
+ SimpleTextUtil.write(output, new BytesRef(si.getId()));
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.writeChecksum(output, scratch);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
index d46beed81f3..4c109721aa0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
@@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
@@ -31,6 +32,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.StringHelper;
/**
* Utility class for reading and writing versioned headers.
@@ -94,12 +96,12 @@ public final class CodecUtil {
* Writes a codec header for a per-segment, which records both a string to
* identify the file, a version number, and the unique ID of the segment.
* This header can be parsed and validated with
- * {@link #checkSegmentHeader(DataInput, String, int, int, String) checkSegmentHeader()}.
+ * {@link #checkSegmentHeader(DataInput, String, int, int, byte[]) checkSegmentHeader()}.
*
* CodecSegmentHeader --> CodecHeader,SegmentID
*
* - CodecHeader --> {@link #writeHeader}
- *
- SegmentID --> {@link DataOutput#writeString String}.
+ *
- SegmentID --> {@link DataOutput#writeByte byte}16.
* Unique identifier for the segment.
*
*
@@ -113,13 +115,15 @@ public final class CodecUtil {
* @param segmentID Unique identifier for the segment
* @param version Version number
* @throws IOException If there is an I/O error writing to the underlying medium.
- * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
+ * @throws IllegalArgumentException If the codec name is not simple ASCII, or
+ * is more than 127 characters in length, or if segmentID is invalid.
*/
- // nocommit: fix javadocs, add segmentLength()
- public static void writeSegmentHeader(DataOutput out, String codec, int version, String segmentID) throws IOException {
+ public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID) throws IOException {
+ if (segmentID.length != StringHelper.ID_LENGTH) {
+ throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID));
+ }
writeHeader(out, codec, version);
- // nocommit: improve encoding of this ID
- out.writeString(segmentID);
+ out.writeBytes(segmentID, 0, segmentID.length);
}
/**
@@ -132,6 +136,17 @@ public final class CodecUtil {
public static int headerLength(String codec) {
return 9+codec.length();
}
+
+ /**
+ * Computes the length of a segment header.
+ *
+ * @param codec Codec name.
+ * @return length of the entire segment header.
+ * @see #writeSegmentHeader(DataOutput, String, int, byte[])
+ */
+ public static int segmentHeaderLength(String codec) {
+ return headerLength(codec) + StringHelper.ID_LENGTH;
+ }
/**
* Reads and validates a header previously written with
@@ -192,7 +207,7 @@ public final class CodecUtil {
/**
* Reads and validates a header previously written with
- * {@link #writeSegmentHeader(DataOutput, String, int, String)}.
+ * {@link #writeSegmentHeader(DataOutput, String, int, byte[])}.
*
* When reading a file, supply the expected codec
,
* expected version range (minVersion to maxVersion
),
@@ -219,13 +234,15 @@ public final class CodecUtil {
* @throws IndexFormatTooNewException If the actual version is greater
* than maxVersion
.
* @throws IOException If there is an I/O error reading from the underlying medium.
- * @see #writeSegmentHeader(DataOutput, String, int, String)
+ * @see #writeSegmentHeader(DataOutput, String, int, byte[])
*/
- public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, String segmentID) throws IOException {
+ public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID) throws IOException {
int version = checkHeader(in, codec, minVersion, maxVersion);
- String id = in.readString();
- if (!id.equals(segmentID)) {
- throw new CorruptIndexException("file mismatch, expected segment id=" + segmentID + ", got=" + id, in);
+ byte id[] = new byte[StringHelper.ID_LENGTH];
+ in.readBytes(id, 0, id.length);
+ if (!Arrays.equals(id, segmentID)) {
+ throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID)
+ + ", got=" + StringHelper.idToString(id), in);
}
return version;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java
index a6721c82d3a..21b6e1637d0 100755
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java
@@ -30,6 +30,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
/**
@@ -69,7 +70,8 @@ public class Lucene50SegmentInfoReader extends SegmentInfoReader {
final Map diagnostics = input.readStringStringMap();
final Set files = input.readStringSet();
- String id = input.readString();
+ byte[] id = new byte[StringHelper.ID_LENGTH];
+ input.readBytes(id, 0, id.length);
si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
si.setFiles(files);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java
index 62341035f23..ab407d9221c 100755
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java
@@ -64,7 +64,8 @@ public class Lucene50SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
- output.writeString(si.getId());
+ byte[] id = si.getId();
+ output.writeBytes(id, 0, id.length);
CodecUtil.writeFooter(output);
success = true;
} finally {
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index db4a5a33657..59ab2b77773 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -47,6 +47,7 @@ import org.apache.lucene.util.CommandLineUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
@@ -514,7 +515,7 @@ public class CheckIndex {
}
msg(infoStream, "Segments file=" + segmentsFileName + " numSegments=" + numSegments
- + " " + versionString + " id=" + sis.getId() + " format=" + sFormat + userDataString);
+ + " " + versionString + " id=" + StringHelper.idToString(sis.getId()) + " format=" + sFormat + userDataString);
if (onlySegments != null) {
result.partial = true;
@@ -565,7 +566,7 @@ public class CheckIndex {
try {
msg(infoStream, " version=" + (version == null ? "3.0" : version));
- msg(infoStream, " id=" + info.info.getId());
+ msg(infoStream, " id=" + StringHelper.idToString(info.info.getId()));
final Codec codec = info.info.getCodec();
msg(infoStream, " codec=" + codec);
segInfoStat.codec = codec;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
index 2437f6d722b..2ae6f3b09e8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
@@ -18,6 +18,7 @@ package org.apache.lucene.index;
*/
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -59,7 +60,7 @@ public final class SegmentInfo {
private boolean isCompoundFile;
/** Id that uniquely identifies this segment. */
- private final String id;
+ private final byte[] id;
private Codec codec;
@@ -89,7 +90,7 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map diagnostics,
- String id) {
+ byte[] id) {
assert !(dir instanceof TrackingDirectoryWrapper);
this.dir = dir;
this.version = version;
@@ -99,6 +100,9 @@ public final class SegmentInfo {
this.codec = codec;
this.diagnostics = diagnostics;
this.id = id;
+ if (id != null && id.length != StringHelper.ID_LENGTH) {
+ throw new IllegalArgumentException("invalid id: " + Arrays.toString(id));
+ }
}
/**
@@ -218,8 +222,8 @@ public final class SegmentInfo {
}
/** Return the id that uniquely identifies this segment. */
- public String getId() {
- return id;
+ public byte[] getId() {
+ return id == null ? null : id.clone();
}
private Set setFiles;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
index 70824c5f7f3..6f69e3a1e63 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
@@ -125,8 +125,8 @@ public final class SegmentInfos implements Cloneable, Iterable= VERSION_411) {
- id = input.readString();
+ if (format >= VERSION_50) {
+ id = new byte[StringHelper.ID_LENGTH];
+ input.readBytes(id, 0, id.length);
}
if (format >= VERSION_48) {
@@ -425,7 +426,7 @@ public final class SegmentInfos implements Cloneable, Iterable>> 17) ^ (s0 >>> 26); // b, c
}
+
+ // 64-bit unsigned mask
+ byte[] maskBytes64 = new byte[8];
+ Arrays.fill(maskBytes64, (byte) 0xff);
+ BigInteger mask64 = new BigInteger(1, maskBytes64);
// First make unsigned versions of x0, x1:
- BigInteger unsignedX0 = new BigInteger(1, BigInteger.valueOf(x0).toByteArray());
- BigInteger unsignedX1 = new BigInteger(1, BigInteger.valueOf(x1).toByteArray());
+ BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
+ BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);
// Concatentate bits of x0 and x1, as unsigned 128 bit integer:
nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
}
+
+ /** length in bytes of an ID */
+ public static final int ID_LENGTH = 16;
/** Generates a non-cryptographic globally unique id. */
- public static String randomId() {
+ public static byte[] randomId() {
// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
//
@@ -306,15 +314,42 @@ public abstract class StringHelper {
// what impact that has on the period, whereas the simple ++ (mod 2^128)
// we use here is guaranteed to have the full period.
- String id;
+ byte bits[];
synchronized(idLock) {
- id = nextId.toString(16);
- nextId = nextId.add(BigInteger.ONE).and(idMask);
+ bits = nextId.toByteArray();
+ nextId = nextId.add(BigInteger.ONE).and(mask128);
+ }
+
+ // toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
+ if (bits.length > ID_LENGTH) {
+ assert bits.length == ID_LENGTH + 1;
+ assert bits[0] == 0;
+ return Arrays.copyOfRange(bits, 1, bits.length);
+ } else {
+ byte[] result = new byte[ID_LENGTH];
+ System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
+ return result;
+ }
+ }
+
+ /**
+ * Helper method to render an ID as a string, for debugging
+ *
+ * Returns the string {@code (null)} if the id is null.
+ * Otherwise, returns a string representation for debugging.
+ * Never throws an exception. The returned string may
+ * indicate if the id is definitely invalid.
+ */
+ public static String idToString(byte id[]) {
+ if (id == null) {
+ return "(null)";
+ } else {
+ StringBuilder sb = new StringBuilder();
+ sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
+ if (id.length != ID_LENGTH) {
+ sb.append(" (INVALID FORMAT)");
+ }
+ return sb.toString();
}
-
- assert id.length() <= 32: "id=" + id;
- id = idPad.substring(id.length()) + id;
-
- return id;
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index ccf69cdfd8a..0de4e4e4829 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -2767,11 +2767,13 @@ public class TestIndexWriter extends LuceneTestCase {
SegmentInfos sis = new SegmentInfos();
sis.read(d);
- String id1 = sis.getId();
+ byte[] id1 = sis.getId();
assertNotNull(id1);
+ assertEquals(StringHelper.ID_LENGTH, id1.length);
- String id2 = sis.info(0).info.getId();
+ byte[] id2 = sis.info(0).info.getId();
assertNotNull(id2);
+ assertEquals(StringHelper.ID_LENGTH, id2.length);
// Make sure CheckIndex includes id output:
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
@@ -2784,14 +2786,14 @@ public class TestIndexWriter extends LuceneTestCase {
assertTrue(s, indexStatus != null && indexStatus.clean);
// Commit id is always stored:
- assertTrue("missing id=" + id1 + " in:\n" + s, s.contains("id=" + id1));
+ assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1)));
- assertTrue("missing id=" + id2 + " in:\n" + s, s.contains("id=" + id2));
+ assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1)));
d.close();
Set ids = new HashSet<>();
for(int i=0;i<100000;i++) {
- String id = StringHelper.randomId();
+ String id = StringHelper.idToString(StringHelper.randomId());
assertFalse("id=" + id + " i=" + i, ids.contains(id));
ids.add(id);
}