From 88648b3a9c18b178b9a5bc9adf007d751ac9e293 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 26 Sep 2014 05:20:43 +0000 Subject: [PATCH] LUCENE-5969, LUCENE-5895: fix sign bit bugs in segment/commit IDs, use byte[] representation git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5969@1627714 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene46/Lucene46SegmentInfoFormat.java | 6 +- .../lucene46/Lucene46SegmentInfoReader.java | 9 +-- .../lucene46/Lucene46SegmentInfoWriter.java | 1 - .../SimpleTextSegmentInfoReader.java | 3 +- .../SimpleTextSegmentInfoWriter.java | 2 +- .../org/apache/lucene/codecs/CodecUtil.java | 43 ++++++++---- .../lucene50/Lucene50SegmentInfoReader.java | 4 +- .../lucene50/Lucene50SegmentInfoWriter.java | 3 +- .../org/apache/lucene/index/CheckIndex.java | 5 +- .../org/apache/lucene/index/SegmentInfo.java | 12 ++-- .../org/apache/lucene/index/SegmentInfos.java | 28 ++++---- .../org/apache/lucene/util/StringHelper.java | 69 ++++++++++++++----- .../apache/lucene/index/TestIndexWriter.java | 12 ++-- 13 files changed, 126 insertions(+), 71 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java index a8d3da9560d..a8e4586779c 100755 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoFormat.java @@ -31,7 +31,7 @@ import org.apache.lucene.store.DataOutput; // javadocs *

* Files: *

*

* Data types: @@ -44,7 +44,6 @@ import org.apache.lucene.store.DataOutput; // javadocs *
  • Diagnostics --> {@link DataOutput#writeStringStringMap Map<String,String>}
  • *
  • IsCompoundFile --> {@link DataOutput#writeByte Int8}
  • *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter}
  • - *
  • Id --> {@link DataOutput#writeString String}
  • * *

    * Field Descriptions: @@ -88,6 +87,5 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat { static final String CODEC_NAME = "Lucene46SegmentInfo"; static final int VERSION_START = 0; static final int VERSION_CHECKSUM = 1; - static final int VERSION_ID = 2; - static final int VERSION_CURRENT = VERSION_ID; + static final int VERSION_CURRENT = VERSION_CHECKSUM; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoReader.java index 93fa55c0734..b205644272f 100755 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoReader.java @@ -65,13 +65,6 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader { final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; final Map diagnostics = input.readStringStringMap(); final Set files = input.readStringSet(); - - String id; - if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_ID) { - id = input.readString(); - } else { - id = null; - } if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_CHECKSUM) { CodecUtil.checkFooter(input); @@ -79,7 +72,7 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader { CodecUtil.checkEOF(input); } - final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id); + final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, null); si.setFiles(files); return si; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java index 10b2d5f4f15..a8db471154d 100755 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene46/Lucene46SegmentInfoWriter.java @@ -64,7 +64,6 @@ public class Lucene46SegmentInfoWriter extends SegmentInfoWriter { output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); output.writeStringStringMap(si.getDiagnostics()); output.writeStringSet(si.files()); - output.writeString(si.getId()); CodecUtil.writeFooter(output); success = true; } finally { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java index 6e1708420a1..4e0f7db3947 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoReader.java @@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.text.ParseException; +import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -109,7 +110,7 @@ public class SimpleTextSegmentInfoReader extends SegmentInfoReader { SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_ID); - final String id = readString(SI_ID.length, scratch); + final byte[] id = Arrays.copyOfRange(scratch.bytes(), SI_ID.length, scratch.length()); SimpleTextUtil.checkFooter(input); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java index 9daf4cbe8b1..0350acc8ac2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoWriter.java @@ -107,7 +107,7 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter { } SimpleTextUtil.write(output, SI_ID); - SimpleTextUtil.write(output, si.getId(), scratch); + SimpleTextUtil.write(output, new BytesRef(si.getId())); SimpleTextUtil.writeNewline(output); SimpleTextUtil.writeChecksum(output, scratch); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java index d46beed81f3..4c109721aa0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexFormatTooNewException; @@ -31,6 +32,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.StringHelper; /** * Utility class for reading and writing versioned headers. @@ -94,12 +96,12 @@ public final class CodecUtil { * Writes a codec header for a per-segment, which records both a string to * identify the file, a version number, and the unique ID of the segment. * This header can be parsed and validated with - * {@link #checkSegmentHeader(DataInput, String, int, int, String) checkSegmentHeader()}. + * {@link #checkSegmentHeader(DataInput, String, int, int, byte[]) checkSegmentHeader()}. *

    * CodecSegmentHeader --> CodecHeader,SegmentID *

      *
    • CodecHeader --> {@link #writeHeader} - *
    • SegmentID --> {@link DataOutput#writeString String}. + *
    • SegmentID --> {@link DataOutput#writeByte byte}16. * Unique identifier for the segment. *
    *

    @@ -113,13 +115,15 @@ public final class CodecUtil { * @param segmentID Unique identifier for the segment * @param version Version number * @throws IOException If there is an I/O error writing to the underlying medium. - * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length + * @throws IllegalArgumentException If the codec name is not simple ASCII, or + * is more than 127 characters in length, or if segmentID is invalid. */ - // nocommit: fix javadocs, add segmentLength() - public static void writeSegmentHeader(DataOutput out, String codec, int version, String segmentID) throws IOException { + public static void writeSegmentHeader(DataOutput out, String codec, int version, byte[] segmentID) throws IOException { + if (segmentID.length != StringHelper.ID_LENGTH) { + throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(segmentID)); + } writeHeader(out, codec, version); - // nocommit: improve encoding of this ID - out.writeString(segmentID); + out.writeBytes(segmentID, 0, segmentID.length); } /** @@ -132,6 +136,17 @@ public final class CodecUtil { public static int headerLength(String codec) { return 9+codec.length(); } + + /** + * Computes the length of a segment header. + * + * @param codec Codec name. + * @return length of the entire segment header. + * @see #writeSegmentHeader(DataOutput, String, int, byte[]) + */ + public static int segmentHeaderLength(String codec) { + return headerLength(codec) + StringHelper.ID_LENGTH; + } /** * Reads and validates a header previously written with @@ -192,7 +207,7 @@ public final class CodecUtil { /** * Reads and validates a header previously written with - * {@link #writeSegmentHeader(DataOutput, String, int, String)}. + * {@link #writeSegmentHeader(DataOutput, String, int, byte[])}. *

    * When reading a file, supply the expected codec, * expected version range (minVersion to maxVersion), @@ -219,13 +234,15 @@ public final class CodecUtil { * @throws IndexFormatTooNewException If the actual version is greater * than maxVersion. * @throws IOException If there is an I/O error reading from the underlying medium. - * @see #writeSegmentHeader(DataOutput, String, int, String) + * @see #writeSegmentHeader(DataOutput, String, int, byte[]) */ - public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, String segmentID) throws IOException { + public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] segmentID) throws IOException { int version = checkHeader(in, codec, minVersion, maxVersion); - String id = in.readString(); - if (!id.equals(segmentID)) { - throw new CorruptIndexException("file mismatch, expected segment id=" + segmentID + ", got=" + id, in); + byte id[] = new byte[StringHelper.ID_LENGTH]; + in.readBytes(id, 0, id.length); + if (!Arrays.equals(id, segmentID)) { + throw new CorruptIndexException("file mismatch, expected segment id=" + StringHelper.idToString(segmentID) + + ", got=" + StringHelper.idToString(id), in); } return version; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java index a6721c82d3a..21b6e1637d0 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoReader.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; /** @@ -69,7 +70,8 @@ public class Lucene50SegmentInfoReader extends SegmentInfoReader { final Map diagnostics = input.readStringStringMap(); final Set files = input.readStringSet(); - String id = input.readString(); + byte[] id = new byte[StringHelper.ID_LENGTH]; + input.readBytes(id, 0, id.length); si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id); si.setFiles(files); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java index 62341035f23..ab407d9221c 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoWriter.java @@ -64,7 +64,8 @@ public class Lucene50SegmentInfoWriter extends SegmentInfoWriter { output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); output.writeStringStringMap(si.getDiagnostics()); output.writeStringSet(si.files()); - output.writeString(si.getId()); + byte[] id = si.getId(); + output.writeBytes(id, 0, id.length); CodecUtil.writeFooter(output); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index db4a5a33657..59ab2b77773 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -47,6 +47,7 @@ import org.apache.lucene.util.CommandLineUtil; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LongBitSet; +import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; @@ -514,7 +515,7 @@ public class CheckIndex { } msg(infoStream, "Segments file=" + segmentsFileName + " numSegments=" + numSegments - + " " + versionString + " id=" + sis.getId() + " format=" + sFormat + userDataString); + + " " + versionString + " id=" + StringHelper.idToString(sis.getId()) + " format=" + sFormat + userDataString); if (onlySegments != null) { result.partial = true; @@ -565,7 +566,7 @@ public class CheckIndex { try { msg(infoStream, " version=" + (version == null ? "3.0" : version)); - msg(infoStream, " id=" + info.info.getId()); + msg(infoStream, " id=" + StringHelper.idToString(info.info.getId())); final Codec codec = info.info.getCodec(); msg(infoStream, " codec=" + codec); segInfoStat.codec = codec; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java index 2437f6d722b..2ae6f3b09e8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java @@ -18,6 +18,7 @@ package org.apache.lucene.index; */ +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; @@ -59,7 +60,7 @@ public final class SegmentInfo { private boolean isCompoundFile; /** Id that uniquely identifies this segment. */ - private final String id; + private final byte[] id; private Codec codec; @@ -89,7 +90,7 @@ public final class SegmentInfo { */ public SegmentInfo(Directory dir, Version version, String name, int docCount, boolean isCompoundFile, Codec codec, Map diagnostics, - String id) { + byte[] id) { assert !(dir instanceof TrackingDirectoryWrapper); this.dir = dir; this.version = version; @@ -99,6 +100,9 @@ public final class SegmentInfo { this.codec = codec; this.diagnostics = diagnostics; this.id = id; + if (id != null && id.length != StringHelper.ID_LENGTH) { + throw new IllegalArgumentException("invalid id: " + Arrays.toString(id)); + } } /** @@ -218,8 +222,8 @@ public final class SegmentInfo { } /** Return the id that uniquely identifies this segment. */ - public String getId() { - return id; + public byte[] getId() { + return id == null ? null : id.clone(); } private Set setFiles; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java index 70824c5f7f3..6f69e3a1e63 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java @@ -125,8 +125,8 @@ public final class SegmentInfos implements Cloneable, Iterable= VERSION_411) { - id = input.readString(); + if (format >= VERSION_50) { + id = new byte[StringHelper.ID_LENGTH]; + input.readBytes(id, 0, id.length); } if (format >= VERSION_48) { @@ -425,7 +426,7 @@ public final class SegmentInfos implements Cloneable, Iterable>> 17) ^ (s0 >>> 26); // b, c } + + // 64-bit unsigned mask + byte[] maskBytes64 = new byte[8]; + Arrays.fill(maskBytes64, (byte) 0xff); + BigInteger mask64 = new BigInteger(1, maskBytes64); // First make unsigned versions of x0, x1: - BigInteger unsignedX0 = new BigInteger(1, BigInteger.valueOf(x0).toByteArray()); - BigInteger unsignedX1 = new BigInteger(1, BigInteger.valueOf(x1).toByteArray()); + BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64); + BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64); // Concatentate bits of x0 and x1, as unsigned 128 bit integer: nextId = unsignedX0.shiftLeft(64).or(unsignedX1); } + + /** length in bytes of an ID */ + public static final int ID_LENGTH = 16; /** Generates a non-cryptographic globally unique id. */ - public static String randomId() { + public static byte[] randomId() { // NOTE: we don't use Java's UUID.randomUUID() implementation here because: // @@ -306,15 +314,42 @@ public abstract class StringHelper { // what impact that has on the period, whereas the simple ++ (mod 2^128) // we use here is guaranteed to have the full period. - String id; + byte bits[]; synchronized(idLock) { - id = nextId.toString(16); - nextId = nextId.add(BigInteger.ONE).and(idMask); + bits = nextId.toByteArray(); + nextId = nextId.add(BigInteger.ONE).and(mask128); + } + + // toByteArray() always returns a sign bit, so it may require an extra byte (always zero) + if (bits.length > ID_LENGTH) { + assert bits.length == ID_LENGTH + 1; + assert bits[0] == 0; + return Arrays.copyOfRange(bits, 1, bits.length); + } else { + byte[] result = new byte[ID_LENGTH]; + System.arraycopy(bits, 0, result, result.length - bits.length, bits.length); + return result; + } + } + + /** + * Helper method to render an ID as a string, for debugging + *

    + * Returns the string {@code (null)} if the id is null. + * Otherwise, returns a string representation for debugging. + * Never throws an exception. The returned string may + * indicate if the id is definitely invalid. + */ + public static String idToString(byte id[]) { + if (id == null) { + return "(null)"; + } else { + StringBuilder sb = new StringBuilder(); + sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX)); + if (id.length != ID_LENGTH) { + sb.append(" (INVALID FORMAT)"); + } + return sb.toString(); } - - assert id.length() <= 32: "id=" + id; - id = idPad.substring(id.length()) + id; - - return id; } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index ccf69cdfd8a..0de4e4e4829 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -2767,11 +2767,13 @@ public class TestIndexWriter extends LuceneTestCase { SegmentInfos sis = new SegmentInfos(); sis.read(d); - String id1 = sis.getId(); + byte[] id1 = sis.getId(); assertNotNull(id1); + assertEquals(StringHelper.ID_LENGTH, id1.length); - String id2 = sis.info(0).info.getId(); + byte[] id2 = sis.info(0).info.getId(); assertNotNull(id2); + assertEquals(StringHelper.ID_LENGTH, id2.length); // Make sure CheckIndex includes id output: ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); @@ -2784,14 +2786,14 @@ public class TestIndexWriter extends LuceneTestCase { assertTrue(s, indexStatus != null && indexStatus.clean); // Commit id is always stored: - assertTrue("missing id=" + id1 + " in:\n" + s, s.contains("id=" + id1)); + assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1))); - assertTrue("missing id=" + id2 + " in:\n" + s, s.contains("id=" + id2)); + assertTrue("missing id=" + StringHelper.idToString(id1) + " in:\n" + s, s.contains("id=" + StringHelper.idToString(id1))); d.close(); Set ids = new HashSet<>(); for(int i=0;i<100000;i++) { - String id = StringHelper.randomId(); + String id = StringHelper.idToString(StringHelper.randomId()); assertFalse("id=" + id + " i=" + i, ids.contains(id)); ids.add(id); }