From afee9af13f2627c1e9fb3498d826d7a227c60ea7 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Fri, 26 Sep 2014 02:22:18 +0000
Subject: [PATCH] LUCENE-5969: take bitvector out back and shoot it
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5969@1627701 13f79535-47bb-0310-9956-ffa450edef68
---
.../lucene/codecs/lucene40/BitVector.java | 0
.../lucene40/Lucene40LiveDocsFormat.java | 0
.../org/apache/lucene/codecs/CodecUtil.java | 79 +++++++++++-
.../lucene/codecs/lucene40/package.html | 25 ----
.../lucene/codecs/lucene50/Lucene50Codec.java | 3 +-
.../lucene50/Lucene50FieldInfosFormat.java | 4 +-
.../lucene50/Lucene50FieldInfosReader.java | 9 +-
.../lucene50/Lucene50FieldInfosWriter.java | 3 +-
.../lucene50/Lucene50LiveDocsFormat.java | 115 ++++++++++++++++++
.../org/apache/lucene/util/FixedBitSet.java | 2 +-
.../lucene/index/TestIndexFileDeleter.java | 2 +-
11 files changed, 197 insertions(+), 45 deletions(-)
rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene40/BitVector.java (100%)
rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java (100%)
delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene40/package.html
create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/BitVector.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/BitVector.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
index 40e9214c015..d46beed81f3 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
@@ -80,8 +80,7 @@ public final class CodecUtil {
* @throws IOException If there is an I/O error writing to the underlying medium.
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
*/
- public static void writeHeader(DataOutput out, String codec, int version)
- throws IOException {
+ public static void writeHeader(DataOutput out, String codec, int version) throws IOException {
BytesRef bytes = new BytesRef(codec);
if (bytes.length != codec.length() || bytes.length >= 128) {
throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
@@ -90,6 +89,38 @@ public final class CodecUtil {
out.writeString(codec);
out.writeInt(version);
}
+
+ /**
+ * Writes a codec header for a per-segment file, which records a string to
+ * identify the file, a version number, and the unique ID of the segment.
+ * This header can be parsed and validated with
+ * {@link #checkSegmentHeader(DataInput, String, int, int, String) checkSegmentHeader()}.
+ *
+ * CodecSegmentHeader --> CodecHeader,SegmentID
+ *
+ * - CodecHeader --> {@link #writeHeader}
+ *
+ * - SegmentID --> {@link DataOutput#writeString String}.
+ * Unique identifier for the segment.
+ *
+ *
+ * Note that the length of a segment header depends upon both the
+ * name of the codec and the segment ID, so it cannot be computed
+ * with {@link #headerLength(String)} alone.
+ *
+ * @param out Output stream
+ * @param codec String to identify this file. It should be simple ASCII,
+ * less than 128 characters in length.
+ * @param segmentID Unique identifier for the segment
+ * @param version Version number
+ * @throws IOException If there is an I/O error writing to the underlying medium.
+ * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
+ */
+ // nocommit: fix javadocs, add segmentLength()
+ public static void writeSegmentHeader(DataOutput out, String codec, int version, String segmentID) throws IOException {
+ // The plain codec header goes first; writeHeader also validates the codec name.
+ writeHeader(out, codec, version);
+ // nocommit: improve encoding of this ID
+ // The ID is appended as a string; checkSegmentHeader reads it back with readString()
+ // immediately after the codec header, so the order of these two writes must not change.
+ out.writeString(segmentID);
+ }
/**
* Computes the length of a codec header.
@@ -129,9 +160,7 @@ public final class CodecUtil {
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeHeader(DataOutput, String, int)
*/
- public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
- throws IOException {
-
+ public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
// Safety to guard against reading a bogus string:
final int actualHeader = in.readInt();
if (actualHeader != CODEC_MAGIC) {
@@ -161,6 +190,46 @@ public final class CodecUtil {
return actualVersion;
}
+ /**
+ * Reads and validates a header previously written with
+ * {@link #writeSegmentHeader(DataOutput, String, int, String)}.
+ *
+ * When reading a file, supply the expected {@code codec},
+ * expected version range ({@code minVersion} to {@code maxVersion}),
+ * and segment ID.
+ *
+ * @param in Input stream, positioned at the point where the
+ * header was previously written. Typically this is located
+ * at the beginning of the file.
+ * @param codec The expected codec name.
+ * @param minVersion The minimum supported expected version number.
+ * @param maxVersion The maximum supported expected version number.
+ * @param segmentID The expected segment this file belongs to.
+ * @return The actual version found, when a valid header is found
+ * that matches {@code codec}, with an actual version
+ * where {@code minVersion <= actual <= maxVersion},
+ * and matching {@code segmentID}.
+ * Otherwise an exception is thrown.
+ * @throws CorruptIndexException If the first four bytes are not
+ * {@link #CODEC_MAGIC}, or if the actual codec found is
+ * not {@code codec}, or if the {@code segmentID}
+ * does not match.
+ * @throws IndexFormatTooOldException If the actual version is less
+ * than {@code minVersion}.
+ * @throws IndexFormatTooNewException If the actual version is greater
+ * than {@code maxVersion}.
+ * @throws IOException If there is an I/O error reading from the underlying medium.
+ * @see #writeSegmentHeader(DataOutput, String, int, String)
+ */
+ public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, String segmentID) throws IOException {
+   // Validate the leading codec header first; this yields the version found in the file.
+   final int actualVersion = checkHeader(in, codec, minVersion, maxVersion);
+   // The segment ID string is stored directly after the codec header.
+   final String actualID = in.readString();
+   if (actualID.equals(segmentID)) {
+     return actualVersion;
+   }
+   // Any other ID means this file belongs to a different segment.
+   throw new CorruptIndexException("file mismatch, expected segment id=" + segmentID + ", got=" + actualID, in);
+ }
+
/**
* Writes a codec footer, which records both a checksum
* algorithm ID and a checksum. This footer can
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/package.html
deleted file mode 100644
index 7959cc0f464..00000000000
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/package.html
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-
-
-
-
-Lucene 4.0 file format.
-
-
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
index fdef7d68888..f2c78a917fb 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
@@ -27,7 +27,6 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
import org.apache.lucene.codecs.lucene49.Lucene49NormsFormat;
@@ -49,7 +48,7 @@ public class Lucene50Codec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
- private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
index 2d9f5ec0a82..18fb60984e0 100755
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
@@ -36,8 +36,7 @@ import org.apache.lucene.store.DataOutput;
* FieldBits,DocValuesBits,DocValuesGen,Attributes> FieldsCount,Footer
* Data types:
*
- * - Header --> {@link CodecUtil#checkHeader CodecHeader}
- * - SegmentID --> {@link DataOutput#writeString String}
+ * - Header --> {@link CodecUtil#checkSegmentHeader SegmentHeader}
* - FieldsCount --> {@link DataOutput#writeVInt VInt}
* - FieldName --> {@link DataOutput#writeString String}
* - FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}
@@ -49,7 +48,6 @@ import org.apache.lucene.store.DataOutput;
*
* Field Descriptions:
*
- * - SegmentID: {@link SegmentInfo#getId()} this file belongs to
* - FieldsCount: the number of fields in this file.
* - FieldName: name of the field as a UTF-8 String.
* - FieldNumber: the field's number. Note that unlike previous versions of
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosReader.java
index a842289c5d1..f759e27aa05 100755
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosReader.java
@@ -54,13 +54,10 @@ final class Lucene50FieldInfosReader extends FieldInfosReader {
Throwable priorE = null;
FieldInfo infos[] = null;
try {
- CodecUtil.checkHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
+ CodecUtil.checkSegmentHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
Lucene50FieldInfosFormat.FORMAT_START,
- Lucene50FieldInfosFormat.FORMAT_CURRENT);
- String id = input.readString();
- if (!id.equals(segmentInfo.getId())) {
- throw new CorruptIndexException("file mismatch, expected segment id=" + segmentInfo.getId() + ", got=" + id, input);
- }
+ Lucene50FieldInfosFormat.FORMAT_CURRENT,
+ segmentInfo.getId());
final int size = input.readVInt(); //read in the size
infos = new FieldInfo[size];
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosWriter.java
index 4ff42f4fc93..c3dcbafdbe0 100755
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosWriter.java
@@ -47,8 +47,7 @@ final class Lucene50FieldInfosWriter extends FieldInfosWriter {
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
try (IndexOutput output = directory.createOutput(fileName, context)) {
- CodecUtil.writeHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT);
- output.writeString(segmentInfo.getId());
+ CodecUtil.writeSegmentHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId());
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
fi.checkConsistency();
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java
new file mode 100644
index 00000000000..612bb9b1779
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java
@@ -0,0 +1,115 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentCommitInfo;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.MutableBits;
+
+/**
+ * Lucene 5.0 live docs format
+ *
+ *
+ * The .liv file is optional, and only exists when a segment contains
+ * deletions.
+ * Although per-segment, this file is maintained exterior to compound segment
+ * files.
+ * Deletions (.liv) --> SegmentHeader,Bits
+ *
+ * - SegmentHeader --> {@link CodecUtil#writeSegmentHeader SegmentHeader}
+ * - Bits --> <{@link DataOutput#writeLong Int64}> LongCount
+ *
+ */
+public class Lucene50LiveDocsFormat extends LiveDocsFormat {
+
+  /** extension of live docs */
+  private static final String EXTENSION = "liv";
+
+  /** codec of live docs */
+  private static final String CODEC_NAME = "Lucene50LiveDocs";
+
+  /** supported version range */
+  private static final int VERSION_START = 0;
+  private static final int VERSION_CURRENT = VERSION_START;
+
+  /**
+   * Creates live docs for {@code size} documents with every bit initially set.
+   *
+   * @param size number of bits in the returned set
+   * @return a {@link FixedBitSet} whose bits {@code [0, size)} are all set
+   */
+  @Override
+  public MutableBits newLiveDocs(int size) throws IOException {
+    FixedBitSet bits = new FixedBitSet(size);
+    bits.set(0, size);
+    return bits;
+  }
+
+  /**
+   * Returns a mutable copy of {@code existing}.
+   *
+   * @param existing live docs to copy; must be a {@link FixedBitSet}, which is
+   *        the only implementation this format creates or reads
+   * @return a clone of {@code existing}
+   * @throws ClassCastException if {@code existing} is not a {@link FixedBitSet}
+   */
+  @Override
+  public MutableBits newLiveDocs(Bits existing) throws IOException {
+    FixedBitSet fbs = (FixedBitSet) existing;
+    return fbs.clone();
+  }
+
+  /**
+   * Reads the live docs file of the segment's current deletion generation.
+   * The segment header and the footer checksum are verified, and the number of
+   * cleared bits is checked against the delete count recorded for the commit.
+   *
+   * @param dir directory holding the file
+   * @param info segment commit whose {@code getDelGen()} selects the file
+   * @param context I/O context used to open the file
+   * @return the bits read from the file, one per document of the segment
+   * @throws CorruptIndexException if the number of cleared bits differs from
+   *         {@code info.getDelCount()}, or if header/footer validation fails
+   * @throws IOException if there is a low-level I/O error
+   */
+  @Override
+  public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
+    String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen());
+    final int length = info.info.getDocCount();
+    try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
+      Throwable priorE = null;
+      try {
+        CodecUtil.checkSegmentHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, info.info.getId());
+        long[] data = new long[FixedBitSet.bits2words(length)];
+        for (int i = 0; i < data.length; i++) {
+          data[i] = input.readLong();
+        }
+        FixedBitSet fbs = new FixedBitSet(data, length);
+        // A cleared bit is a deleted document. A file that parses cleanly but
+        // disagrees with the commit's delete count must not be accepted silently.
+        int deleted = fbs.length() - fbs.cardinality();
+        if (deleted != info.getDelCount()) {
+          throw new CorruptIndexException("bits.deleted=" + deleted + " info.delcount=" + info.getDelCount(), input);
+        }
+        return fbs;
+      } catch (Throwable exception) {
+        // handed to checkFooter below, which receives it together with the input
+        priorE = exception;
+      } finally {
+        CodecUtil.checkFooter(input, priorE);
+      }
+    }
+    // reached only if checkFooter returns normally after a failure was recorded
+    throw new AssertionError();
+  }
+
+  /**
+   * Writes {@code bits} to the live docs file of the segment's next deletion
+   * generation: segment header, then every backing long, then the footer.
+   *
+   * @param bits live docs to write; must be a {@link FixedBitSet}
+   * @param dir directory to create the file in
+   * @param info segment commit whose {@code getNextDelGen()} names the file
+   * @param newDelCount not used by this implementation
+   * @param context I/O context used to create the file
+   * @throws IOException if there is a low-level I/O error
+   */
+  @Override
+  public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
+    String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getNextDelGen());
+    long[] data = ((FixedBitSet) bits).getBits();
+    try (IndexOutput output = dir.createOutput(name, context)) {
+      CodecUtil.writeSegmentHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId());
+      for (long word : data) {
+        output.writeLong(word);
+      }
+      CodecUtil.writeFooter(output);
+    }
+  }
+
+  /**
+   * Adds the current live docs file name to {@code files} when the segment
+   * has deletions; otherwise adds nothing.
+   *
+   * @param info segment commit to inspect
+   * @param files collection that receives the file name
+   */
+  @Override
+  public void files(SegmentCommitInfo info, Collection<String> files) throws IOException {
+    if (info.hasDeletions()) {
+      files.add(IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen()));
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
index cf69d3efe12..b107223de29 100644
--- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
+++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
@@ -31,7 +31,7 @@ import org.apache.lucene.search.DocIdSetIterator;
*
* @lucene.internal
*/
-public final class FixedBitSet extends DocIdSet implements Bits {
+public final class FixedBitSet extends DocIdSet implements MutableBits {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedBitSet.class);
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java
index fe20dc106a9..e3efa9cb62f 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexFileDeleter.java
@@ -96,7 +96,7 @@ public class TestIndexFileDeleter extends LuceneTestCase {
*/
// TODO: fix this test better
- String ext = Codec.getDefault().getName().equals("SimpleText") ? ".liv" : ".del";
+ String ext = ".liv";
// Create a bogus separate del file for a
// segment that already has a separate del file: