mirror of https://github.com/apache/lucene.git
LUCENE-5969: take bitvector out back and shoot it
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5969@1627701 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82a41114a7
commit
afee9af13f
|
@ -80,8 +80,7 @@ public final class CodecUtil {
|
||||||
* @throws IOException If there is an I/O error writing to the underlying medium.
|
* @throws IOException If there is an I/O error writing to the underlying medium.
|
||||||
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
|
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
|
||||||
*/
|
*/
|
||||||
public static void writeHeader(DataOutput out, String codec, int version)
|
public static void writeHeader(DataOutput out, String codec, int version) throws IOException {
|
||||||
throws IOException {
|
|
||||||
BytesRef bytes = new BytesRef(codec);
|
BytesRef bytes = new BytesRef(codec);
|
||||||
if (bytes.length != codec.length() || bytes.length >= 128) {
|
if (bytes.length != codec.length() || bytes.length >= 128) {
|
||||||
throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
|
throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
|
||||||
|
@ -91,6 +90,38 @@ public final class CodecUtil {
|
||||||
out.writeInt(version);
|
out.writeInt(version);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a codec header for a per-segment file, which records a string to
|
||||||
|
* identify the file, a version number, and the unique ID of the segment.
|
||||||
|
* This header can be parsed and validated with
|
||||||
|
* {@link #checkSegmentHeader(DataInput, String, int, int, String) checkSegmentHeader()}.
|
||||||
|
* <p>
|
||||||
|
* CodecSegmentHeader --> CodecHeader,SegmentID
|
||||||
|
* <ul>
|
||||||
|
* <li>CodecHeader --> {@link #writeHeader}
|
||||||
|
* <li>SegmentID --> {@link DataOutput#writeString String}.
|
||||||
|
* Unique identifier for the segment.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Note that the length of a segment header depends upon both the
|
||||||
|
* name of the codec and the segment ID, so it is not given by
|
||||||
|
* {@link #headerLength(String)} alone.
|
||||||
|
*
|
||||||
|
* @param out Output stream
|
||||||
|
* @param codec String to identify this file. It should be simple ASCII,
|
||||||
|
* less than 128 characters in length.
|
||||||
|
* @param segmentID Unique identifier for the segment
|
||||||
|
* @param version Version number
|
||||||
|
* @throws IOException If there is an I/O error writing to the underlying medium.
|
||||||
|
* @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
|
||||||
|
*/
|
||||||
|
// nocommit: fix javadocs, add segmentLength()
|
||||||
|
public static void writeSegmentHeader(DataOutput out, String codec, int version, String segmentID) throws IOException {
|
||||||
|
writeHeader(out, codec, version);
|
||||||
|
// nocommit: improve encoding of this ID
|
||||||
|
out.writeString(segmentID);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the length of a codec header.
|
* Computes the length of a codec header.
|
||||||
*
|
*
|
||||||
|
@ -129,9 +160,7 @@ public final class CodecUtil {
|
||||||
* @throws IOException If there is an I/O error reading from the underlying medium.
|
* @throws IOException If there is an I/O error reading from the underlying medium.
|
||||||
* @see #writeHeader(DataOutput, String, int)
|
* @see #writeHeader(DataOutput, String, int)
|
||||||
*/
|
*/
|
||||||
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
|
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
|
||||||
throws IOException {
|
|
||||||
|
|
||||||
// Safety to guard against reading a bogus string:
|
// Safety to guard against reading a bogus string:
|
||||||
final int actualHeader = in.readInt();
|
final int actualHeader = in.readInt();
|
||||||
if (actualHeader != CODEC_MAGIC) {
|
if (actualHeader != CODEC_MAGIC) {
|
||||||
|
@ -161,6 +190,46 @@ public final class CodecUtil {
|
||||||
return actualVersion;
|
return actualVersion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads and validates a header previously written with
|
||||||
|
* {@link #writeSegmentHeader(DataOutput, String, int, String)}.
|
||||||
|
* <p>
|
||||||
|
* When reading a file, supply the expected <code>codec</code>,
|
||||||
|
* expected version range (<code>minVersion to maxVersion</code>),
|
||||||
|
* and segment ID.
|
||||||
|
*
|
||||||
|
* @param in Input stream, positioned at the point where the
|
||||||
|
* header was previously written. Typically this is located
|
||||||
|
* at the beginning of the file.
|
||||||
|
* @param codec The expected codec name.
|
||||||
|
* @param minVersion The minimum supported expected version number.
|
||||||
|
* @param maxVersion The maximum supported expected version number.
|
||||||
|
* @param segmentID The expected segment this file belongs to.
|
||||||
|
* @return The actual version found, when a valid header is found
|
||||||
|
* that matches <code>codec</code>, with an actual version
|
||||||
|
* where <code>minVersion <= actual <= maxVersion</code>,
|
||||||
|
* and matching <code>segmentID</code>.
|
||||||
|
* Otherwise an exception is thrown.
|
||||||
|
* @throws CorruptIndexException If the first four bytes are not
|
||||||
|
* {@link #CODEC_MAGIC}, or if the actual codec found is
|
||||||
|
* not <code>codec</code>, or if the <code>segmentID</code>
|
||||||
|
* does not match.
|
||||||
|
* @throws IndexFormatTooOldException If the actual version is less
|
||||||
|
* than <code>minVersion</code>.
|
||||||
|
* @throws IndexFormatTooNewException If the actual version is greater
|
||||||
|
* than <code>maxVersion</code>.
|
||||||
|
* @throws IOException If there is an I/O error reading from the underlying medium.
|
||||||
|
* @see #writeSegmentHeader(DataOutput, String, int, String)
|
||||||
|
*/
|
||||||
|
public static int checkSegmentHeader(DataInput in, String codec, int minVersion, int maxVersion, String segmentID) throws IOException {
|
||||||
|
int version = checkHeader(in, codec, minVersion, maxVersion);
|
||||||
|
String id = in.readString();
|
||||||
|
if (!id.equals(segmentID)) {
|
||||||
|
throw new CorruptIndexException("file mismatch, expected segment id=" + segmentID + ", got=" + id, in);
|
||||||
|
}
|
||||||
|
return version;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes a codec footer, which records both a checksum
|
* Writes a codec footer, which records both a checksum
|
||||||
* algorithm ID and a checksum. This footer can
|
* algorithm ID and a checksum. This footer can
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
|
||||||
<!--
|
|
||||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
contributor license agreements. See the NOTICE file distributed with
|
|
||||||
this work for additional information regarding copyright ownership.
|
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
(the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
-->
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
Lucene 4.0 file format.
|
|
||||||
</body>
|
|
||||||
</html>
|
|
|
@ -27,7 +27,6 @@ import org.apache.lucene.codecs.PostingsFormat;
|
||||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
|
|
||||||
import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
|
import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
|
||||||
import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
|
import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene49.Lucene49NormsFormat;
|
import org.apache.lucene.codecs.lucene49.Lucene49NormsFormat;
|
||||||
|
@ -49,7 +48,7 @@ public class Lucene50Codec extends Codec {
|
||||||
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
|
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
|
||||||
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
|
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
|
||||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
||||||
private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();
|
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
|
||||||
|
|
||||||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -36,8 +36,7 @@ import org.apache.lucene.store.DataOutput;
|
||||||
* FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer</p>
|
* FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer</p>
|
||||||
* <p>Data types:
|
* <p>Data types:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>Header --> {@link CodecUtil#checkHeader CodecHeader}</li>
|
* <li>Header --> {@link CodecUtil#checkSegmentHeader SegmentHeader}</li>
|
||||||
* <li>SegmentID --> {@link DataOutput#writeString String}</li>
|
|
||||||
* <li>FieldsCount --> {@link DataOutput#writeVInt VInt}</li>
|
* <li>FieldsCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||||
* <li>FieldName --> {@link DataOutput#writeString String}</li>
|
* <li>FieldName --> {@link DataOutput#writeString String}</li>
|
||||||
* <li>FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}</li>
|
* <li>FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}</li>
|
||||||
|
@ -49,7 +48,6 @@ import org.apache.lucene.store.DataOutput;
|
||||||
* </p>
|
* </p>
|
||||||
* Field Descriptions:
|
* Field Descriptions:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>SegmentID: {@link SegmentInfo#getId()} this file belongs to</li>
|
|
||||||
* <li>FieldsCount: the number of fields in this file.</li>
|
* <li>FieldsCount: the number of fields in this file.</li>
|
||||||
* <li>FieldName: name of the field as a UTF-8 String.</li>
|
* <li>FieldName: name of the field as a UTF-8 String.</li>
|
||||||
* <li>FieldNumber: the field's number. Note that unlike previous versions of
|
* <li>FieldNumber: the field's number. Note that unlike previous versions of
|
||||||
|
|
|
@ -54,13 +54,10 @@ final class Lucene50FieldInfosReader extends FieldInfosReader {
|
||||||
Throwable priorE = null;
|
Throwable priorE = null;
|
||||||
FieldInfo infos[] = null;
|
FieldInfo infos[] = null;
|
||||||
try {
|
try {
|
||||||
CodecUtil.checkHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
|
CodecUtil.checkSegmentHeader(input, Lucene50FieldInfosFormat.CODEC_NAME,
|
||||||
Lucene50FieldInfosFormat.FORMAT_START,
|
Lucene50FieldInfosFormat.FORMAT_START,
|
||||||
Lucene50FieldInfosFormat.FORMAT_CURRENT);
|
Lucene50FieldInfosFormat.FORMAT_CURRENT,
|
||||||
String id = input.readString();
|
segmentInfo.getId());
|
||||||
if (!id.equals(segmentInfo.getId())) {
|
|
||||||
throw new CorruptIndexException("file mismatch, expected segment id=" + segmentInfo.getId() + ", got=" + id, input);
|
|
||||||
}
|
|
||||||
|
|
||||||
final int size = input.readVInt(); //read in the size
|
final int size = input.readVInt(); //read in the size
|
||||||
infos = new FieldInfo[size];
|
infos = new FieldInfo[size];
|
||||||
|
|
|
@ -47,8 +47,7 @@ final class Lucene50FieldInfosWriter extends FieldInfosWriter {
|
||||||
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
|
public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
|
||||||
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
|
final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene50FieldInfosFormat.EXTENSION);
|
||||||
try (IndexOutput output = directory.createOutput(fileName, context)) {
|
try (IndexOutput output = directory.createOutput(fileName, context)) {
|
||||||
CodecUtil.writeHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT);
|
CodecUtil.writeSegmentHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId());
|
||||||
output.writeString(segmentInfo.getId());
|
|
||||||
output.writeVInt(infos.size());
|
output.writeVInt(infos.size());
|
||||||
for (FieldInfo fi : infos) {
|
for (FieldInfo fi : infos) {
|
||||||
fi.checkConsistency();
|
fi.checkConsistency();
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
package org.apache.lucene.codecs.lucene50;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.SegmentCommitInfo;
|
||||||
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.MutableBits;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lucene 5.0 live docs format
|
||||||
|
*
|
||||||
|
* <p>The .liv file is optional, and only exists when a segment contains
|
||||||
|
* deletions.</p>
|
||||||
|
* <p>Although per-segment, this file is maintained exterior to compound segment
|
||||||
|
* files.</p>
|
||||||
|
* <p>Deletions (.liv) --> SegmentHeader,Bits,Footer</p>
|
||||||
|
* <ul>
|
||||||
|
* <li>SegmentHeader --> {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
|
||||||
|
* <li>Bits --> <{@link DataOutput#writeLong Int64}> <sup>LongCount</sup></li>
|
||||||
|
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
|
public class Lucene50LiveDocsFormat extends LiveDocsFormat {
|
||||||
|
|
||||||
|
/** extension of live docs */
|
||||||
|
private static final String EXTENSION = "liv";
|
||||||
|
|
||||||
|
/** codec of live docs */
|
||||||
|
private static final String CODEC_NAME = "Lucene50LiveDocs";
|
||||||
|
|
||||||
|
/** supported version range */
|
||||||
|
private static final int VERSION_START = 0;
|
||||||
|
private static final int VERSION_CURRENT = VERSION_START;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MutableBits newLiveDocs(int size) throws IOException {
|
||||||
|
FixedBitSet bits = new FixedBitSet(size);
|
||||||
|
bits.set(0, size);
|
||||||
|
return bits;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MutableBits newLiveDocs(Bits existing) throws IOException {
|
||||||
|
FixedBitSet fbs = (FixedBitSet) existing;
|
||||||
|
return fbs.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
|
||||||
|
String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen());
|
||||||
|
final int length = info.info.getDocCount();
|
||||||
|
try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
|
||||||
|
Throwable priorE = null;
|
||||||
|
try {
|
||||||
|
CodecUtil.checkSegmentHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, info.info.getId());
|
||||||
|
long data[] = new long[FixedBitSet.bits2words(length)];
|
||||||
|
for (int i = 0; i < data.length; i++) {
|
||||||
|
data[i] = input.readLong();
|
||||||
|
}
|
||||||
|
return new FixedBitSet(data, length);
|
||||||
|
} catch (Throwable exception) {
|
||||||
|
priorE = exception;
|
||||||
|
} finally {
|
||||||
|
CodecUtil.checkFooter(input, priorE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
|
||||||
|
String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getNextDelGen());
|
||||||
|
long data[] = ((FixedBitSet) bits).getBits();
|
||||||
|
try (IndexOutput output = dir.createOutput(name, context)) {
|
||||||
|
CodecUtil.writeSegmentHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId());
|
||||||
|
for (int i = 0; i < data.length; i++) {
|
||||||
|
output.writeLong(data[i]);
|
||||||
|
}
|
||||||
|
CodecUtil.writeFooter(output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void files(SegmentCommitInfo info, Collection<String> files) throws IOException {
|
||||||
|
if (info.hasDeletions()) {
|
||||||
|
files.add(IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, info.getDelGen()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -31,7 +31,7 @@ import org.apache.lucene.search.DocIdSetIterator;
|
||||||
*
|
*
|
||||||
* @lucene.internal
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public final class FixedBitSet extends DocIdSet implements Bits {
|
public final class FixedBitSet extends DocIdSet implements MutableBits {
|
||||||
|
|
||||||
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedBitSet.class);
|
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedBitSet.class);
|
||||||
|
|
||||||
|
|
|
@ -96,7 +96,7 @@ public class TestIndexFileDeleter extends LuceneTestCase {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// TODO: fix this test better
|
// TODO: fix this test better
|
||||||
String ext = Codec.getDefault().getName().equals("SimpleText") ? ".liv" : ".del";
|
String ext = ".liv";
|
||||||
|
|
||||||
// Create a bogus separate del file for a
|
// Create a bogus separate del file for a
|
||||||
// segment that already has a separate del file:
|
// segment that already has a separate del file:
|
||||||
|
|
Loading…
Reference in New Issue