LUCENE-4051: Use Codec File Headers for Lucene40 StoredFields, DocValues, Norms & TermVectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1341768 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2012-05-23 07:51:56 +00:00
parent 8c037c2115
commit 88b483cbbd
13 changed files with 143 additions and 137 deletions

View File: Lucene40StoredFieldsFormat.java

@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+ import org.apache.lucene.util.CodecUtil;
/**
* Lucene 4.0 Stored Fields Format.
@@ -42,7 +43,8 @@ import org.apache.lucene.store.IOContext;
* <p>This contains, for each document, a pointer to its field data, as
* follows:</p>
* <ul>
- * <li>FieldIndex (.fdx) --&gt; &lt;FieldValuesPosition&gt; <sup>SegSize</sup></li>
+ * <li>FieldIndex (.fdx) --&gt; &lt;Header&gt;, &lt;FieldValuesPosition&gt; <sup>SegSize</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>FieldValuesPosition --&gt; {@link DataOutput#writeLong Uint64}</li>
* </ul>
* </li>
@@ -50,7 +52,8 @@ import org.apache.lucene.store.IOContext;
* <p><a name="field_data" id="field_data"></a>The field data, or <tt>.fdt</tt> file.</p>
* <p>This contains the stored fields of each document, as follows:</p>
* <ul>
- * <li>FieldData (.fdt) --&gt; &lt;DocFieldData&gt; <sup>SegSize</sup></li>
+ * <li>FieldData (.fdt) --&gt; &lt;Header&gt;, &lt;DocFieldData&gt; <sup>SegSize</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DocFieldData --&gt; FieldCount, &lt;FieldNum, Bits, Value&gt;
* <sup>FieldCount</sup></li>
* <li>FieldCount --&gt; {@link DataOutput#writeVInt VInt}</li>
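
Note: with this change both .fdx and .fdt open with the standard codec header written by CodecUtil. For orientation, a minimal standalone sketch of the round-trip (not part of this commit; RAMDirectory and the file name are for illustration only):

import org.apache.lucene.store.*;
import org.apache.lucene.util.CodecUtil;

public class CodecHeaderSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory(); // illustration only
    // writeHeader stores a magic int, the codec name string and a version int,
    // so CodecUtil.headerLength(name) is known before any data is written.
    IndexOutput out = dir.createOutput("_0.fdx", IOContext.DEFAULT); // hypothetical file
    CodecUtil.writeHeader(out, "Lucene40StoredFieldsIndex", 0);
    assert out.getFilePointer() == CodecUtil.headerLength("Lucene40StoredFieldsIndex");
    out.close();
    // checkHeader validates magic, codec name and version range, throwing
    // CorruptIndexException or IndexFormatTooOld/NewException on mismatch.
    IndexInput in = dir.openInput("_0.fdx", IOContext.DEFAULT);
    int version = CodecUtil.checkHeader(in, "Lucene40StoredFieldsIndex", 0, 0);
    assert version == 0;
    in.close();
  }
}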

View File: Lucene40StoredFieldsReader.java

@@ -30,11 +30,14 @@ import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
+ import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import java.io.Closeable;
import java.util.Set;
+ import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.*;
/**
* Class responsible for access to stored document fields.
* <p/>
@@ -44,8 +47,6 @@ import java.util.Set;
* @lucene.internal
*/
public final class Lucene40StoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable {
- private final static int FORMAT_SIZE = 4;
private final FieldInfos fieldInfos;
private final IndexInput fieldsStream;
private final IndexInput indexStream;
@@ -78,17 +79,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
boolean success = false;
fieldInfos = fn;
try {
- fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION), context);
- final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION);
+ fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
+ final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION);
indexStream = d.openInput(indexStreamFN, context);
- // its a 4.0 codec: so its not too-old, its corrupt.
- // TODO: change this to CodecUtil.checkHeader
- if (Lucene40StoredFieldsWriter.FORMAT_CURRENT != indexStream.readInt()) {
- throw new CorruptIndexException("unexpected fdx header: " + indexStream);
- }
- final long indexSize = indexStream.length() - FORMAT_SIZE;
+ CodecUtil.checkHeader(indexStream, CODEC_NAME_IDX, VERSION_START, VERSION_CURRENT);
+ CodecUtil.checkHeader(fieldsStream, CODEC_NAME_DAT, VERSION_START, VERSION_CURRENT);
+ assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
+ assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
+ final long indexSize = indexStream.length() - HEADER_LENGTH_IDX;
this.size = (int) (indexSize >> 3);
// Verify two sources of "maxDoc" agree:
if (this.size != si.docCount) {
@@ -135,7 +134,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
}
private void seekIndex(int docID) throws IOException {
- indexStream.seek(FORMAT_SIZE + docID * 8L);
+ indexStream.seek(HEADER_LENGTH_IDX + docID * 8L);
}
public final void visitDocument(int n, StoredFieldVisitor visitor) throws CorruptIndexException, IOException {
@@ -148,7 +147,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
int bits = fieldsStream.readByte() & 0xFF;
- assert bits <= (Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK | Lucene40StoredFieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);
+ assert bits <= (FIELD_IS_NUMERIC_MASK | FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);
switch(visitor.needsField(fieldInfo)) {
case YES:
@@ -164,19 +163,19 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
}
private void readField(StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
- final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK;
+ final int numeric = bits & FIELD_IS_NUMERIC_MASK;
if (numeric != 0) {
switch(numeric) {
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT:
+ case FIELD_IS_NUMERIC_INT:
visitor.intField(info, fieldsStream.readInt());
return;
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG:
+ case FIELD_IS_NUMERIC_LONG:
visitor.longField(info, fieldsStream.readLong());
return;
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT:
+ case FIELD_IS_NUMERIC_FLOAT:
visitor.floatField(info, Float.intBitsToFloat(fieldsStream.readInt()));
return;
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
+ case FIELD_IS_NUMERIC_DOUBLE:
visitor.doubleField(info, Double.longBitsToDouble(fieldsStream.readLong()));
return;
default:
@@ -186,7 +185,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
final int length = fieldsStream.readVInt();
byte bytes[] = new byte[length];
fieldsStream.readBytes(bytes, 0, length);
- if ((bits & Lucene40StoredFieldsWriter.FIELD_IS_BINARY) != 0) {
+ if ((bits & FIELD_IS_BINARY) != 0) {
visitor.binaryField(info, bytes, 0, bytes.length);
} else {
visitor.stringField(info, new String(bytes, 0, bytes.length, IOUtils.CHARSET_UTF_8));
@@ -195,15 +194,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
}
private void skipField(int bits) throws IOException {
- final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK;
+ final int numeric = bits & FIELD_IS_NUMERIC_MASK;
if (numeric != 0) {
switch(numeric) {
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT:
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT:
+ case FIELD_IS_NUMERIC_INT:
+ case FIELD_IS_NUMERIC_FLOAT:
fieldsStream.readInt();
return;
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG:
- case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
+ case FIELD_IS_NUMERIC_LONG:
+ case FIELD_IS_NUMERIC_DOUBLE:
fieldsStream.readLong();
return;
default:
@@ -242,7 +241,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
}
public static void files(SegmentInfo info, Set<String> files) throws IOException {
- files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION));
- files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_INDEX_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_EXTENSION));
}
}
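
With the header accounted for, the reader derives the segment's document count purely from the index file length: after the header, each document contributes exactly one 8-byte pointer, which is also what seekIndex relies on. A standalone sketch of the arithmetic, with hypothetical numbers:

import org.apache.lucene.util.CodecUtil;

public class FdxArithmeticSketch {
  public static void main(String[] args) {
    long header = CodecUtil.headerLength("Lucene40StoredFieldsIndex");
    long fdxLength = header + 5 * 8;               // hypothetical .fdx holding 5 documents
    int size = (int) ((fdxLength - header) >> 3);  // one UInt64 per doc -> 5
    long doc3Pointer = header + 3 * 8L;            // same layout drives seekIndex(3)
    System.out.println(size + " docs; doc 3 entry at byte " + doc3Pointer);
  }
}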

View File: Lucene40StoredFieldsWriter.java

@@ -34,6 +34,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
/**
@@ -62,16 +63,14 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {
// currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT;
// currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT;
- // (Happens to be the same as for now) Lucene 3.2: NumericFields are stored in binary format
- static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3;
+ static final String CODEC_NAME_IDX = "Lucene40StoredFieldsIndex";
+ static final String CODEC_NAME_DAT = "Lucene40StoredFieldsData";
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+ static final long HEADER_LENGTH_IDX = CodecUtil.headerLength(CODEC_NAME_IDX);
+ static final long HEADER_LENGTH_DAT = CodecUtil.headerLength(CODEC_NAME_DAT);
- // NOTE: if you introduce a new format, make it 1 higher
- // than the current one, and always change this if you
- // switch to a new format!
- static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS;
- // when removing support for old versions, leave the last supported version here
- static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_2_NUMERIC_FIELDS;
/** Extension of stored fields file */
public static final String FIELDS_EXTENSION = "fdt";
@@ -94,9 +93,10 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {
fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context);
- fieldsStream.writeInt(FORMAT_CURRENT);
- indexStream.writeInt(FORMAT_CURRENT);
+ CodecUtil.writeHeader(fieldsStream, CODEC_NAME_DAT, VERSION_CURRENT);
+ CodecUtil.writeHeader(indexStream, CODEC_NAME_IDX, VERSION_CURRENT);
+ assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
+ assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
success = true;
} finally {
if (!success) {
@@ -209,7 +209,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {
@Override
public void finish(int numDocs) throws IOException {
- if (4+((long) numDocs)*8 != indexStream.getFilePointer())
+ if (HEADER_LENGTH_IDX+((long) numDocs)*8 != indexStream.getFilePointer())
// This is most likely a bug in Sun JRE 1.6.0_04/_05;
// we detect that the bug has struck, here, and
// throw an exception to prevent the corruption from
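
The finish() guard re-derives the same invariant from the writer side: after numDocs documents, the .fdx file pointer must sit exactly at the header length plus numDocs * 8. A sketch with a hypothetical document count:

import org.apache.lucene.util.CodecUtil;

public class FdxFinishSketch {
  public static void main(String[] args) {
    int numDocs = 42; // hypothetical
    long expected = CodecUtil.headerLength("Lucene40StoredFieldsIndex") + (long) numDocs * 8;
    // finish(numDocs) throws if indexStream.getFilePointer() differs from this,
    // catching the doc-skipping JRE bug referenced in the comment above.
    System.out.println("expected .fdx pointer after " + numDocs + " docs: " + expected);
  }
}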

View File: Lucene40TermVectorsFormat.java

@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+ import org.apache.lucene.util.CodecUtil;
/**
* Lucene 4.0 Term Vectors format.
@@ -38,10 +39,10 @@ import org.apache.lucene.store.IOContext;
* <p>The Document Index or .tvx file.</p>
* <p>For each document, this stores the offset into the document data (.tvd) and
* field data (.tvf) files.</p>
- * <p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
+ * <p>DocumentIndex (.tvx) --&gt; Header,&lt;DocumentPosition,FieldPosition&gt;
* <sup>NumDocs</sup></p>
* <ul>
- * <li>TVXVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DocumentPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
* <li>FieldPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
* </ul>
@@ -53,10 +54,10 @@ import org.apache.lucene.store.IOContext;
* in the .tvf (Term Vector Fields) file.</p>
* <p>The .tvd file is used to map out the fields that have term vectors stored
* and where the field information is in the .tvf file.</p>
- * <p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
+ * <p>Document (.tvd) --&gt; Header,&lt;NumFields, FieldNums,
* FieldPositions&gt; <sup>NumDocs</sup></p>
* <ul>
- * <li>TVDVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>NumFields --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
* <li>FieldNumDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
@@ -69,10 +70,10 @@ import org.apache.lucene.store.IOContext;
* <p>This file contains, for each field that has a term vector stored, a list of
* the terms, their frequencies and, optionally, position and offset
* information.</p>
- * <p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
+ * <p>Field (.tvf) --&gt; Header,&lt;NumTerms, Position/Offset, TermFreqs&gt;
* <sup>NumFields</sup></p>
* <ul>
- * <li>TVFVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Position/Offset --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
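
Each .tvx entry after the header is two UInt64s (a .tvd offset and a .tvf offset), i.e. 16 bytes per document, which is exactly the arithmetic the reader below uses for numTotalDocs and seekTvx. A standalone sketch with hypothetical numbers:

import org.apache.lucene.util.CodecUtil;

public class TvxArithmeticSketch {
  public static void main(String[] args) {
    long header = CodecUtil.headerLength("Lucene40TermVectorsIndex");
    long tvxLength = header + 7 * 16;                     // hypothetical segment with 7 docs
    int numTotalDocs = (int) ((tvxLength - header) >> 4); // two longs per doc -> 7
    long doc5Entry = header + 5 * 16L;                    // matches seekTvx(5)
    System.out.println(numTotalDocs + " docs; doc 5 entry at byte " + doc5Entry);
  }
}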

View File: Lucene40TermVectorsReader.java

@@ -33,8 +33,6 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
- import org.apache.lucene.index.IndexFormatTooNewException;
- import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@@ -43,8 +41,10 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
/**
* Lucene 4.0 Term Vectors reader.
* <p>
@@ -54,22 +54,6 @@ import org.apache.lucene.util.IOUtils;
*/
public class Lucene40TermVectorsReader extends TermVectorsReader {
- // NOTE: if you make a new format, it must be larger than
- // the current format
- // Changed strings to UTF8 with length-in-bytes not length-in-chars
- static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
- // NOTE: always change this if you switch to a new format!
- // whenever you add a new format, make it 1 larger (positive version logic)!
- static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
- // when removing support for old versions, leave the last supported version here
- static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;
- //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
- static final int FORMAT_SIZE = 4;
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
@@ -82,6 +66,17 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
/** Extension of vectors index file */
static final String VECTORS_INDEX_EXTENSION = "tvx";
+ static final String CODEC_NAME_FIELDS = "Lucene40TermVectorsFields";
+ static final String CODEC_NAME_DOCS = "Lucene40TermVectorsDocs";
+ static final String CODEC_NAME_INDEX = "Lucene40TermVectorsIndex";
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+ static final long HEADER_LENGTH_FIELDS = CodecUtil.headerLength(CODEC_NAME_FIELDS);
+ static final long HEADER_LENGTH_DOCS = CodecUtil.headerLength(CODEC_NAME_DOCS);
+ static final long HEADER_LENGTH_INDEX = CodecUtil.headerLength(CODEC_NAME_INDEX);
private FieldInfos fieldInfos;
@@ -91,17 +86,15 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private int size;
private int numTotalDocs;
- private final int format;
// used by clone
- Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) {
+ Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs) {
this.fieldInfos = fieldInfos;
this.tvx = tvx;
this.tvd = tvd;
this.tvf = tvf;
this.size = size;
this.numTotalDocs = numTotalDocs;
- this.format = format;
}
public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
@@ -114,18 +107,21 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
try {
String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
tvx = d.openInput(idxName, context);
- format = checkValidFormat(tvx);
+ final int tvxVersion = CodecUtil.checkHeader(tvx, CODEC_NAME_INDEX, VERSION_START, VERSION_CURRENT);
String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
tvd = d.openInput(fn, context);
- final int tvdFormat = checkValidFormat(tvd);
+ final int tvdVersion = CodecUtil.checkHeader(tvd, CODEC_NAME_DOCS, VERSION_START, VERSION_CURRENT);
fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
tvf = d.openInput(fn, context);
- final int tvfFormat = checkValidFormat(tvf);
+ final int tvfVersion = CodecUtil.checkHeader(tvf, CODEC_NAME_FIELDS, VERSION_START, VERSION_CURRENT);
+ assert HEADER_LENGTH_INDEX == tvx.getFilePointer();
+ assert HEADER_LENGTH_DOCS == tvd.getFilePointer();
+ assert HEADER_LENGTH_FIELDS == tvf.getFilePointer();
+ assert tvxVersion == tvdVersion;
+ assert tvxVersion == tvfVersion;
- assert format == tvdFormat;
- assert format == tvfFormat;
- numTotalDocs = (int) (tvx.length() >> 4);
+ numTotalDocs = (int) (tvx.length()-HEADER_LENGTH_INDEX >> 4);
this.size = numTotalDocs;
assert size == 0 || numTotalDocs == size;
@@ -156,13 +152,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
// Not private to avoid synthetic access$NNN methods
void seekTvx(final int docNum) throws IOException {
- tvx.seek(docNum * 16L + FORMAT_SIZE);
- }
- boolean canReadRawDocs() {
- // we can always read raw docs, unless the term vectors
- // didn't exist
- return format != 0;
+ tvx.seek(docNum * 16L + HEADER_LENGTH_INDEX);
}
/** Retrieve the length (in bytes) of the tvd and tvf
@@ -210,16 +200,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
}
}
- private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
- {
- int format = in.readInt();
- if (format < FORMAT_MINIMUM)
- throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
- if (format > FORMAT_CURRENT)
- throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
- return format;
- }
public void close() throws IOException {
IOUtils.close(tvx, tvd, tvf);
}
@@ -708,7 +688,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
cloneTvf = (IndexInput) tvf.clone();
}
- return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format);
+ return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs);
}
public static void files(SegmentInfo info, Set<String> files) throws IOException {

View File: Lucene40TermVectorsWriter.java

@@ -35,9 +35,13 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
+ import static org.apache.lucene.codecs.lucene40.Lucene40TermVectorsReader.*;
// TODO: make a new 4.0 TV format that encodes better
// - use startOffset (not endOffset) as base for delta on
// next startOffset because today for syns or ngrams or
@@ -58,6 +62,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
private final Directory directory;
private final String segment;
private IndexOutput tvx = null, tvd = null, tvf = null;
public Lucene40TermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
this.directory = directory;
@@ -66,11 +72,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
try {
// Open files for TermVector storage
tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_INDEX_EXTENSION), context);
- tvx.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT);
+ CodecUtil.writeHeader(tvx, CODEC_NAME_INDEX, VERSION_CURRENT);
tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context);
- tvd.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT);
+ CodecUtil.writeHeader(tvd, CODEC_NAME_DOCS, VERSION_CURRENT);
tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_FIELDS_EXTENSION), context);
- tvf.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT);
+ CodecUtil.writeHeader(tvf, CODEC_NAME_FIELDS, VERSION_CURRENT);
+ assert HEADER_LENGTH_INDEX == tvx.getFilePointer();
+ assert HEADER_LENGTH_DOCS == tvd.getFilePointer();
+ assert HEADER_LENGTH_FIELDS == tvf.getFilePointer();
success = true;
} finally {
if (!success) {
@@ -252,10 +261,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader();
if (vectorsReader != null && vectorsReader instanceof Lucene40TermVectorsReader) {
- // If the TV* files are an older format then they cannot read raw docs:
- if (((Lucene40TermVectorsReader)vectorsReader).canReadRawDocs()) {
matchingVectorsReader = (Lucene40TermVectorsReader) vectorsReader;
- }
}
}
if (reader.liveDocs != null) {
@@ -356,7 +362,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
@Override
public void finish(int numDocs) throws IOException {
- if (4+((long) numDocs)*16 != tvx.getFilePointer())
+ if (HEADER_LENGTH_INDEX+((long) numDocs)*16 != tvx.getFilePointer())
// This is most likely a bug in Sun JRE 1.6.0_04/_05;
// we detect that the bug has struck, here, and
// throw an exception to prevent the corruption from

View File: Bytes.java

@@ -236,27 +236,34 @@ public final class Bytes {
private IndexOutput datOut;
protected BytesRef bytesRef = new BytesRef();
private final Directory dir;
- private final String codecName;
+ private final String codecNameIdx;
+ private final String codecNameDat;
private final int version;
private final IOContext context;
- protected BytesWriterBase(Directory dir, String id, String codecName,
+ protected BytesWriterBase(Directory dir, String id, String codecNameIdx, String codecNameDat,
int version, Counter bytesUsed, IOContext context, Type type) throws IOException {
super(bytesUsed, type);
this.id = id;
this.dir = dir;
- this.codecName = codecName;
+ this.codecNameIdx = codecNameIdx;
+ this.codecNameDat = codecNameDat;
this.version = version;
this.context = context;
+ assert codecNameDat != null || codecNameIdx != null: "both codec names are null";
+ assert (codecNameDat != null && !codecNameDat.equals(codecNameIdx))
+     || (codecNameIdx != null && !codecNameIdx.equals(codecNameDat)):
+     "index and data codec names must not be equal";
}
protected IndexOutput getOrCreateDataOut() throws IOException {
if (datOut == null) {
boolean success = false;
+ assert codecNameDat != null;
try {
datOut = dir.createOutput(IndexFileNames.segmentFileName(id, DV_SEGMENT_SUFFIX,
DocValuesWriterBase.DATA_EXTENSION), context);
- CodecUtil.writeHeader(datOut, codecName, version);
+ CodecUtil.writeHeader(datOut, codecNameDat, version);
success = true;
} finally {
if (!success) {
@@ -279,9 +286,10 @@ public final class Bytes {
boolean success = false;
try {
if (idxOut == null) {
+ assert codecNameIdx != null;
idxOut = dir.createOutput(IndexFileNames.segmentFileName(id, DV_SEGMENT_SUFFIX,
DocValuesWriterBase.INDEX_EXTENSION), context);
- CodecUtil.writeHeader(idxOut, codecName, version);
+ CodecUtil.writeHeader(idxOut, codecNameIdx, version);
}
success = true;
} finally {
@@ -308,8 +316,8 @@ public final class Bytes {
protected final int version;
protected final String id;
protected final Type type;
- protected BytesReaderBase(Directory dir, String id, String codecName,
+ protected BytesReaderBase(Directory dir, String id, String codecNameIdx, String codecNameDat,
int maxVersion, boolean doIndex, IOContext context, Type type) throws IOException {
IndexInput dataIn = null;
IndexInput indexIn = null;
@@ -317,11 +325,11 @@ public final class Bytes {
try {
dataIn = dir.openInput(IndexFileNames.segmentFileName(id, DV_SEGMENT_SUFFIX,
DocValuesWriterBase.DATA_EXTENSION), context);
- version = CodecUtil.checkHeader(dataIn, codecName, maxVersion, maxVersion);
+ version = CodecUtil.checkHeader(dataIn, codecNameDat, maxVersion, maxVersion);
if (doIndex) {
indexIn = dir.openInput(IndexFileNames.segmentFileName(id, DV_SEGMENT_SUFFIX,
DocValuesWriterBase.INDEX_EXTENSION), context);
- final int version2 = CodecUtil.checkHeader(indexIn, codecName,
+ final int version2 = CodecUtil.checkHeader(indexIn, codecNameIdx,
maxVersion, maxVersion);
maxVersion, maxVersion);
assert version == version2;
}
@@ -377,23 +385,23 @@ public final class Bytes {
protected final boolean fasterButMoreRam;
protected long maxBytes = 0;
- protected DerefBytesWriterBase(Directory dir, String id, String codecName,
+ protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx, String codecNameDat,
int codecVersion, Counter bytesUsed, IOContext context, Type type)
throws IOException {
- this(dir, id, codecName, codecVersion, new DirectTrackingAllocator(
+ this(dir, id, codecNameIdx, codecNameDat, codecVersion, new DirectTrackingAllocator(
ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), bytesUsed, context, false, type);
}
- protected DerefBytesWriterBase(Directory dir, String id, String codecName,
+ protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx, String codecNameDat,
int codecVersion, Counter bytesUsed, IOContext context, boolean fasterButMoreRam, Type type)
throws IOException {
- this(dir, id, codecName, codecVersion, new DirectTrackingAllocator(
+ this(dir, id, codecNameIdx, codecNameDat, codecVersion, new DirectTrackingAllocator(
ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), bytesUsed, context, fasterButMoreRam,type);
}
- protected DerefBytesWriterBase(Directory dir, String id, String codecName, int codecVersion, Allocator allocator,
+ protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx, String codecNameDat, int codecVersion, Allocator allocator,
Counter bytesUsed, IOContext context, boolean fasterButMoreRam, Type type) throws IOException {
- super(dir, id, codecName, codecVersion, bytesUsed, context, type);
+ super(dir, id, codecNameIdx, codecNameDat, codecVersion, bytesUsed, context, type);
hash = new BytesRefHash(new ByteBlockPool(allocator),
BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray(
BytesRefHash.DEFAULT_CAPACITY, bytesUsed));
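
Splitting the single codecName into codecNameIdx/codecNameDat means each DocValues file's header now names its concrete file type, so a data file accidentally opened where an index file is expected fails the header check instead of being silently misread. A minimal sketch of that failure mode (hypothetical file and codec names):

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.*;
import org.apache.lucene.util.CodecUtil;

public class DistinctCodecNamesSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("foo.dat", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "FixedDerefBytesDat", 0); // data-file header
    out.close();
    IndexInput in = dir.openInput("foo.dat", IOContext.DEFAULT);
    try {
      // Expecting the index codec name: with distinct names this fails fast.
      CodecUtil.checkHeader(in, "FixedDerefBytesIdx", 0, 0);
    } catch (CorruptIndexException expected) {
      System.out.println("wrong file type detected: " + expected.getMessage());
    } finally {
      in.close();
    }
  }
}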

View File: FixedDerefBytesImpl.java

@@ -39,14 +39,16 @@ import org.apache.lucene.util.packed.PackedInts;
*/
class FixedDerefBytesImpl {
- static final String CODEC_NAME = "FixedDerefBytes";
+ static final String CODEC_NAME_IDX = "FixedDerefBytesIdx";
+ static final String CODEC_NAME_DAT = "FixedDerefBytesDat";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
public static class Writer extends DerefBytesWriterBase {
public Writer(Directory dir, String id, Counter bytesUsed, IOContext context)
throws IOException {
- super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context, Type.BYTES_FIXED_DEREF);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, Type.BYTES_FIXED_DEREF);
}
@Override
@@ -71,7 +73,7 @@ class FixedDerefBytesImpl {
private final int size;
private final int numValuesStored;
FixedDerefReader(Directory dir, String id, int maxDoc, IOContext context) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_START, true, context, Type.BYTES_FIXED_DEREF);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, Type.BYTES_FIXED_DEREF);
size = datIn.readInt();
numValuesStored = idxIn.readInt();
}

View File: FixedSortedBytesImpl.java

@@ -49,7 +49,8 @@ import org.apache.lucene.util.packed.PackedInts;
*/
class FixedSortedBytesImpl {
- static final String CODEC_NAME = "FixedSortedBytes";
+ static final String CODEC_NAME_IDX = "FixedSortedBytesIdx";
+ static final String CODEC_NAME_DAT = "FixedSortedBytesDat";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
@@ -58,7 +59,7 @@ class FixedSortedBytesImpl {
public Writer(Directory dir, String id, Comparator<BytesRef> comp,
Counter bytesUsed, IOContext context, boolean fasterButMoreRam) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context, fasterButMoreRam, Type.BYTES_FIXED_SORTED);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, fasterButMoreRam, Type.BYTES_FIXED_SORTED);
this.comp = comp;
}
@@ -127,7 +128,7 @@ class FixedSortedBytesImpl {
public Reader(Directory dir, String id, int maxDoc, IOContext context,
Type type, Comparator<BytesRef> comparator) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_START, true, context, type);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, type);
size = datIn.readInt();
valueCount = idxIn.readInt();
this.comparator = comparator;

View File: FixedStraightBytesImpl.java

@@ -61,14 +61,14 @@ class FixedStraightBytesImpl {
private final int byteBlockSize = BYTE_BLOCK_SIZE;
private final ByteBlockPool pool;
- protected FixedBytesWriterBase(Directory dir, String id, String codecName,
+ protected FixedBytesWriterBase(Directory dir, String id, String codecNameDat,
int version, Counter bytesUsed, IOContext context) throws IOException {
- this(dir, id, codecName, version, bytesUsed, context, Type.BYTES_FIXED_STRAIGHT);
+ this(dir, id, codecNameDat, version, bytesUsed, context, Type.BYTES_FIXED_STRAIGHT);
}
- protected FixedBytesWriterBase(Directory dir, String id, String codecName,
+ protected FixedBytesWriterBase(Directory dir, String id, String codecNameDat,
int version, Counter bytesUsed, IOContext context, Type type) throws IOException {
- super(dir, id, codecName, version, bytesUsed, context, type);
+ super(dir, id, null, codecNameDat, version, bytesUsed, context, type);
pool = new ByteBlockPool(new DirectTrackingAllocator(bytesUsed));
pool.nextBuffer();
}
@@ -139,8 +139,8 @@ class FixedStraightBytesImpl {
super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context);
}
- public Writer(Directory dir, String id, String codecName, int version, Counter bytesUsed, IOContext context) throws IOException {
- super(dir, id, codecName, version, bytesUsed, context);
+ public Writer(Directory dir, String id, String codecNameDat, int version, Counter bytesUsed, IOContext context) throws IOException {
+ super(dir, id, codecNameDat, version, bytesUsed, context);
}
@@ -268,8 +268,8 @@ class FixedStraightBytesImpl {
this(dir, id, CODEC_NAME, VERSION_CURRENT, maxDoc, context, Type.BYTES_FIXED_STRAIGHT);
}
- protected FixedStraightReader(Directory dir, String id, String codec, int version, int maxDoc, IOContext context, Type type) throws IOException {
- super(dir, id, codec, version, false, context, type);
+ protected FixedStraightReader(Directory dir, String id, String codecNameDat, int version, int maxDoc, IOContext context, Type type) throws IOException {
+ super(dir, id, null, codecNameDat, version, false, context, type);
size = datIn.readInt();
this.maxDoc = maxDoc;
}

View File: VarDerefBytesImpl.java

@@ -41,7 +41,9 @@ import org.apache.lucene.util.packed.PackedInts;
*/
class VarDerefBytesImpl {
- static final String CODEC_NAME = "VarDerefBytes";
+ static final String CODEC_NAME_IDX = "VarDerefBytesIdx";
+ static final String CODEC_NAME_DAT = "VarDerefBytesDat";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
@@ -57,7 +59,7 @@ class VarDerefBytesImpl {
static class Writer extends DerefBytesWriterBase {
public Writer(Directory dir, String id, Counter bytesUsed, IOContext context)
throws IOException {
- super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context, Type.BYTES_VAR_DEREF);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, Type.BYTES_VAR_DEREF);
size = 0;
}
@@ -93,7 +95,7 @@ class VarDerefBytesImpl {
public static class VarDerefReader extends BytesReaderBase {
private final long totalBytes;
VarDerefReader(Directory dir, String id, int maxDoc, IOContext context) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_START, true, context, Type.BYTES_VAR_DEREF);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, Type.BYTES_VAR_DEREF);
totalBytes = idxIn.readLong();
}

View File: VarSortedBytesImpl.java

@@ -50,7 +50,9 @@ import org.apache.lucene.util.packed.PackedInts;
*/
final class VarSortedBytesImpl {
- static final String CODEC_NAME = "VarDerefBytes";
+ static final String CODEC_NAME_IDX = "VarDerefBytesIdx";
+ static final String CODEC_NAME_DAT = "VarDerefBytesDat";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
@@ -59,7 +61,7 @@ final class VarSortedBytesImpl {
public Writer(Directory dir, String id, Comparator<BytesRef> comp,
Counter bytesUsed, IOContext context, boolean fasterButMoreRam) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context, fasterButMoreRam, Type.BYTES_VAR_SORTED);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, fasterButMoreRam, Type.BYTES_VAR_SORTED);
this.comp = comp;
size = 0;
}
@@ -154,7 +156,7 @@ final class VarSortedBytesImpl {
Reader(Directory dir, String id, int maxDoc,
IOContext context, Type type, Comparator<BytesRef> comparator)
throws IOException {
- super(dir, id, CODEC_NAME, VERSION_START, true, context, type);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, type);
this.comparator = comparator;
}

View File: VarStraightBytesImpl.java

@@ -50,7 +50,9 @@ import org.apache.lucene.util.packed.PackedInts;
*/
class VarStraightBytesImpl {
- static final String CODEC_NAME = "VarStraightBytes";
+ static final String CODEC_NAME_IDX = "VarStraightBytesIdx";
+ static final String CODEC_NAME_DAT = "VarStraightBytesDat";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
@@ -64,7 +66,7 @@ class VarStraightBytesImpl {
private boolean merge = false;
public Writer(Directory dir, String id, Counter bytesUsed, IOContext context)
throws IOException {
- super(dir, id, CODEC_NAME, VERSION_CURRENT, bytesUsed, context, Type.BYTES_VAR_STRAIGHT);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, Type.BYTES_VAR_STRAIGHT);
pool = new ByteBlockPool(new DirectTrackingAllocator(bytesUsed));
docToAddress = new long[1];
pool.nextBuffer(); // init
@@ -236,7 +238,7 @@ class VarStraightBytesImpl {
final int maxDoc;
VarStraightReader(Directory dir, String id, int maxDoc, IOContext context) throws IOException {
- super(dir, id, CODEC_NAME, VERSION_START, true, context, Type.BYTES_VAR_STRAIGHT);
+ super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, Type.BYTES_VAR_STRAIGHT);
this.maxDoc = maxDoc;
}