LUCENE-1120: speed up merging of term vectors by bulk-copying the raw bytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@615183 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2008-01-25 11:32:32 +00:00
parent ee835ccf21
commit 76dfb92a44
19 changed files with 331 additions and 148 deletions

View File

@ -1,4 +1,4 @@
Lucene Change Log Lucene Change Log
$Id$ $Id$
======================= Trunk (not yet released) ======================= ======================= Trunk (not yet released) =======================
@ -32,6 +32,10 @@ Optimizations
disk full situation before actually filling up the disk. (Mike disk full situation before actually filling up the disk. (Mike
McCandless) McCandless)
2. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
raw bytes for each contiguous range of non-deleted documents.
(Mike McCandless)
Documentation Documentation
Build Build

View File

@ -662,12 +662,12 @@ final class DocumentsWriter {
// Append term vectors to the real outputs: // Append term vectors to the real outputs:
if (tvx != null) { if (tvx != null) {
tvx.writeLong(tvd.getFilePointer()); tvx.writeLong(tvd.getFilePointer());
tvx.writeLong(tvf.getFilePointer());
tvd.writeVInt(numVectorFields); tvd.writeVInt(numVectorFields);
if (numVectorFields > 0) { if (numVectorFields > 0) {
for(int i=0;i<numVectorFields;i++) for(int i=0;i<numVectorFields;i++)
tvd.writeVInt(vectorFieldNumbers[i]); tvd.writeVInt(vectorFieldNumbers[i]);
assert 0 == vectorFieldPointers[0]; assert 0 == vectorFieldPointers[0];
tvd.writeVLong(tvf.getFilePointer());
long lastPos = vectorFieldPointers[0]; long lastPos = vectorFieldPointers[0];
for(int i=1;i<numVectorFields;i++) { for(int i=1;i<numVectorFields;i++) {
long pos = vectorFieldPointers[i]; long pos = vectorFieldPointers[i];
@ -870,22 +870,23 @@ final class DocumentsWriter {
// state: // state:
try { try {
tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION); tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION); tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION); tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
// We must "catch up" for all docIDs that had no // We must "catch up" for all docIDs that had no
// vectors before this one // vectors before this one
for(int i=0;i<docID;i++) for(int i=0;i<docID;i++) {
tvx.writeLong(0); tvx.writeLong(0);
tvx.writeLong(0);
}
} catch (Throwable t) { } catch (Throwable t) {
throw new AbortException(t, DocumentsWriter.this); throw new AbortException(t, DocumentsWriter.this);
} }
files = null; files = null;
} }
numVectorFields = 0; numVectorFields = 0;
} }
} }

View File

@ -205,6 +205,38 @@ final class SegmentMerger {
} }
} }
private SegmentReader[] matchingSegmentReaders;
private int[] rawDocLengths;
private int[] rawDocLengths2;
/**
 * Populates {@link #matchingSegmentReaders} and allocates the two
 * scratch length arrays used for raw (bulk) copying during a merge.
 *
 * Slot i of matchingSegmentReaders is left null unless the i'th
 * reader is a SegmentReader whose field names agree, number by
 * number, with the merged FieldInfos; only such readers are
 * eligible for a raw byte copy.
 */
private void setMatchingSegmentReaders() {
  final int readerCount = readers.size();
  matchingSegmentReaders = new SegmentReader[readerCount];

  for (int slot = 0; slot < readerCount; slot++) {
    final IndexReader candidate = (IndexReader) readers.elementAt(slot);
    if (!(candidate instanceof SegmentReader))
      continue;

    final SegmentReader segReader = (SegmentReader) candidate;
    final FieldInfos theirInfos = segReader.getFieldInfos();
    final int theirCount = theirInfos.size();

    // Walk forward while the name at each field number agrees;
    // stopping early means a mismatch was found.
    int agreed = 0;
    while (agreed < theirCount
           && fieldInfos.fieldName(agreed).equals(theirInfos.fieldName(agreed)))
      agreed++;

    if (agreed == theirCount)
      matchingSegmentReaders[slot] = segReader;
  }

  // Scratch buffers holding per-document byte lengths for raw copies
  rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
  rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
}
/** /**
* *
* @return The number of documents in all of the readers * @return The number of documents in all of the readers
@ -248,34 +280,10 @@ final class SegmentMerger {
int docCount = 0; int docCount = 0;
setMatchingSegmentReaders();
if (mergeDocStores) { if (mergeDocStores) {
// If the i'th reader is a SegmentReader and has
// identical fieldName -> number mapping, then this
// array will be non-null at position i:
SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
// If this reader is a SegmentReader, and all of its
// field name -> number mappings match the "merged"
// FieldInfos, then we can do a bulk copy of the
// stored fields:
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
if (reader instanceof SegmentReader) {
SegmentReader segmentReader = (SegmentReader) reader;
boolean same = true;
FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
for (int j = 0; same && j < segmentFieldInfos.size(); j++)
same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
if (same) {
matchingSegmentReaders[i] = segmentReader;
}
}
}
// Used for bulk-reading raw bytes for stored fields
final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
// in merge mode, we use this FieldSelector // in merge mode, we use this FieldSelector
FieldSelector fieldSelectorMerge = new FieldSelector() { FieldSelector fieldSelectorMerge = new FieldSelector() {
@ -350,15 +358,45 @@ final class SegmentMerger {
try { try {
for (int r = 0; r < readers.size(); r++) { for (int r = 0; r < readers.size(); r++) {
final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
TermVectorsReader matchingVectorsReader;
if (matchingSegmentReader != null) {
matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
// If the TV* files are an older format then they
// cannot read raw docs:
if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
matchingVectorsReader = null;
} else
matchingVectorsReader = null;
IndexReader reader = (IndexReader) readers.elementAt(r); IndexReader reader = (IndexReader) readers.elementAt(r);
int maxDoc = reader.maxDoc(); int maxDoc = reader.maxDoc();
for (int docNum = 0; docNum < maxDoc; docNum++) { for (int docNum = 0; docNum < maxDoc;) {
// skip deleted docs // skip deleted docs
if (reader.isDeleted(docNum)) if (!reader.isDeleted(docNum)) {
continue; if (matchingVectorsReader != null) {
termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum)); // We can optimize this case (doing a bulk
if (checkAbort != null) // byte copy) since the field numbers are
checkAbort.work(300); // identical
int start = docNum;
int numDocs = 0;
do {
docNum++;
numDocs++;
} while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
if (checkAbort != null)
checkAbort.work(300*numDocs);
} else {
termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
docNum++;
if (checkAbort != null)
checkAbort.work(300);
}
} else
docNum++;
} }
} }
} finally { } finally {

View File

@ -22,13 +22,18 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
/** /**
* @version $Id$ * @version $Id$
*/ */
class TermVectorsReader implements Cloneable { class TermVectorsReader implements Cloneable {
// NOTE: if you make a new format, it must be larger than
// the current format
static final int FORMAT_VERSION = 2; static final int FORMAT_VERSION = 2;
static final int FORMAT_VERSION2 = 3;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4; static final int FORMAT_SIZE = 4;
@ -41,13 +46,13 @@ class TermVectorsReader implements Cloneable {
private IndexInput tvd; private IndexInput tvd;
private IndexInput tvf; private IndexInput tvf;
private int size; private int size;
private int numTotalDocs;
// The docID offset where our docs begin in the index // The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file. // file. This will be 0 if we have our own private file.
private int docStoreOffset; private int docStoreOffset;
private int tvdFormat; private final int format;
private int tvfFormat;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws CorruptIndexException, IOException { throws CorruptIndexException, IOException {
@ -56,7 +61,7 @@ class TermVectorsReader implements Cloneable {
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize) TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
throws CorruptIndexException, IOException { throws CorruptIndexException, IOException {
this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0); this(d, segment, fieldInfos, readBufferSize, -1, 0);
} }
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size) TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@ -66,22 +71,35 @@ class TermVectorsReader implements Cloneable {
try { try {
if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) { if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize); tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
checkValidFormat(tvx); format = checkValidFormat(tvx);
tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize); tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
tvdFormat = checkValidFormat(tvd); final int tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize); tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
tvfFormat = checkValidFormat(tvf); final int tvfFormat = checkValidFormat(tvf);
assert format == tvdFormat;
assert format == tvfFormat;
if (format >= FORMAT_VERSION2) {
assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
numTotalDocs = (int) (tvx.length() >> 4);
} else {
assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
numTotalDocs = (int) (tvx.length() >> 3);
}
if (-1 == docStoreOffset) { if (-1 == docStoreOffset) {
this.docStoreOffset = 0; this.docStoreOffset = 0;
this.size = (int) (tvx.length() >> 3); this.size = numTotalDocs;
} else { } else {
this.docStoreOffset = docStoreOffset; this.docStoreOffset = docStoreOffset;
this.size = size; this.size = size;
// Verify the file is long enough to hold all of our // Verify the file is long enough to hold all of our
// docs // docs
assert ((int) (tvx.length() / 8)) >= size + docStoreOffset; assert numTotalDocs >= size + docStoreOffset;
} }
} } else
format = 0;
this.fieldInfos = fieldInfos; this.fieldInfos = fieldInfos;
success = true; success = true;
@ -96,26 +114,94 @@ class TermVectorsReader implements Cloneable {
} }
} }
} }
/** Returns the tvd input held by this reader; used for bulk copy when merging. */
IndexInput getTvdStream() {
  return this.tvd;
}
/** Returns the tvf input held by this reader; used for bulk copy when merging. */
IndexInput getTvfStream() {
  return this.tvf;
}
/**
 * Positions tvx at the index entry for docNum, taking docStoreOffset
 * into account. Entries are 8 bytes wide before FORMAT_VERSION2 and
 * 16 bytes wide from FORMAT_VERSION2 on; the FORMAT_SIZE-byte header
 * is skipped.
 */
final private void seekTvx(final int docNum) throws IOException {
  final long entryWidth = (format < FORMAT_VERSION2) ? 8L : 16L;
  tvx.seek((docNum + docStoreOffset) * entryWidth + FORMAT_SIZE);
}
/** Returns true when the files are at least FORMAT_VERSION2, which rawDocs requires. */
boolean canReadRawDocs() {
  final boolean olderFormat = format < FORMAT_VERSION2;
  return !olderFormat;
}
/** Retrieve the length (in bytes) of the tvd and tvf
 * entries for the next numDocs starting with
 * startDocID. This is used for bulk copying when
 * merging segments, if the field numbers are
 * congruent. Once this returns, the tvf & tvd streams
 * are positioned at the start of startDocID's data.
 *
 * @param tvdLengths receives the tvd byte length of each document
 * @param tvfLengths receives the tvf byte length of each document
 * @param startDocID first document, relative to this reader
 *        (docStoreOffset is applied internally)
 * @param numDocs how many consecutive documents to measure
 * @throws IllegalStateException if the files predate FORMAT_VERSION2
 */
final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {

  if (tvx == null) {
    // No term vectors in this segment: every document has length 0
    Arrays.fill(tvdLengths, 0);
    Arrays.fill(tvfLengths, 0);
    return;
  }

  // SegmentMerger calls canReadRawDocs() first and should
  // not call us if that returns false.
  if (format < FORMAT_VERSION2)
    throw new IllegalStateException("cannot read raw docs with older term vector formats");

  seekTvx(startDocID);

  long tvdPosition = tvx.readLong();
  tvd.seek(tvdPosition);

  long tvfPosition = tvx.readLong();
  tvf.seek(tvfPosition);

  long lastTvdPosition = tvdPosition;
  long lastTvfPosition = tvfPosition;

  int count = 0;
  while (count < numDocs) {
    // Absolute index (within the tvx file) of the entry FOLLOWING the
    // current document. seekTvx() applies docStoreOffset, and
    // numTotalDocs counts every entry in the file, so the offset must
    // be included here too; otherwise the last document of a shared
    // doc store would trigger a read past the end of tvx.
    final int docID = docStoreOffset + startDocID + count + 1;
    if (docID < numTotalDocs) {
      tvdPosition = tvx.readLong();
      tvfPosition = tvx.readLong();
    } else {
      // Last entry in the file: its data runs to the end of tvd/tvf
      tvdPosition = tvd.length();
      tvfPosition = tvf.length();
    }
    tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
    tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
    count++;
    lastTvdPosition = tvdPosition;
    lastTvfPosition = tvfPosition;
  }
}
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
{ {
int format = in.readInt(); int format = in.readInt();
if (format > FORMAT_VERSION) if (format > FORMAT_VERSION2) {
{
throw new CorruptIndexException("Incompatible format version: " + format + " expected " throw new CorruptIndexException("Incompatible format version: " + format + " expected "
+ FORMAT_VERSION + " or less"); + FORMAT_VERSION2 + " or less");
} }
return format; return format;
} }
void close() throws IOException { void close() throws IOException {
// make all effort to close up. Keep the first exception // make all effort to close up. Keep the first exception
// and throw it as a new one. // and throw it as a new one.
IOException keep = null; IOException keep = null;
if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace(); if (keep != null) throw (IOException) keep.fillInStackTrace();
} }
/** /**
@ -133,11 +219,11 @@ class TermVectorsReader implements Cloneable {
//We don't need to do this in other seeks because we already have the //We don't need to do this in other seeks because we already have the
// file pointer // file pointer
//that was written in another file //that was written in another file
tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE); seekTvx(docNum);
//System.out.println("TVX Pointer: " + tvx.getFilePointer()); //System.out.println("TVX Pointer: " + tvx.getFilePointer());
long position = tvx.readLong(); long tvdPosition = tvx.readLong();
tvd.seek(position); tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt(); int fieldCount = tvd.readVInt();
//System.out.println("Num Fields: " + fieldCount); //System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan // There are only a few fields per document. We opt for a full scan
@ -146,7 +232,7 @@ class TermVectorsReader implements Cloneable {
int number = 0; int number = 0;
int found = -1; int found = -1;
for (int i = 0; i < fieldCount; i++) { for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION) if (format >= FORMAT_VERSION)
number = tvd.readVInt(); number = tvd.readVInt();
else else
number += tvd.readVInt(); number += tvd.readVInt();
@ -159,8 +245,12 @@ class TermVectorsReader implements Cloneable {
// document // document
if (found != -1) { if (found != -1) {
// Compute position in the tvf file // Compute position in the tvf file
position = 0; long position;
for (int i = 0; i <= found; i++) if (format >= FORMAT_VERSION2)
position = tvx.readLong();
else
position = tvd.readVLong();
for (int i = 1; i <= found; i++)
position += tvd.readVLong(); position += tvd.readVLong();
mapper.setDocumentNumber(docNum); mapper.setDocumentNumber(docNum);
@ -190,6 +280,45 @@ class TermVectorsReader implements Cloneable {
return mapper.materializeVector(); return mapper.materializeVector();
} }
// Decodes fieldCount field numbers from tvd and maps each one to
// its field name. The caller must already have positioned tvd just
// past the field count. From FORMAT_VERSION on each number is stored
// as-is; in older files each value is a delta from the previous one.
final private String[] readFields(int fieldCount) throws IOException {
  final String[] names = new String[fieldCount];
  final boolean absoluteNumbers = format >= FORMAT_VERSION;
  int fieldNumber = 0;

  for (int slot = 0; slot < fieldCount; slot++) {
    final int stored = tvd.readVInt();
    fieldNumber = absoluteNumbers ? stored : fieldNumber + stored;
    names[slot] = fieldInfos.fieldName(fieldNumber);
  }

  return names;
}
// Builds the array of absolute tvf offsets for one document's
// fields. The caller must already have positioned tvx/tvd. The
// first offset comes from tvx (FORMAT_VERSION2 and later) or from
// tvd (older formats); every later offset is the previous one plus
// a VLong delta read from tvd.
final private long[] readTvfPointers(int fieldCount) throws IOException {
  long running = (format >= FORMAT_VERSION2) ? tvx.readLong() : tvd.readVLong();

  final long[] offsets = new long[fieldCount];
  offsets[0] = running;

  int slot = 1;
  while (slot < fieldCount) {
    running += tvd.readVLong();
    offsets[slot++] = running;
  }

  return offsets;
}
/** /**
* Return all term vectors stored for this document or null if they could not be read in. * Return all term vectors stored for this document or null if they could not be read in.
* *
@ -201,34 +330,16 @@ class TermVectorsReader implements Cloneable {
TermFreqVector[] result = null; TermFreqVector[] result = null;
if (tvx != null) { if (tvx != null) {
//We need to offset by //We need to offset by
tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE); seekTvx(docNum);
long position = tvx.readLong(); long tvdPosition = tvx.readLong();
tvd.seek(position); tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt(); int fieldCount = tvd.readVInt();
// No fields are vectorized for this document // No fields are vectorized for this document
if (fieldCount != 0) { if (fieldCount != 0) {
int number = 0; final String[] fields = readFields(fieldCount);
String[] fields = new String[fieldCount]; final long[] tvfPointers = readTvfPointers(fieldCount);
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
result = readTermVectors(docNum, fields, tvfPointers); result = readTermVectors(docNum, fields, tvfPointers);
} }
} else { } else {
@ -241,34 +352,17 @@ class TermVectorsReader implements Cloneable {
// Check if no term vectors are available for this segment at all // Check if no term vectors are available for this segment at all
if (tvx != null) { if (tvx != null) {
//We need to offset by //We need to offset by
tvx.seek((docNumber * 8L) + FORMAT_SIZE);
long position = tvx.readLong();
tvd.seek(position); seekTvx(docNumber);
long tvdPosition = tvx.readLong();
tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt(); int fieldCount = tvd.readVInt();
// No fields are vectorized for this document // No fields are vectorized for this document
if (fieldCount != 0) { if (fieldCount != 0) {
int number = 0; final String[] fields = readFields(fieldCount);
String[] fields = new String[fieldCount]; final long[] tvfPointers = readTvfPointers(fieldCount);
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
mapper.setDocumentNumber(docNumber); mapper.setDocumentNumber(docNumber);
readTermVectors(fields, tvfPointers, mapper); readTermVectors(fields, tvfPointers, mapper);
} }
@ -293,9 +387,8 @@ class TermVectorsReader implements Cloneable {
private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper) private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
throws IOException { throws IOException {
for (int i = 0; i < fields.length; i++) { for (int i = 0; i < fields.length; i++) {
readTermVector(fields[i], tvfPointers[i], mapper); readTermVector(fields[i], tvfPointers[i], mapper);
} }
} }
@ -324,7 +417,7 @@ class TermVectorsReader implements Cloneable {
boolean storePositions; boolean storePositions;
boolean storeOffsets; boolean storeOffsets;
if(tvfFormat == FORMAT_VERSION){ if (format >= FORMAT_VERSION){
byte bits = tvf.readByte(); byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@ -400,8 +493,6 @@ class TermVectorsReader implements Cloneable {
} }
} }
protected Object clone() { protected Object clone() {
if (tvx == null || tvd == null || tvf == null) if (tvx == null || tvd == null || tvf == null)
@ -418,11 +509,9 @@ class TermVectorsReader implements Cloneable {
return clone; return clone;
} }
} }
/** /**
* Models the existing parallel array structure * Models the existing parallel array structure
*/ */

View File

@ -33,11 +33,11 @@ final class TermVectorsWriter {
throws IOException { throws IOException {
// Open files for TermVector storage // Open files for TermVector storage
tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION); tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION); tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION); tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
this.fieldInfos = fieldInfos; this.fieldInfos = fieldInfos;
} }
@ -53,6 +53,7 @@ final class TermVectorsWriter {
throws IOException { throws IOException {
tvx.writeLong(tvd.getFilePointer()); tvx.writeLong(tvd.getFilePointer());
tvx.writeLong(tvf.getFilePointer());
if (vectors != null) { if (vectors != null) {
final int numFields = vectors.length; final int numFields = vectors.length;
@ -145,8 +146,8 @@ final class TermVectorsWriter {
} }
// 2nd pass: write field pointers to tvd // 2nd pass: write field pointers to tvd
long lastFieldPointer = 0; long lastFieldPointer = fieldPointers[0];
for (int i=0; i<numFields; i++) { for (int i=1; i<numFields; i++) {
final long fieldPointer = fieldPointers[i]; final long fieldPointer = fieldPointers[i];
tvd.writeVLong(fieldPointer-lastFieldPointer); tvd.writeVLong(fieldPointer-lastFieldPointer);
lastFieldPointer = fieldPointer; lastFieldPointer = fieldPointer;
@ -154,6 +155,28 @@ final class TermVectorsWriter {
} else } else
tvd.writeVInt(0); tvd.writeVInt(0);
} }
/**
 * Bulk-appends numDocs documents taken from reader onto this
 * writer's streams, used to speed up merging when the field
 * numbers are congruent. One tvx entry (tvd pointer, then tvf
 * pointer) is written per document, after which the combined
 * tvd and tvf byte ranges are copied in a single pass each.
 *
 * @param reader source whose tvd/tvf streams supply the bytes
 * @param tvdLengths per-document byte lengths in tvd
 * @param tvfLengths per-document byte lengths in tvf
 * @param numDocs number of documents to append
 */
final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
  final long tvdBase = tvd.getFilePointer();
  final long tvfBase = tvf.getFilePointer();

  long tvdCursor = tvdBase;
  long tvfCursor = tvfBase;
  int doc = 0;
  while (doc < numDocs) {
    tvx.writeLong(tvdCursor);
    tvdCursor += tvdLengths[doc];
    tvx.writeLong(tvfCursor);
    tvfCursor += tvfLengths[doc];
    doc++;
  }

  final long tvdBytes = tvdCursor - tvdBase;
  final long tvfBytes = tvfCursor - tvfBase;
  tvd.copyBytes(reader.getTvdStream(), tvdBytes);
  tvf.copyBytes(reader.getTvfStream(), tvfBytes);

  // After the copy, our output pointers must sit exactly where the
  // summed lengths said they would.
  assert tvd.getFilePointer() == tvdCursor;
  assert tvf.getFilePointer() == tvfCursor;
}
/** Close all streams. */ /** Close all streams. */
final void close() throws IOException { final void close() throws IOException {

View File

@ -18,10 +18,16 @@ package org.apache.lucene.index;
*/ */
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import java.util.Vector;
import java.util.Arrays; import java.util.Arrays;
import java.io.ByteArrayOutputStream; import java.util.Enumeration;
import java.io.ObjectOutputStream; import java.util.zip.ZipFile;
import java.util.zip.ZipEntry;
import java.io.OutputStream;
import java.io.InputStream;
import java.io.FileOutputStream;
import java.io.BufferedOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.File; import java.io.File;
@ -33,9 +39,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import java.io.*;
import java.util.*;
import java.util.zip.*;
/* /*
Verify we can read the pre-2.1 file format, do searches Verify we can read the pre-2.1 file format, do searches
@ -50,11 +53,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase
/* /*
public void testCreatePreLocklessCFS() throws IOException { public void testCreatePreLocklessCFS() throws IOException {
createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true); createIndex("src/test/org/apache/lucene/index/index.cfs", true);
} }
public void testCreatePreLocklessNoCFS() throws IOException { public void testCreatePreLocklessNoCFS() throws IOException {
createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false); createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
} }
*/ */
@ -106,10 +109,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
rmDir(dirName); rmDir(dirName);
} }
final String[] oldNames = {"prelockless.cfs", final String[] oldNames = {"19.cfs",
"prelockless.nocfs", "19.nocfs",
"presharedstores.cfs", "20.cfs",
"presharedstores.nocfs"}; "20.nocfs",
"21.cfs",
"21.nocfs",
"22.cfs",
"22.nocfs"};
public void testSearchOldIndex() throws IOException { public void testSearchOldIndex() throws IOException {
for(int i=0;i<oldNames.length;i++) { for(int i=0;i<oldNames.length;i++) {
@ -146,6 +153,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
} }
} }
/**
 * Asserts that hits contains exactly expectedCount entries, then
 * loads every hit's document and its term vectors through reader.
 */
private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
  final int total = hits.length();
  assertEquals("wrong number of hits", expectedCount, total);

  int rank = 0;
  while (rank < total) {
    hits.doc(rank);
    final int docId = hits.id(rank);
    reader.getTermFreqVectors(docId);
    rank++;
  }
}
public void searchIndex(String dirName) throws IOException { public void searchIndex(String dirName) throws IOException {
//QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer()); //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
//Query query = parser.parse("handle:1"); //Query query = parser.parse("handle:1");
@ -156,12 +172,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
IndexSearcher searcher = new IndexSearcher(dir); IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa"))); Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals(34, hits.length());
Document d = hits.doc(0);
// First document should be #21 since its norm was increased: // First document should be #21 since its norm was
// increased:
Document d = hits.doc(0);
assertEquals("didn't get the right document first", "21", d.get("id")); assertEquals("didn't get the right document first", "21", d.get("id"));
testHits(hits, 34, searcher.getIndexReader());
searcher.close(); searcher.close();
dir.close(); dir.close();
} }
@ -189,9 +207,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase
// make sure searching sees right # hits // make sure searching sees right # hits
IndexSearcher searcher = new IndexSearcher(dir); IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa"))); Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals("wrong number of hits", 44, hits.length());
Document d = hits.doc(0); Document d = hits.doc(0);
assertEquals("wrong first document", "21", d.get("id")); assertEquals("wrong first document", "21", d.get("id"));
testHits(hits, 44, searcher.getIndexReader());
searcher.close(); searcher.close();
// make sure we can do delete & setNorm against this // make sure we can do delete & setNorm against this
@ -209,6 +227,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 43, hits.length()); assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0); d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id")); assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 43, searcher.getIndexReader());
searcher.close(); searcher.close();
// optimize // optimize
@ -220,6 +239,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
hits = searcher.search(new TermQuery(new Term("content", "aaa"))); hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals("wrong number of hits", 43, hits.length()); assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0); d = hits.doc(0);
testHits(hits, 43, searcher.getIndexReader());
assertEquals("wrong first document", "22", d.get("id")); assertEquals("wrong first document", "22", d.get("id"));
searcher.close(); searcher.close();
@ -257,6 +277,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 33, hits.length()); assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0); d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id")); assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 33, searcher.getIndexReader());
searcher.close(); searcher.close();
// optimize // optimize
@ -269,6 +290,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 33, hits.length()); assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0); d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id")); assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 33, searcher.getIndexReader());
searcher.close(); searcher.close();
dir.close(); dir.close();
@ -283,6 +305,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
Directory dir = FSDirectory.getDirectory(dirName); Directory dir = FSDirectory.getDirectory(dirName);
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setUseCompoundFile(doCFS); writer.setUseCompoundFile(doCFS);
writer.setMaxBufferedDocs(10);
for(int i=0;i<35;i++) { for(int i=0;i<35;i++) {
addDoc(writer, i); addDoc(writer, i);
@ -393,6 +416,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED)); doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED)); doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc); writer.addDocument(doc);
} }

View File

@ -1937,9 +1937,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) { for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j)) if (reader.isDeleted(j))
numDel++; numDel++;
else else {
reader.document(j); reader.document(j);
reader.getTermFreqVectors(j); reader.getTermFreqVectors(j);
}
} }
reader.close(); reader.close();
@ -1963,9 +1964,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) { for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j)) if (reader.isDeleted(j))
numDel++; numDel++;
else else {
reader.document(j); reader.document(j);
reader.getTermFreqVectors(j); reader.getTermFreqVectors(j);
}
} }
reader.close(); reader.close();
assertEquals(0, numDel); assertEquals(0, numDel);
@ -2053,9 +2055,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) { for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j)) if (reader.isDeleted(j))
numDel++; numDel++;
else else {
reader.document(j); reader.document(j);
reader.getTermFreqVectors(j); reader.getTermFreqVectors(j);
}
} }
reader.close(); reader.close();
@ -2079,9 +2082,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) { for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j)) if (reader.isDeleted(j))
numDel++; numDel++;
else else {
reader.document(j); reader.document(j);
reader.getTermFreqVectors(j); reader.getTermFreqVectors(j);
}
} }
reader.close(); reader.close();
assertEquals(0, numDel); assertEquals(0, numDel);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.