LUCENE-1708: optimize deletes X matching reader when merging stored fields & vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787827 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-06-23 20:32:36 +00:00
parent 434da77559
commit 2d93f7e288
6 changed files with 276 additions and 176 deletions

View File

@@ -112,6 +112,9 @@ Changes in runtime behavior
    rely on this behavior by the 3.0 release of Lucene. (Jonathan
    Mamou, Mark Miller via Mike McCandless)

+ 7. LUCENE-1708 - IndexReader.document() no longer checks if the document is
+    deleted. You can call IndexReader.isDeleted(n) prior to calling document(n).
+    (Shai Erera via Mike McCandless)

 API Changes

View File

@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090624"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090623a"/>
   <property name="spec.version" value="${version}"/>
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>

View File

@@ -819,8 +819,16 @@ public abstract class IndexReader implements Cloneable {
     return maxDoc() - numDocs();
   }

-  /** Returns the stored fields of the <code>n</code><sup>th</sup>
-      <code>Document</code> in this index.
+  /**
+   * Returns the stored fields of the <code>n</code><sup>th</sup>
+   * <code>Document</code> in this index.
+   * <p>
+   * <b>NOTE:</b> for performance reasons, this method does not check if the
+   * requested document is deleted, and therefore asking for a deleted document
+   * may yield unspecified results. Usually this is not required, however you
+   * can call {@link #isDeleted(int)} with the requested document ID to verify
+   * the document is not deleted.
+   *
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
@@ -830,30 +838,38 @@ public abstract class IndexReader implements Cloneable {
   }

   /**
-   * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position. The {@link org.apache.lucene.document.FieldSelector}
-   * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded.
-   *
-   * <b>NOTE:</b> If this Reader (more specifically, the underlying <code>FieldsReader</code>) is closed before the lazy {@link org.apache.lucene.document.Field} is
-   * loaded an exception may be thrown. If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing you must
-   * explicitly load it or fetch the Document again with a new loader.
-   *
+   * Get the {@link org.apache.lucene.document.Document} at the <code>n</code>
+   * <sup>th</sup> position. The {@link FieldSelector} may be used to determine
+   * what {@link org.apache.lucene.document.Field}s to load and how they should
+   * be loaded. <b>NOTE:</b> If this Reader (more specifically, the underlying
+   * <code>FieldsReader</code>) is closed before the lazy
+   * {@link org.apache.lucene.document.Field} is loaded an exception may be
+   * thrown. If you want the value of a lazy
+   * {@link org.apache.lucene.document.Field} to be available after closing you
+   * must explicitly load it or fetch the Document again with a new loader.
+   * <p>
+   * <b>NOTE:</b> for performance reasons, this method does not check if the
+   * requested document is deleted, and therefore asking for a deleted document
+   * may yield unspecified results. Usually this is not required, however you
+   * can call {@link #isDeleted(int)} with the requested document ID to verify
+   * the document is not deleted.
    *
    * @param n Get the document at the <code>n</code><sup>th</sup> position
-   * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document. May be null, in which case all Fields will be loaded.
-   * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
+   * @param fieldSelector The {@link FieldSelector} to use to determine what
+   *        Fields should be loaded on the Document. May be null, in which case
+   *        all Fields will be loaded.
+   * @return The stored fields of the
+   *         {@link org.apache.lucene.document.Document} at the nth position
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
-   *
    * @see org.apache.lucene.document.Fieldable
    * @see org.apache.lucene.document.FieldSelector
    * @see org.apache.lucene.document.SetBasedFieldSelector
    * @see org.apache.lucene.document.LoadFirstFieldSelector
    */
-  //When we convert to JDK 1.5 make this Set<String>
+  // TODO (1.5): When we convert to JDK 1.5 make this Set<String>
   public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;

   /** Returns true if document <i>n</i> has been deleted */
   public abstract boolean isDeleted(int n);
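The new javadoc above shifts responsibility to the caller: document(n) no longer rejects deleted documents, so code that may touch deletions should test isDeleted(n) first. A minimal caller-side sketch of that pattern (illustrative only, not part of this commit; the helper class name and the Directory argument are assumptions):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;

// Hypothetical helper, for illustration only.
public class VisitAllLiveDocs {
  public static void visit(Directory dir) throws IOException {
    IndexReader reader = IndexReader.open(dir);
    try {
      int maxDoc = reader.maxDoc();
      for (int n = 0; n < maxDoc; n++) {
        if (reader.isDeleted(n)) {
          continue; // document(n) no longer rejects deleted docs, so skip them here
        }
        Document doc = reader.document(n); // safe: n is known to be live
        // ... use doc ...
      }
    } finally {
      reader.close();
    }
  }
}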

View File

@@ -26,6 +26,8 @@ import java.util.List;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.index.IndexReader.FieldOption;
+import org.apache.lucene.index.MergePolicy.MergeAbortedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -55,7 +57,7 @@ final class SegmentMerger {
   private int mergedDocs;

-  private CheckAbort checkAbort;
+  private final CheckAbort checkAbort;

   // Whether we should merge doc stores (stored fields and
   // vectors files).  When all segments we are merging
@@ -75,13 +77,25 @@ final class SegmentMerger {
   SegmentMerger(Directory dir, String name) {
     directory = dir;
     segment = name;
+    checkAbort = new CheckAbort(null, null) {
+      public void work(double units) throws MergeAbortedException {
+        // do nothing
+      }
+    };
   }

   SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) {
     directory = writer.getDirectory();
     segment = name;
-    if (merge != null)
+    if (merge != null) {
       checkAbort = new CheckAbort(merge, directory);
+    } else {
+      checkAbort = new CheckAbort(null, null) {
+        public void work(double units) throws MergeAbortedException {
+          // do nothing
+        }
+      };
+    }
     termIndexInterval = writer.getTermIndexInterval();
   }
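Both constructors now install a no-op CheckAbort rather than leaving the field null, which is why later hunks in this commit can drop the `if (checkAbort != null)` guards and call work() unconditionally. A standalone sketch of that null-object idea, using hypothetical names rather than Lucene's actual classes:

import java.io.IOException;

// Hypothetical stand-in for the merge-abort check, for illustration only.
abstract class AbortCheck {
  abstract void work(double units) throws IOException;

  // Null-object: callers invoke work() unconditionally and pay nothing
  // when there is no real merge to poll for abort.
  static final AbortCheck NONE = new AbortCheck() {
    void work(double units) {
      // do nothing
    }
  };
}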
@@ -152,9 +166,8 @@ final class SegmentMerger {
   * @throws IOException
   */
  final void closeReaders() throws IOException {
-    for (int i = 0; i < readers.size(); i++) {  // close readers
-      IndexReader reader = (IndexReader) readers.get(i);
-      reader.close();
+    for (Iterator iter = readers.iterator(); iter.hasNext();) {
+      ((IndexReader) iter.next()).close();
     }
   }
@@ -206,12 +219,17 @@ final class SegmentMerger {
     return files;
   }

-  private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                          boolean storeOffsetWithTermVector, boolean storePayloads, boolean omitTermFreqAndPositions) throws IOException {
+  private void addIndexed(IndexReader reader, FieldInfos fInfos,
+      Collection names, boolean storeTermVectors,
+      boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+      boolean storePayloads, boolean omitTFAndPositions)
+      throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String) i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads, omitTermFreqAndPositions);
+      fInfos.add(field, true, storeTermVectors,
+          storePositionWithTermVector, storeOffsetWithTermVector, !reader
+              .hasNorms(field), storePayloads, omitTFAndPositions);
     }
   }
@@ -223,24 +241,28 @@ final class SegmentMerger {
     // If the i'th reader is a SegmentReader and has
     // identical fieldName -> number mapping, then this
     // array will be non-null at position i:
-    matchingSegmentReaders = new SegmentReader[readers.size()];
+    int numReaders = readers.size();
+    matchingSegmentReaders = new SegmentReader[numReaders];

     // If this reader is a SegmentReader, and all of its
     // field name -> number mappings match the "merged"
     // FieldInfos, then we can do a bulk copy of the
     // stored fields:
-    for (int i = 0; i < readers.size(); i++) {
+    for (int i = 0; i < numReaders; i++) {
       IndexReader reader = (IndexReader) readers.get(i);
       if (reader instanceof SegmentReader) {
         SegmentReader segmentReader = (SegmentReader) reader;
         boolean same = true;
         FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
-        for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+        int numFieldInfos = segmentFieldInfos.size();
+        for (int j = 0; same && j < numFieldInfos; j++) {
           same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
-        if (same)
+        }
+        if (same) {
           matchingSegmentReaders[i] = segmentReader;
+        }
       }
     }

     // Used for bulk-reading raw bytes for stored fields
     rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
@@ -268,23 +290,28 @@ final class SegmentMerger {
       fieldInfos = new FieldInfos();  // merge field names
     }

-    for (int i = 0; i < readers.size(); i++) {
-      IndexReader reader = (IndexReader) readers.get(i);
+    for (Iterator iter = readers.iterator(); iter.hasNext();) {
+      IndexReader reader = (IndexReader) iter.next();
       if (reader instanceof SegmentReader) {
         SegmentReader segmentReader = (SegmentReader) reader;
-        for (int j = 0; j < segmentReader.getFieldInfos().size(); j++) {
-          FieldInfo fi = segmentReader.getFieldInfos().fieldInfo(j);
-          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.hasNorms(fi.name), fi.storePayloads, fi.omitTermFreqAndPositions);
+        FieldInfos readerFieldInfos = segmentReader.getFieldInfos();
+        int numReaderFieldInfos = readerFieldInfos.size();
+        for (int j = 0; j < numReaderFieldInfos; j++) {
+          FieldInfo fi = readerFieldInfos.fieldInfo(j);
+          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector,
+              fi.storePositionWithTermVector, fi.storeOffsetWithTermVector,
+              !reader.hasNorms(fi.name), fi.storePayloads,
+              fi.omitTermFreqAndPositions);
         }
       } else {
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.OMIT_TF), false, false, false, false, true);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false, false);
-        fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false);
+        fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false);
       }
     }
     fieldInfos.write(directory, segment + ".fnm");
@@ -307,64 +334,23 @@ final class SegmentMerger {
     final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
     try {
-      for (int i = 0; i < readers.size(); i++) {
-        final IndexReader reader = (IndexReader) readers.get(i);
-        final SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
-        final FieldsReader matchingFieldsReader;
-        final boolean hasMatchingReader;
+      int idx = 0;
+      for (Iterator iter = readers.iterator(); iter.hasNext();) {
+        final IndexReader reader = (IndexReader) iter.next();
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+        FieldsReader matchingFieldsReader = null;
         if (matchingSegmentReader != null) {
           final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
-          if (fieldsReader != null && !fieldsReader.canReadRawDocs()) {
-            matchingFieldsReader = null;
-            hasMatchingReader = false;
-          } else {
+          if (fieldsReader != null && fieldsReader.canReadRawDocs()) {
             matchingFieldsReader = fieldsReader;
-            hasMatchingReader = true;
           }
+        }
+        if (reader.hasDeletions()) {
+          docCount += copyFieldsWithDeletions(fieldSelectorMerge, fieldsWriter,
+                                              reader, matchingFieldsReader);
         } else {
-          hasMatchingReader = false;
-          matchingFieldsReader = null;
-        }
-        final int maxDoc = reader.maxDoc();
-        final boolean hasDeletions = reader.hasDeletions();
-        for (int j = 0; j < maxDoc;) {
-          if (!hasDeletions || !reader.isDeleted(j)) { // skip deleted docs
-            if (hasMatchingReader) {
-              // We can optimize this case (doing a bulk
-              // byte copy) since the field numbers are
-              // identical
-              int start = j;
-              int numDocs = 0;
-              do {
-                j++;
-                numDocs++;
-                if (j >= maxDoc)
-                  break;
-                if (hasDeletions && matchingSegmentReader.isDeleted(j)) {
-                  j++;
-                  break;
-                }
-              } while(numDocs < MAX_RAW_MERGE_DOCS);
-              IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
-              fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
-              docCount += numDocs;
-              if (checkAbort != null)
-                checkAbort.work(300*numDocs);
-            } else {
-              // NOTE: it's very important to first assign
-              // to doc then pass it to
-              // termVectorsWriter.addAllDocVectors; see
-              // LUCENE-1282
-              Document doc = reader.document(j, fieldSelectorMerge);
-              fieldsWriter.addDocument(doc);
-              j++;
-              docCount++;
-              if (checkAbort != null)
-                checkAbort.work(300);
-            }
-          } else
-            j++;
-        }
+          docCount += copyFieldsNoDeletions(fieldSelectorMerge, fieldsWriter,
+                                            reader, matchingFieldsReader);
+        }
       }
     } finally {
@@ -385,12 +371,89 @@ final class SegmentMerger {
       // If we are skipping the doc stores, that means there
       // are no deletions in any of these segments, so we
       // just sum numDocs() of each segment to get total docCount
-      for (int i = 0; i < readers.size(); i++)
-        docCount += ((IndexReader) readers.get(i)).numDocs();
+      for (Iterator iter = readers.iterator(); iter.hasNext();) {
+        docCount += ((IndexReader) iter.next()).numDocs();
+      }

     return docCount;
   }

+  private int copyFieldsWithDeletions(final FieldSelector fieldSelectorMerge,
+                                      final FieldsWriter fieldsWriter, final IndexReader reader,
+                                      final FieldsReader matchingFieldsReader)
+    throws IOException, MergeAbortedException, CorruptIndexException {
+    int docCount = 0;
+    final int maxDoc = reader.maxDoc();
+    if (matchingFieldsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      for (int j = 0; j < maxDoc;) {
+        if (reader.isDeleted(j)) {
+          // skip deleted docs
+          ++j;
+          continue;
+        }
+        // We can optimize this case (doing a bulk byte copy) since the field
+        // numbers are identical
+        int start = j, numDocs = 0;
+        do {
+          j++;
+          numDocs++;
+          if (j >= maxDoc) break;
+          if (reader.isDeleted(j)) {
+            j++;
+            break;
+          }
+        } while(numDocs < MAX_RAW_MERGE_DOCS);
+        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
+        fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
+        docCount += numDocs;
+        checkAbort.work(300 * numDocs);
+      }
+    } else {
+      for (int j = 0; j < maxDoc; j++) {
+        if (reader.isDeleted(j)) {
+          // skip deleted docs
+          continue;
+        }
+        // NOTE: it's very important to first assign to doc then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        Document doc = reader.document(j, fieldSelectorMerge);
+        fieldsWriter.addDocument(doc);
+        docCount++;
+        checkAbort.work(300);
+      }
+    }
+    return docCount;
+  }
+
+  private int copyFieldsNoDeletions(FieldSelector fieldSelectorMerge,
+                                    final FieldsWriter fieldsWriter, final IndexReader reader,
+                                    final FieldsReader matchingFieldsReader)
+    throws IOException, MergeAbortedException, CorruptIndexException {
+    final int maxDoc = reader.maxDoc();
+    int docCount = 0;
+    if (matchingFieldsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      while (docCount < maxDoc) {
+        int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len);
+        fieldsWriter.addRawDocuments(stream, rawDocLengths, len);
+        docCount += len;
+        checkAbort.work(300 * len);
+      }
+    } else {
+      for (; docCount < maxDoc; docCount++) {
+        // NOTE: it's very important to first assign to doc then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        Document doc = reader.document(docCount, fieldSelectorMerge);
+        fieldsWriter.addDocument(doc);
+        checkAbort.work(300);
+      }
+    }
+    return docCount;
+  }
+
   /**
    * Merge the TermVectors from each of the segments into the new one.
    * @throws IOException
@@ -400,65 +463,24 @@ final class SegmentMerger {
         new TermVectorsWriter(directory, segment, fieldInfos);
     try {
-      for (int r = 0; r < readers.size(); r++) {
-        final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
-        TermVectorsReader matchingVectorsReader;
-        final boolean hasMatchingReader;
+      int idx = 0;
+      for (Iterator iter = readers.iterator(); iter.hasNext();) {
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+        TermVectorsReader matchingVectorsReader = null;
         if (matchingSegmentReader != null) {
-          matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
-          // If the TV* files are an older format then they
-          // cannot read raw docs:
-          if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs()) {
-            matchingVectorsReader = null;
-            hasMatchingReader = false;
-          } else
-            hasMatchingReader = matchingVectorsReader != null;
+          TermVectorsReader vectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+          // If the TV* files are an older format then they cannot read raw docs:
+          if (vectorsReader != null && vectorsReader.canReadRawDocs()) {
+            matchingVectorsReader = vectorsReader;
+          }
+        }
+        final IndexReader reader = (IndexReader) iter.next();
+        if (reader.hasDeletions()) {
+          copyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader);
         } else {
-          hasMatchingReader = false;
-          matchingVectorsReader = null;
-        }
-        IndexReader reader = (IndexReader) readers.get(r);
-        final boolean hasDeletions = reader.hasDeletions();
-        int maxDoc = reader.maxDoc();
-        for (int docNum = 0; docNum < maxDoc;) {
-          // skip deleted docs
-          if (!hasDeletions || !reader.isDeleted(docNum)) {
-            if (hasMatchingReader) {
-              // We can optimize this case (doing a bulk
-              // byte copy) since the field numbers are
-              // identical
-              int start = docNum;
-              int numDocs = 0;
-              do {
-                docNum++;
-                numDocs++;
-                if (docNum >= maxDoc)
-                  break;
-                if (hasDeletions && matchingSegmentReader.isDeleted(docNum)) {
-                  docNum++;
-                  break;
-                }
-              } while(numDocs < MAX_RAW_MERGE_DOCS);
-              matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
-              termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
-              if (checkAbort != null)
-                checkAbort.work(300*numDocs);
-            } else {
-              // NOTE: it's very important to first assign
-              // to vectors then pass it to
-              // termVectorsWriter.addAllDocVectors; see
-              // LUCENE-1282
-              TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
-              termVectorsWriter.addAllDocVectors(vectors);
-              docNum++;
-              if (checkAbort != null)
-                checkAbort.work(300);
-            }
-          } else
-            docNum++;
-        }
+          copyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader);
+        }
       }
     } finally {
@@ -476,6 +498,78 @@ final class SegmentMerger {
       throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
   }

+  private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter,
+                                        final TermVectorsReader matchingVectorsReader,
+                                        final IndexReader reader)
+    throws IOException, MergeAbortedException {
+    final int maxDoc = reader.maxDoc();
+    if (matchingVectorsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      for (int docNum = 0; docNum < maxDoc;) {
+        if (reader.isDeleted(docNum)) {
+          // skip deleted docs
+          ++docNum;
+          continue;
+        }
+        // We can optimize this case (doing a bulk byte copy) since the field
+        // numbers are identical
+        int start = docNum, numDocs = 0;
+        do {
+          docNum++;
+          numDocs++;
+          if (docNum >= maxDoc) break;
+          if (reader.isDeleted(docNum)) {
+            docNum++;
+            break;
+          }
+        } while(numDocs < MAX_RAW_MERGE_DOCS);
+        matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+        termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+        checkAbort.work(300 * numDocs);
+      }
+    } else {
+      for (int docNum = 0; docNum < maxDoc; docNum++) {
+        if (reader.isDeleted(docNum)) {
+          // skip deleted docs
+          continue;
+        }
+        // NOTE: it's very important to first assign to vectors then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
+        termVectorsWriter.addAllDocVectors(vectors);
+        checkAbort.work(300);
+      }
+    }
+  }
+
+  private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter,
+                                      final TermVectorsReader matchingVectorsReader,
+                                      final IndexReader reader)
+      throws IOException, MergeAbortedException {
+    final int maxDoc = reader.maxDoc();
+    if (matchingVectorsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      int docCount = 0;
+      while (docCount < maxDoc) {
+        int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+        matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
+        termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
+        docCount += len;
+        checkAbort.work(300 * len);
+      }
+    } else {
+      for (int docNum = 0; docNum < maxDoc; docNum++) {
+        // NOTE: it's very important to first assign to vectors then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
+        termVectorsWriter.addAllDocVectors(vectors);
+        checkAbort.work(300);
+      }
+    }
+  }
+
   private SegmentMergeQueue queue = null;

   private final void mergeTerms() throws CorruptIndexException, IOException {
@@ -519,7 +613,7 @@ final class SegmentMerger {
       assert reader.numDocs() == reader.maxDoc() - smi.delCount;

       if (smi.next())
-        queue.put(smi);   // initialize queue
+        queue.add(smi);   // initialize queue
       else
         smi.close();
     }
@@ -551,13 +645,12 @@ final class SegmentMerger {
         int df = appendPostings(termsConsumer, match, matchSize);  // add new TermInfo

-        if (checkAbort != null)
-          checkAbort.work(df/3.0);
+        checkAbort.work(df/3.0);

         while (matchSize > 0) {
           SegmentMergeInfo smi = match[--matchSize];
           if (smi.next())
-            queue.put(smi);   // restore queue
+            queue.add(smi);   // restore queue
           else
             smi.close();      // done with a segment
         }
@@ -631,15 +724,16 @@ final class SegmentMerger {
     byte[] normBuffer = null;
     IndexOutput output = null;
     try {
-      for (int i = 0; i < fieldInfos.size(); i++) {
+      int numFieldInfos = fieldInfos.size();
+      for (int i = 0; i < numFieldInfos; i++) {
         FieldInfo fi = fieldInfos.fieldInfo(i);
         if (fi.isIndexed && !fi.omitNorms) {
           if (output == null) {
             output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
             output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
           }
-          for (int j = 0; j < readers.size(); j++) {
-            IndexReader reader = (IndexReader) readers.get(j);
+          for (Iterator iter = readers.iterator(); iter.hasNext();) {
+            IndexReader reader = (IndexReader) iter.next();
             int maxDoc = reader.maxDoc();
             if (normBuffer == null || normBuffer.length < maxDoc) {
               // the buffer is too small for the current segment
@@ -658,7 +752,6 @@ final class SegmentMerger {
             }
           }
         }
-        if (checkAbort != null)
         checkAbort.work(maxDoc);
       }
     }
@@ -670,7 +763,7 @@ final class SegmentMerger {
     }
   }

-  final static class CheckAbort {
+  static class CheckAbort {
    private double workCount;
    private MergePolicy.OneMerge merge;
    private Directory dir;
@@ -695,4 +788,5 @@ final class SegmentMerger {
       }
     }
   }
 }

View File

@@ -849,15 +849,8 @@ class SegmentReader extends IndexReader implements Cloneable {
     return fieldInfos;
   }

-  /**
-   * @throws CorruptIndexException if the index is corrupt
-   * @throws IOException if there is a low-level IO error
-   */
   public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
     ensureOpen();
-    if (isDeleted(n))
-      throw new IllegalArgumentException
-              ("attempt to access a deleted document");
     return getFieldsReader().doc(n, fieldSelector);
   }

View File

@@ -81,12 +81,6 @@ public class TestSegmentReader extends LuceneTestCase {
     assertTrue(deleteReader.isDeleted(0) == true);
     assertTrue(deleteReader.hasDeletions() == true);
     assertTrue(deleteReader.numDocs() == 0);
-    try {
-      deleteReader.document(0);
-      fail();
-    } catch (IllegalArgumentException e) {
-      // expcected exception
-    }
   }

   public void testGetFieldNameVariations() {