mirror of https://github.com/apache/lucene.git

commit 2d93f7e288 (parent 434da77559)

LUCENE-1708: optimize deletes & matching reader when merging stored fields & vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787827 13f79535-47bb-0310-9956-ffa450edef68
CHANGES.txt
@@ -112,6 +112,9 @@ Changes in runtime behavior
     rely on this behavior by the 3.0 release of Lucene. (Jonathan
     Mamou, Mark Miller via Mike McCandless)
 
+ 7. LUCENE-1708 - IndexReader.document() no longer checks if the document is
+    deleted. You can call IndexReader.isDeleted(n) prior to calling document(n).
+    (Shai Erera via Mike McCandless)
 
 API Changes
 
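The new contract in practice: code that walks stored fields must now skip deleted documents itself, since document(n) no longer does. A minimal sketch against the IndexReader API of this branch (the class and the "title" field name are made up for illustration):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

// Iterate stored fields, skipping deleted docs explicitly (LUCENE-1708).
public final class StoredFieldsDump {
  // "title" is an illustrative stored field name, not part of any real index.
  public static void dumpTitles(IndexReader reader) throws IOException {
    for (int n = 0; n < reader.maxDoc(); n++) {
      if (reader.isDeleted(n)) {
        continue; // the caller is now responsible for this check
      }
      Document doc = reader.document(n);
      System.out.println(doc.get("title"));
    }
  }
}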
common-build.xml
@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090624"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090623a"/>
   <property name="spec.version" value="${version}"/>
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>
IndexReader.java
@@ -819,8 +819,16 @@ public abstract class IndexReader implements Cloneable {
     return maxDoc() - numDocs();
   }
 
-  /** Returns the stored fields of the <code>n</code><sup>th</sup>
-   <code>Document</code> in this index.
+  /**
+   * Returns the stored fields of the <code>n</code><sup>th</sup>
+   * <code>Document</code> in this index.
+   * <p>
+   * <b>NOTE:</b> for performance reasons, this method does not check if the
+   * requested document is deleted, and therefore asking for a deleted document
+   * may yield unspecified results. Usually this is not required, however you
+   * can call {@link #isDeleted(int)} with the requested document ID to verify
+   * the document is not deleted.
+   *
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
@@ -830,30 +838,38 @@ public abstract class IndexReader implements Cloneable {
   }
 
   /**
-   * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position. The {@link org.apache.lucene.document.FieldSelector}
-   * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded.
-   *
-   * <b>NOTE:</b> If this Reader (more specifically, the underlying <code>FieldsReader</code>) is closed before the lazy {@link org.apache.lucene.document.Field} is
-   * loaded an exception may be thrown. If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing you must
-   * explicitly load it or fetch the Document again with a new loader.
+   * Get the {@link org.apache.lucene.document.Document} at the <code>n</code>
+   * <sup>th</sup> position. The {@link FieldSelector} may be used to determine
+   * what {@link org.apache.lucene.document.Field}s to load and how they should
+   * be loaded. <b>NOTE:</b> If this Reader (more specifically, the underlying
+   * <code>FieldsReader</code>) is closed before the lazy
+   * {@link org.apache.lucene.document.Field} is loaded an exception may be
+   * thrown. If you want the value of a lazy
+   * {@link org.apache.lucene.document.Field} to be available after closing you
+   * must explicitly load it or fetch the Document again with a new loader.
+   * <p>
+   * <b>NOTE:</b> for performance reasons, this method does not check if the
+   * requested document is deleted, and therefore asking for a deleted document
+   * may yield unspecified results. Usually this is not required, however you
+   * can call {@link #isDeleted(int)} with the requested document ID to verify
+   * the document is not deleted.
   *
    * @param n Get the document at the <code>n</code><sup>th</sup> position
-   * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document. May be null, in which case all Fields will be loaded.
-   * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
+   * @param fieldSelector The {@link FieldSelector} to use to determine what
+   *        Fields should be loaded on the Document. May be null, in which case
+   *        all Fields will be loaded.
+   * @return The stored fields of the
+   *         {@link org.apache.lucene.document.Document} at the nth position
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
-   *
    * @see org.apache.lucene.document.Fieldable
    * @see org.apache.lucene.document.FieldSelector
    * @see org.apache.lucene.document.SetBasedFieldSelector
    * @see org.apache.lucene.document.LoadFirstFieldSelector
    */
-  //When we convert to JDK 1.5 make this Set<String>
+  // TODO (1.5): When we convert to JDK 1.5 make this Set<String>
   public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;
 
 
 
   /** Returns true if document <i>n</i> has been deleted */
   public abstract boolean isDeleted(int n);
 
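The same caveat carries over to the FieldSelector overload documented above. A small sketch that eagerly loads a single stored field through SetBasedFieldSelector, checking isDeleted first; the helper class and the "title" field name are illustrative only:

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;

public final class SelectiveLoad {
  // Returns the document with only "title" loaded, or null if doc n is deleted.
  public static Document loadTitleOnly(IndexReader reader, int n) throws IOException {
    if (reader.isDeleted(n)) {
      return null; // document(n, selector) will not perform this check itself
    }
    Set fieldsToLoad = new HashSet();
    fieldsToLoad.add("title");
    FieldSelector selector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
    return reader.document(n, selector);
  }
}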
SegmentMerger.java
@@ -26,6 +26,8 @@ import java.util.List;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.index.IndexReader.FieldOption;
+import org.apache.lucene.index.MergePolicy.MergeAbortedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -55,7 +57,7 @@ final class SegmentMerger {
 
   private int mergedDocs;
 
-  private CheckAbort checkAbort;
+  private final CheckAbort checkAbort;
 
   // Whether we should merge doc stores (stored fields and
   // vectors files).  When all segments we are merging
@@ -75,13 +77,25 @@
   SegmentMerger(Directory dir, String name) {
     directory = dir;
     segment = name;
+    checkAbort = new CheckAbort(null, null) {
+      public void work(double units) throws MergeAbortedException {
+        // do nothing
+      }
+    };
   }
 
   SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) {
     directory = writer.getDirectory();
     segment = name;
-    if (merge != null)
+    if (merge != null) {
       checkAbort = new CheckAbort(merge, directory);
+    } else {
+      checkAbort = new CheckAbort(null, null) {
+        public void work(double units) throws MergeAbortedException {
+          // do nothing
+        }
+      };
+    }
     termIndexInterval = writer.getTermIndexInterval();
   }
 
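Both constructors now always install a CheckAbort, falling back to a do-nothing subclass when there is no merge to abort; that is what lets later hunks call checkAbort.work(...) unconditionally and drop the old "if (checkAbort != null)" guards. A standalone sketch of the same null-object idea (Abortable, NO_OP and Merger are illustrative names, not Lucene API):

// Always install a do-nothing collaborator so callers never need a null check.
final class Merger {
  interface Abortable {
    void work(double units);
  }

  private static final Abortable NO_OP = new Abortable() {
    public void work(double units) {
      // do nothing: stand-in used when there is no merge to abort
    }
  };

  private final Abortable checkAbort;

  Merger(Abortable checkAbort) {
    // never leave the field null; fall back to the no-op implementation
    this.checkAbort = (checkAbort != null) ? checkAbort : NO_OP;
  }

  void mergedSomeDocs(int numDocs) {
    // progress can now be reported unconditionally
    checkAbort.work(300.0 * numDocs);
  }
}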
@@ -152,9 +166,8 @@ final class SegmentMerger {
    * @throws IOException
    */
   final void closeReaders() throws IOException {
-    for (int i = 0; i < readers.size(); i++) {  // close readers
-      IndexReader reader = (IndexReader) readers.get(i);
-      reader.close();
+    for (Iterator iter = readers.iterator(); iter.hasNext();) {
+      ((IndexReader) iter.next()).close();
     }
   }
 
@@ -206,12 +219,17 @@ final class SegmentMerger {
     return files;
   }
 
-  private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                          boolean storeOffsetWithTermVector, boolean storePayloads, boolean omitTermFreqAndPositions) throws IOException {
+  private void addIndexed(IndexReader reader, FieldInfos fInfos,
+      Collection names, boolean storeTermVectors,
+      boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+      boolean storePayloads, boolean omitTFAndPositions)
+      throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String) i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads, omitTermFreqAndPositions);
+      fInfos.add(field, true, storeTermVectors,
+          storePositionWithTermVector, storeOffsetWithTermVector, !reader
+              .hasNorms(field), storePayloads, omitTFAndPositions);
     }
   }
 
@@ -223,24 +241,28 @@ final class SegmentMerger {
     // If the i'th reader is a SegmentReader and has
     // identical fieldName -> number mapping, then this
     // array will be non-null at position i:
-    matchingSegmentReaders = new SegmentReader[readers.size()];
+    int numReaders = readers.size();
+    matchingSegmentReaders = new SegmentReader[numReaders];
 
     // If this reader is a SegmentReader, and all of its
     // field name -> number mappings match the "merged"
     // FieldInfos, then we can do a bulk copy of the
     // stored fields:
-    for (int i = 0; i < readers.size(); i++) {
+    for (int i = 0; i < numReaders; i++) {
       IndexReader reader = (IndexReader) readers.get(i);
       if (reader instanceof SegmentReader) {
         SegmentReader segmentReader = (SegmentReader) reader;
         boolean same = true;
         FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
-        for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+        int numFieldInfos = segmentFieldInfos.size();
+        for (int j = 0; same && j < numFieldInfos; j++) {
           same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
-        if (same)
+        }
+        if (same) {
           matchingSegmentReaders[i] = segmentReader;
+        }
       }
     }
 
     // Used for bulk-reading raw bytes for stored fields
     rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
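Bulk copying is only attempted when a segment's field-number-to-name mapping is identical to the merged FieldInfos, which is what the loop above verifies. An illustrative, array-based restatement of that congruence test (FieldNumbering and sameFieldNumbering are made-up names, not Lucene API):

final class FieldNumbering {
  // A segment can be bulk-copied only if every field number j maps to the
  // same field name in both the merged mapping and the segment's mapping.
  static boolean sameFieldNumbering(String[] mergedNames, String[] segmentNames) {
    if (segmentNames.length > mergedNames.length) {
      return false; // guard added for the array form of this sketch
    }
    for (int j = 0; j < segmentNames.length; j++) {
      if (!mergedNames[j].equals(segmentNames[j])) {
        return false;
      }
    }
    return true;
  }
}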
@@ -268,23 +290,28 @@ final class SegmentMerger {
       fieldInfos = new FieldInfos();   // merge field names
     }
 
-    for (int i = 0; i < readers.size(); i++) {
-      IndexReader reader = (IndexReader) readers.get(i);
+    for (Iterator iter = readers.iterator(); iter.hasNext();) {
+      IndexReader reader = (IndexReader) iter.next();
       if (reader instanceof SegmentReader) {
         SegmentReader segmentReader = (SegmentReader) reader;
-        for (int j = 0; j < segmentReader.getFieldInfos().size(); j++) {
-          FieldInfo fi = segmentReader.getFieldInfos().fieldInfo(j);
-          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.hasNorms(fi.name), fi.storePayloads, fi.omitTermFreqAndPositions);
+        FieldInfos readerFieldInfos = segmentReader.getFieldInfos();
+        int numReaderFieldInfos = readerFieldInfos.size();
+        for (int j = 0; j < numReaderFieldInfos; j++) {
+          FieldInfo fi = readerFieldInfos.fieldInfo(j);
+          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector,
+              fi.storePositionWithTermVector, fi.storeOffsetWithTermVector,
+              !reader.hasNorms(fi.name), fi.storePayloads,
+              fi.omitTermFreqAndPositions);
         }
       } else {
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.OMIT_TF), false, false, false, false, true);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false, false);
-        fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false);
+        fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false);
       }
     }
     fieldInfos.write(directory, segment + ".fnm");
@@ -307,64 +334,23 @@ final class SegmentMerger {
     final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
 
     try {
-      for (int i = 0; i < readers.size(); i++) {
-        final IndexReader reader = (IndexReader) readers.get(i);
-        final SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
-        final FieldsReader matchingFieldsReader;
-        final boolean hasMatchingReader;
+      int idx = 0;
+      for (Iterator iter = readers.iterator(); iter.hasNext();) {
+        final IndexReader reader = (IndexReader) iter.next();
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+        FieldsReader matchingFieldsReader = null;
         if (matchingSegmentReader != null) {
           final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
-          if (fieldsReader != null && !fieldsReader.canReadRawDocs()) {
-            matchingFieldsReader = null;
-            hasMatchingReader = false;
-          } else {
+          if (fieldsReader != null && fieldsReader.canReadRawDocs()) {
             matchingFieldsReader = fieldsReader;
-            hasMatchingReader = true;
           }
+        }
+        if (reader.hasDeletions()) {
+          docCount += copyFieldsWithDeletions(fieldSelectorMerge, fieldsWriter,
+                                              reader, matchingFieldsReader);
         } else {
-          hasMatchingReader = false;
-          matchingFieldsReader = null;
-        }
-        final int maxDoc = reader.maxDoc();
-        final boolean hasDeletions = reader.hasDeletions();
-        for (int j = 0; j < maxDoc;) {
-          if (!hasDeletions || !reader.isDeleted(j)) { // skip deleted docs
-            if (hasMatchingReader) {
-              // We can optimize this case (doing a bulk
-              // byte copy) since the field numbers are
-              // identical
-              int start = j;
-              int numDocs = 0;
-              do {
-                j++;
-                numDocs++;
-                if (j >= maxDoc)
-                  break;
-                if (hasDeletions && matchingSegmentReader.isDeleted(j)) {
-                  j++;
-                  break;
-                }
-              } while(numDocs < MAX_RAW_MERGE_DOCS);
-
-              IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
-              fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
-              docCount += numDocs;
-              if (checkAbort != null)
-                checkAbort.work(300*numDocs);
-            } else {
-              // NOTE: it's very important to first assign
-              // to doc then pass it to
-              // termVectorsWriter.addAllDocVectors; see
-              // LUCENE-1282
-              Document doc = reader.document(j, fieldSelectorMerge);
-              fieldsWriter.addDocument(doc);
-              j++;
-              docCount++;
-              if (checkAbort != null)
-                checkAbort.work(300);
-            }
-          } else
-            j++;
+          docCount += copyFieldsNoDeletions(fieldSelectorMerge, fieldsWriter,
+                                            reader, matchingFieldsReader);
         }
       }
     } finally {
@@ -385,12 +371,89 @@ final class SegmentMerger {
     // If we are skipping the doc stores, that means there
     // are no deletions in any of these segments, so we
     // just sum numDocs() of each segment to get total docCount
-    for (int i = 0; i < readers.size(); i++)
-      docCount += ((IndexReader) readers.get(i)).numDocs();
+    for (Iterator iter = readers.iterator(); iter.hasNext();) {
+      docCount += ((IndexReader) iter.next()).numDocs();
+    }
 
     return docCount;
   }
 
+  private int copyFieldsWithDeletions(final FieldSelector fieldSelectorMerge,
+      final FieldsWriter fieldsWriter, final IndexReader reader,
+      final FieldsReader matchingFieldsReader)
+    throws IOException, MergeAbortedException, CorruptIndexException {
+    int docCount = 0;
+    final int maxDoc = reader.maxDoc();
+    if (matchingFieldsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      for (int j = 0; j < maxDoc;) {
+        if (reader.isDeleted(j)) {
+          // skip deleted docs
+          ++j;
+          continue;
+        }
+        // We can optimize this case (doing a bulk byte copy) since the field
+        // numbers are identical
+        int start = j, numDocs = 0;
+        do {
+          j++;
+          numDocs++;
+          if (j >= maxDoc) break;
+          if (reader.isDeleted(j)) {
+            j++;
+            break;
+          }
+        } while(numDocs < MAX_RAW_MERGE_DOCS);
+
+        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
+        fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
+        docCount += numDocs;
+        checkAbort.work(300 * numDocs);
+      }
+    } else {
+      for (int j = 0; j < maxDoc; j++) {
+        if (reader.isDeleted(j)) {
+          // skip deleted docs
+          continue;
+        }
+        // NOTE: it's very important to first assign to doc then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        Document doc = reader.document(j, fieldSelectorMerge);
+        fieldsWriter.addDocument(doc);
+        docCount++;
+        checkAbort.work(300);
+      }
+    }
+    return docCount;
+  }
+
+  private int copyFieldsNoDeletions(FieldSelector fieldSelectorMerge,
+      final FieldsWriter fieldsWriter, final IndexReader reader,
+      final FieldsReader matchingFieldsReader)
+    throws IOException, MergeAbortedException, CorruptIndexException {
+    final int maxDoc = reader.maxDoc();
+    int docCount = 0;
+    if (matchingFieldsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      while (docCount < maxDoc) {
+        int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len);
+        fieldsWriter.addRawDocuments(stream, rawDocLengths, len);
+        docCount += len;
+        checkAbort.work(300 * len);
+      }
+    } else {
+      for (; docCount < maxDoc; docCount++) {
+        // NOTE: it's very important to first assign to doc then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        Document doc = reader.document(docCount, fieldSelectorMerge);
+        fieldsWriter.addDocument(doc);
+        checkAbort.work(300);
+      }
+    }
+    return docCount;
+  }
+
   /**
    * Merge the TermVectors from each of the segments into the new one.
    * @throws IOException
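copyFieldsWithDeletions above turns each run of consecutive live documents into a single raw copy of at most MAX_RAW_MERGE_DOCS docs. A simplified, self-contained sketch of that run-detection loop (Run, RunConsumer and forEachLiveRun are made-up names, and deletions are modeled as a boolean array):

final class BulkRuns {
  static final class Run {
    final int start;
    final int length;
    Run(int start, int length) { this.start = start; this.length = length; }
  }

  interface RunConsumer {
    void copyRun(Run run);
  }

  // Walk the doc ids; for each run of consecutive non-deleted documents
  // (capped at maxRun), emit one bulk copy instead of one call per document.
  static void forEachLiveRun(boolean[] deleted, int maxRun, RunConsumer consumer) {
    final int maxDoc = deleted.length;
    for (int doc = 0; doc < maxDoc;) {
      if (deleted[doc]) {
        doc++;            // skip deleted docs one at a time
        continue;
      }
      int start = doc, numDocs = 0;
      do {                // extend the run while documents stay live
        doc++;
        numDocs++;
        if (doc >= maxDoc) break;
        if (deleted[doc]) {
          doc++;          // step past the deleted doc that ends the run
          break;
        }
      } while (numDocs < maxRun);
      consumer.copyRun(new Run(start, numDocs));
    }
  }
}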
@@ -400,65 +463,24 @@ final class SegmentMerger {
         new TermVectorsWriter(directory, segment, fieldInfos);
 
     try {
-      for (int r = 0; r < readers.size(); r++) {
-        final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
-        TermVectorsReader matchingVectorsReader;
-        final boolean hasMatchingReader;
+      int idx = 0;
+      for (Iterator iter = readers.iterator(); iter.hasNext();) {
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
+        TermVectorsReader matchingVectorsReader = null;
         if (matchingSegmentReader != null) {
-          matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+          TermVectorsReader vectorsReader = matchingSegmentReader.termVectorsReaderOrig;
 
-          // If the TV* files are an older format then they
-          // cannot read raw docs:
-          if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs()) {
-            matchingVectorsReader = null;
-            hasMatchingReader = false;
-          } else
-            hasMatchingReader = matchingVectorsReader != null;
-
+          // If the TV* files are an older format then they cannot read raw docs:
+          if (vectorsReader != null && vectorsReader.canReadRawDocs()) {
+            matchingVectorsReader = vectorsReader;
+          }
+        }
+        final IndexReader reader = (IndexReader) iter.next();
+        if (reader.hasDeletions()) {
+          copyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader);
         } else {
-          hasMatchingReader = false;
-          matchingVectorsReader = null;
-        }
-        IndexReader reader = (IndexReader) readers.get(r);
-        final boolean hasDeletions = reader.hasDeletions();
-        int maxDoc = reader.maxDoc();
-        for (int docNum = 0; docNum < maxDoc;) {
-          // skip deleted docs
-          if (!hasDeletions || !reader.isDeleted(docNum)) {
-            if (hasMatchingReader) {
-              // We can optimize this case (doing a bulk
-              // byte copy) since the field numbers are
-              // identical
-              int start = docNum;
-              int numDocs = 0;
-              do {
-                docNum++;
-                numDocs++;
-                if (docNum >= maxDoc)
-                  break;
-                if (hasDeletions && matchingSegmentReader.isDeleted(docNum)) {
-                  docNum++;
-                  break;
-                }
-              } while(numDocs < MAX_RAW_MERGE_DOCS);
-
-              matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
-              termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
-              if (checkAbort != null)
-                checkAbort.work(300*numDocs);
-            } else {
-              // NOTE: it's very important to first assign
-              // to vectors then pass it to
-              // termVectorsWriter.addAllDocVectors; see
-              // LUCENE-1282
-              TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
-              termVectorsWriter.addAllDocVectors(vectors);
-              docNum++;
-              if (checkAbort != null)
-                checkAbort.work(300);
-            }
-          } else
-            docNum++;
+          copyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader);
         }
       }
     } finally {
@@ -476,6 +498,78 @@ final class SegmentMerger {
       throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
     }
 
+  private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter,
+                                        final TermVectorsReader matchingVectorsReader,
+                                        final IndexReader reader)
+    throws IOException, MergeAbortedException {
+    final int maxDoc = reader.maxDoc();
+    if (matchingVectorsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      for (int docNum = 0; docNum < maxDoc;) {
+        if (reader.isDeleted(docNum)) {
+          // skip deleted docs
+          ++docNum;
+          continue;
+        }
+        // We can optimize this case (doing a bulk byte copy) since the field
+        // numbers are identical
+        int start = docNum, numDocs = 0;
+        do {
+          docNum++;
+          numDocs++;
+          if (docNum >= maxDoc) break;
+          if (reader.isDeleted(docNum)) {
+            docNum++;
+            break;
+          }
+        } while(numDocs < MAX_RAW_MERGE_DOCS);
+
+        matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+        termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+        checkAbort.work(300 * numDocs);
+      }
+    } else {
+      for (int docNum = 0; docNum < maxDoc; docNum++) {
+        if (reader.isDeleted(docNum)) {
+          // skip deleted docs
+          continue;
+        }
+
+        // NOTE: it's very important to first assign to vectors then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
+        termVectorsWriter.addAllDocVectors(vectors);
+        checkAbort.work(300);
+      }
+    }
+  }
+
+  private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter,
+                                      final TermVectorsReader matchingVectorsReader,
+                                      final IndexReader reader)
+    throws IOException, MergeAbortedException {
+    final int maxDoc = reader.maxDoc();
+    if (matchingVectorsReader != null) {
+      // We can bulk-copy because the fieldInfos are "congruent"
+      int docCount = 0;
+      while (docCount < maxDoc) {
+        int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
+        matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
+        termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
+        docCount += len;
+        checkAbort.work(300 * len);
+      }
+    } else {
+      for (int docNum = 0; docNum < maxDoc; docNum++) {
+        // NOTE: it's very important to first assign to vectors then pass it to
+        // termVectorsWriter.addAllDocVectors; see LUCENE-1282
+        TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
+        termVectorsWriter.addAllDocVectors(vectors);
+        checkAbort.work(300);
+      }
+    }
+  }
+
   private SegmentMergeQueue queue = null;
 
   private final void mergeTerms() throws CorruptIndexException, IOException {
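When a reader has no deletions at all, the new helpers simply stream the whole doc range in fixed-size chunks rather than scanning per document. A generic sketch of that chunking loop (Chunked, ChunkCopier and copyAll are made-up names):

final class Chunked {
  interface ChunkCopier {
    void copy(int firstDoc, int numDocs);
  }

  // Copy [0, maxDoc) in chunks of at most maxChunk docs, one raw copy per chunk.
  static int copyAll(int maxDoc, int maxChunk, ChunkCopier copier) {
    int docCount = 0;
    while (docCount < maxDoc) {
      int len = Math.min(maxChunk, maxDoc - docCount);
      copier.copy(docCount, len);
      docCount += len;
    }
    return docCount;
  }
}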
@@ -519,7 +613,7 @@ final class SegmentMerger {
       assert reader.numDocs() == reader.maxDoc() - smi.delCount;
 
       if (smi.next())
-        queue.put(smi);          // initialize queue
+        queue.add(smi);          // initialize queue
       else
         smi.close();
     }
@@ -551,13 +645,12 @@ final class SegmentMerger {
 
         int df = appendPostings(termsConsumer, match, matchSize);   // add new TermInfo
 
-        if (checkAbort != null)
          checkAbort.work(df/3.0);
 
         while (matchSize > 0) {
           SegmentMergeInfo smi = match[--matchSize];
           if (smi.next())
-            queue.put(smi);          // restore queue
+            queue.add(smi);          // restore queue
           else
             smi.close();             // done with a segment
         }
@@ -631,15 +724,16 @@ final class SegmentMerger {
     byte[] normBuffer = null;
     IndexOutput output = null;
     try {
-      for (int i = 0; i < fieldInfos.size(); i++) {
+      int numFieldInfos = fieldInfos.size();
+      for (int i = 0; i < numFieldInfos; i++) {
         FieldInfo fi = fieldInfos.fieldInfo(i);
         if (fi.isIndexed && !fi.omitNorms) {
           if (output == null) {
             output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
             output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
           }
-          for (int j = 0; j < readers.size(); j++) {
-            IndexReader reader = (IndexReader) readers.get(j);
+          for (Iterator iter = readers.iterator(); iter.hasNext();) {
+            IndexReader reader = (IndexReader) iter.next();
             int maxDoc = reader.maxDoc();
             if (normBuffer == null || normBuffer.length < maxDoc) {
               // the buffer is too small for the current segment
@@ -658,7 +752,6 @@ final class SegmentMerger {
             }
           }
         }
-        if (checkAbort != null)
          checkAbort.work(maxDoc);
       }
     }
@@ -670,7 +763,7 @@ final class SegmentMerger {
     }
   }
 
-  final static class CheckAbort {
+  static class CheckAbort {
     private double workCount;
     private MergePolicy.OneMerge merge;
     private Directory dir;
@@ -695,4 +788,5 @@ final class SegmentMerger {
       }
     }
   }
+
 }
SegmentReader.java
@@ -849,15 +849,8 @@ class SegmentReader extends IndexReader implements Cloneable {
     return fieldInfos;
   }
 
-  /**
-   * @throws CorruptIndexException if the index is corrupt
-   * @throws IOException if there is a low-level IO error
-   */
   public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
     ensureOpen();
-    if (isDeleted(n))
-      throw new IllegalArgumentException
-              ("attempt to access a deleted document");
     return getFieldsReader().doc(n, fieldSelector);
   }
 
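SegmentReader.document() no longer throws for deleted documents, so callers that depended on the old IllegalArgumentException need their own guard. A hedged sketch of such a wrapper (StrictDocs is not Lucene API; the message mirrors the removed one):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

public final class StrictDocs {
  // Restores the pre-LUCENE-1708 behavior for callers that want it.
  public static Document strictDocument(IndexReader reader, int n) throws IOException {
    if (reader.isDeleted(n)) {
      throw new IllegalArgumentException("attempt to access a deleted document");
    }
    return reader.document(n);
  }
}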
TestSegmentReader.java
@@ -81,12 +81,6 @@ public class TestSegmentReader extends LuceneTestCase {
     assertTrue(deleteReader.isDeleted(0) == true);
     assertTrue(deleteReader.hasDeletions() == true);
     assertTrue(deleteReader.numDocs() == 0);
-    try {
-      deleteReader.document(0);
-      fail();
-    } catch (IllegalArgumentException e) {
-      // expcected exception
-    }
   }
 
   public void testGetFieldNameVariations() {