LUCENE-1267: record per-segment deletion count in segments file; add maxDoc() & numDocs() in IW; deprecate docCount() in favor of maxDoc()

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@651919 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2008-04-27 11:14:10 +00:00
parent 963ec9e522
commit aecd167b05
9 changed files with 147 additions and 19 deletions

View File

@ -94,6 +94,9 @@ Bug fixes
hitting an exception in readInternal, the buffer is incorrectly
filled with stale bytes such that subsequent calls to readByte()
return incorrect results. (Trejkaz via Mike McCandless)
8. LUCENE-1267: Added numDocs() and maxDoc() to IndexWriter;
deprecated docCount(). (Mike McCandless)
New features

View File

@ -119,9 +119,11 @@ public class CheckIndex {
// able to create position=-1 when the very first
// Token has positionIncrement 0
allowMinusOnePosition = false;
if (format == SegmentInfos.FORMAT_CHECKSUM) {
if (format == SegmentInfos.FORMAT_CHECKSUM)
sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
} else if (format < SegmentInfos.FORMAT_CHECKSUM) {
else if (format == SegmentInfos.FORMAT_DEL_COUNT)
sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
else if (format < SegmentInfos.CURRENT_FORMAT) {
sFormat = "int=" + format + " [newer version of Lucene than this tool]";
skip = true;
} else {
@ -178,10 +180,15 @@ public class CheckIndex {
reader = SegmentReader.get(info);
final int numDocs = reader.numDocs();
toLoseDocCount = numDocs;
if (reader.hasDeletions())
if (reader.hasDeletions()) {
if (info.docCount - numDocs != info.getDelCount())
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
out.println("OK [" + (info.docCount - numDocs) + " deleted docs]");
else
} else {
if (info.getDelCount() != 0)
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
out.println("OK");
}
out.print(" test: fields, norms.......");
Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

View File

@ -1798,17 +1798,48 @@ public class IndexWriter {
return analyzer;
}
/** Returns the number of documents currently in this index. */
/** Returns the number of documents currently in this
* index, not counting deletions.
* @deprecated Please use {@link #maxDoc()} (same as this
* method) or {@link #numDocs()} (also takes deletions
* into account), instead. */
public synchronized int docCount() {
ensureOpen();
return maxDoc();
}
/** Returns total number of docs in this index, including
* docs not yet flushed (still in the RAM buffer),
* not counting deletions.
* @see #numDocs */
public synchronized int maxDoc() {
int count;
if (docWriter != null)
count = docWriter.getNumDocsInRAM();
else
count = 0;
for (int i = 0; i < segmentInfos.size(); i++)
count += segmentInfos.info(i).docCount;
return count;
}
/** Returns total number of docs in this index, including
* docs not yet flushed (still in the RAM buffer), and
* including deletions. <b>NOTE:</b> buffered deletions
* are not counted. If you really need these to be
* counted you should call {@link #commit()} first.
* @see #numDocs */
public synchronized int numDocs() throws IOException {
int count;
if (docWriter != null)
count = docWriter.getNumDocsInRAM();
else
count = 0;
for (int i = 0; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
count += si.docCount;
final SegmentInfo info = segmentInfos.info(i);
count += info.docCount - info.getDelCount();
}
return count;
}
@ -3354,6 +3385,7 @@ public class IndexWriter {
BitVector deletes = null;
int docUpto = 0;
int delCount = 0;
final int numSegmentsToMerge = sourceSegments.size();
for(int i=0;i<numSegmentsToMerge;i++) {
@ -3390,8 +3422,10 @@ public class IndexWriter {
if (previousDeletes.get(j))
assert currentDeletes.get(j);
else {
if (currentDeletes.get(j))
if (currentDeletes.get(j)) {
deletes.set(docUpto);
delCount++;
}
docUpto++;
}
}
@ -3406,8 +3440,10 @@ public class IndexWriter {
BitVector currentDeletes = new BitVector(directory, currentInfo.getDelFileName());
for(int j=0;j<docCount;j++) {
if (currentDeletes.get(j))
if (currentDeletes.get(j)) {
deletes.set(docUpto);
delCount++;
}
docUpto++;
}
@ -3420,6 +3456,8 @@ public class IndexWriter {
merge.info.advanceDelGen();
message("commit merge deletes to " + merge.info.getDelFileName());
deletes.write(directory, merge.info.getDelFileName());
merge.info.setDelCount(delCount);
assert delCount == deletes.count();
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BitVector;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
@ -73,6 +74,9 @@ final class SegmentInfo {
// other segments
private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
private int delCount; // How many deleted docs in this segment, or -1 if not yet known
// (if it's an older index)
public SegmentInfo(String name, int docCount, Directory dir) {
this.name = name;
this.docCount = docCount;
@ -84,6 +88,7 @@ final class SegmentInfo {
docStoreOffset = -1;
docStoreSegment = name;
docStoreIsCompoundFile = false;
delCount = 0;
}
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
@ -99,6 +104,7 @@ final class SegmentInfo {
this.docStoreOffset = docStoreOffset;
this.docStoreSegment = docStoreSegment;
this.docStoreIsCompoundFile = docStoreIsCompoundFile;
delCount = 0;
assert docStoreOffset == -1 || docStoreSegment != null;
}
@ -122,6 +128,7 @@ final class SegmentInfo {
}
isCompoundFile = src.isCompoundFile;
hasSingleNormFile = src.hasSingleNormFile;
delCount = src.delCount;
}
/**
@ -168,6 +175,11 @@ final class SegmentInfo {
}
isCompoundFile = input.readByte();
preLockless = (isCompoundFile == CHECK_DIR);
if (format <= SegmentInfos.FORMAT_DEL_COUNT) {
delCount = input.readInt();
assert delCount <= docCount;
} else
delCount = -1;
} else {
delGen = CHECK_DIR;
normGen = null;
@ -177,6 +189,7 @@ final class SegmentInfo {
docStoreOffset = -1;
docStoreIsCompoundFile = false;
docStoreSegment = null;
delCount = -1;
}
}
@ -263,6 +276,7 @@ final class SegmentInfo {
SegmentInfo si = new SegmentInfo(name, docCount, dir);
si.isCompoundFile = isCompoundFile;
si.delGen = delGen;
si.delCount = delCount;
si.preLockless = preLockless;
si.hasSingleNormFile = hasSingleNormFile;
if (normGen != null) {
@ -429,6 +443,23 @@ final class SegmentInfo {
}
}
int getDelCount() throws IOException {
if (delCount == -1) {
if (hasDeletions()) {
final String delFileName = getDelFileName();
delCount = new BitVector(dir, delFileName).count();
} else
delCount = 0;
}
assert delCount <= docCount;
return delCount;
}
void setDelCount(int delCount) {
this.delCount = delCount;
assert delCount <= docCount;
}
int getDocStoreOffset() {
return docStoreOffset;
}
@ -475,6 +506,7 @@ final class SegmentInfo {
}
}
output.writeByte(isCompoundFile);
output.writeInt(delCount);
}
private void addIfExists(List files, String fileName) throws IOException {

View File

@ -61,8 +61,12 @@ final class SegmentInfos extends Vector {
* ensure all bytes were successfully written. */
public static final int FORMAT_CHECKSUM = -5;
/** This format adds the deletion count for each segment.
* This way IndexWriter can efficiently report numDocs(). */
public static final int FORMAT_DEL_COUNT = -6;
/* This must always point to the most recent file format. */
private static final int CURRENT_FORMAT = FORMAT_CHECKSUM;
static final int CURRENT_FORMAT = FORMAT_DEL_COUNT;
public int counter = 0; // used to name new segments
/**

View File

@ -55,10 +55,12 @@ class SegmentReader extends DirectoryIndexReader {
private boolean deletedDocsDirty = false;
private boolean normsDirty = false;
private boolean undeleteAll = false;
private int pendingDeleteCount;
private boolean rollbackDeletedDocsDirty = false;
private boolean rollbackNormsDirty = false;
private boolean rollbackUndeleteAll = false;
private int rollbackPendingDeleteCount;
IndexInput freqStream;
IndexInput proxStream;
@ -351,11 +353,16 @@ class SegmentReader extends DirectoryIndexReader {
if (hasDeletions(si)) {
deletedDocs = new BitVector(directory(), si.getDelFileName());
// Verify # deletes does not exceed maxDoc for this segment:
if (deletedDocs.count() > maxDoc()) {
throw new CorruptIndexException("number of deletes (" + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name);
}
}
assert si.getDelCount() == deletedDocs.count() :
"delete count mismatch: info=" + si.getDelCount() + " vs BitVector=" + deletedDocs.count();
// Verify # deletes does not exceed maxDoc for this
// segment:
assert si.getDelCount() <= maxDoc() :
"delete count mismatch: " + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name;
} else
assert si.getDelCount() == 0;
}
protected synchronized DirectoryIndexReader doReopen(SegmentInfos infos) throws CorruptIndexException, IOException {
@ -525,9 +532,12 @@ class SegmentReader extends DirectoryIndexReader {
// .tmp & renaming it) because the file is not live
// until segments file is written:
deletedDocs.write(directory(), si.getDelFileName());
si.setDelCount(si.getDelCount()+pendingDeleteCount);
}
if (undeleteAll && si.hasDeletions()) {
si.clearDelGen();
si.setDelCount(0);
}
if (normsDirty) { // re-write norms
si.setNumFields(fieldInfos.size());
@ -620,7 +630,8 @@ class SegmentReader extends DirectoryIndexReader {
deletedDocs = new BitVector(maxDoc());
deletedDocsDirty = true;
undeleteAll = false;
deletedDocs.set(docNum);
if (!deletedDocs.getAndSet(docNum))
pendingDeleteCount++;
}
protected void doUndeleteAll() {
@ -1009,6 +1020,7 @@ class SegmentReader extends DirectoryIndexReader {
rollbackDeletedDocsDirty = deletedDocsDirty;
rollbackNormsDirty = normsDirty;
rollbackUndeleteAll = undeleteAll;
rollbackPendingDeleteCount = pendingDeleteCount;
Iterator it = norms.values().iterator();
while (it.hasNext()) {
Norm norm = (Norm) it.next();
@ -1021,6 +1033,7 @@ class SegmentReader extends DirectoryIndexReader {
deletedDocsDirty = rollbackDeletedDocsDirty;
normsDirty = rollbackNormsDirty;
undeleteAll = rollbackUndeleteAll;
pendingDeleteCount = rollbackPendingDeleteCount;
Iterator it = norms.values().iterator();
while (it.hasNext()) {
Norm norm = (Norm) it.next();

View File

@ -56,6 +56,25 @@ public final class BitVector {
count = -1;
}
/** Sets the value of <code>bit</code> to true, and
* returns true if bit was already set */
public final boolean getAndSet(int bit) {
if (bit >= size) {
throw new ArrayIndexOutOfBoundsException(bit);
}
final int pos = bit >> 3;
final int v = bits[pos];
final int flag = 1 << (bit & 7);
if ((flag & v) != 0)
return true;
else {
bits[pos] = (byte) (v | flag);
if (count != -1)
count++;
return false;
}
}
/** Sets the value of <code>bit</code> to zero. */
public final void clear(int bit) {
if (bit >= size) {

View File

@ -49,7 +49,11 @@ public class TestCheckIndex extends LuceneTestCase {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex.out = new PrintStream(bos);
assertTrue(CheckIndex.check(dir, false, null));
if (!CheckIndex.check(dir, false, null)) {
System.out.println("CheckIndex failed");
System.out.println(bos.toString());
fail();
}
final List onlySegments = new ArrayList();
onlySegments.add("_0");
assertTrue(CheckIndex.check(dir, false, onlySegments));

View File

@ -104,8 +104,11 @@ public class TestIndexWriter extends LuceneTestCase
// optimize the index and check that the new doc count is correct
writer = new IndexWriter(dir, true, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
assertEquals(100, writer.maxDoc());
assertEquals(60, writer.numDocs());
writer.optimize();
assertEquals(60, writer.docCount());
assertEquals(60, writer.maxDoc());
assertEquals(60, writer.numDocs());
writer.close();
// check that the index reader gives the same numbers.
@ -117,7 +120,8 @@ public class TestIndexWriter extends LuceneTestCase
// make sure opening a new index for create over
// this existing one works correctly:
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
assertEquals(0, writer.docCount());
assertEquals(0, writer.maxDoc());
assertEquals(0, writer.numDocs());
writer.close();
}
@ -3030,7 +3034,10 @@ public class TestIndexWriter extends LuceneTestCase
writer = new IndexWriter(dir,
false, new StandardAnalyzer(),
IndexWriter.MaxFieldLength.LIMITED);
assertEquals(8, writer.numDocs());
assertEquals(10, writer.maxDoc());
writer.expungeDeletes();
assertEquals(8, writer.numDocs());
writer.close();
ir = IndexReader.open(dir);
assertEquals(8, ir.maxDoc());
@ -3075,6 +3082,7 @@ public class TestIndexWriter extends LuceneTestCase
false, new StandardAnalyzer(),
IndexWriter.MaxFieldLength.LIMITED);
writer.setMergeFactor(3);
assertEquals(49, writer.numDocs());
writer.expungeDeletes();
writer.close();
ir = IndexReader.open(dir);