mirror of https://github.com/apache/lucene.git
LUCENE-1267: record per-segment deletion count in segments file; add maxDoc() & numDocs() in IW; deprecate docCount() in favor of maxDoc()
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@651919 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
963ec9e522
commit
aecd167b05
|
@ -94,6 +94,9 @@ Bug fixes
|
|||
hitting an exception in readInternal, the buffer is incorrectly
|
||||
filled with stale bytes such that subsequent calls to readByte()
|
||||
return incorrect results. (Trejkaz via Mike McCandless)
|
||||
|
||||
8. LUCENE-1267: Added numDocs() and maxDoc() to IndexWriter;
|
||||
deprecated docCount(). (Mike McCandless)
|
||||
|
||||
New features
|
||||
|
||||
|
|
|
@ -119,9 +119,11 @@ public class CheckIndex {
|
|||
// able to create position=-1 when the very first
|
||||
// Token has positionIncrement 0
|
||||
allowMinusOnePosition = false;
|
||||
if (format == SegmentInfos.FORMAT_CHECKSUM) {
|
||||
if (format == SegmentInfos.FORMAT_CHECKSUM)
|
||||
sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
|
||||
} else if (format < SegmentInfos.FORMAT_CHECKSUM) {
|
||||
else if (format == SegmentInfos.FORMAT_DEL_COUNT)
|
||||
sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
|
||||
else if (format < SegmentInfos.CURRENT_FORMAT) {
|
||||
sFormat = "int=" + format + " [newer version of Lucene than this tool]";
|
||||
skip = true;
|
||||
} else {
|
||||
|
@ -178,10 +180,15 @@ public class CheckIndex {
|
|||
reader = SegmentReader.get(info);
|
||||
final int numDocs = reader.numDocs();
|
||||
toLoseDocCount = numDocs;
|
||||
if (reader.hasDeletions())
|
||||
if (reader.hasDeletions()) {
|
||||
if (info.docCount - numDocs != info.getDelCount())
|
||||
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
|
||||
out.println("OK [" + (info.docCount - numDocs) + " deleted docs]");
|
||||
else
|
||||
} else {
|
||||
if (info.getDelCount() != 0)
|
||||
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
|
||||
out.println("OK");
|
||||
}
|
||||
|
||||
out.print(" test: fields, norms.......");
|
||||
Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
|
||||
|
|
|
@ -1798,17 +1798,48 @@ public class IndexWriter {
|
|||
return analyzer;
|
||||
}
|
||||
|
||||
/** Returns the number of documents currently in this index. */
|
||||
/** Returns the number of documents currently in this
|
||||
* index, not counting deletions.
|
||||
* @deprecated Please use {@link #maxDoc()} (same as this
|
||||
* method) or {@link #numDocs()} (also takes deletions
|
||||
* into account), instead. */
|
||||
public synchronized int docCount() {
|
||||
ensureOpen();
|
||||
return maxDoc();
|
||||
}
|
||||
|
||||
/** Returns total number of docs in this index, including
|
||||
* docs not yet flushed (still in the RAM buffer),
|
||||
* not counting deletions.
|
||||
* @see #numDocs */
|
||||
public synchronized int maxDoc() {
|
||||
int count;
|
||||
if (docWriter != null)
|
||||
count = docWriter.getNumDocsInRAM();
|
||||
else
|
||||
count = 0;
|
||||
|
||||
for (int i = 0; i < segmentInfos.size(); i++)
|
||||
count += segmentInfos.info(i).docCount;
|
||||
return count;
|
||||
}
|
||||
|
||||
/** Returns total number of docs in this index, including
|
||||
* docs not yet flushed (still in the RAM buffer), and
|
||||
* including deletions. <b>NOTE:</b> buffered deletions
|
||||
* are not counted. If you really need these to be
|
||||
* counted you should call {@link #commit()} first.
|
||||
* @see #numDocs */
|
||||
public synchronized int numDocs() throws IOException {
|
||||
int count;
|
||||
if (docWriter != null)
|
||||
count = docWriter.getNumDocsInRAM();
|
||||
else
|
||||
count = 0;
|
||||
|
||||
for (int i = 0; i < segmentInfos.size(); i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
count += si.docCount;
|
||||
final SegmentInfo info = segmentInfos.info(i);
|
||||
count += info.docCount - info.getDelCount();
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
@ -3354,6 +3385,7 @@ public class IndexWriter {
|
|||
|
||||
BitVector deletes = null;
|
||||
int docUpto = 0;
|
||||
int delCount = 0;
|
||||
|
||||
final int numSegmentsToMerge = sourceSegments.size();
|
||||
for(int i=0;i<numSegmentsToMerge;i++) {
|
||||
|
@ -3390,8 +3422,10 @@ public class IndexWriter {
|
|||
if (previousDeletes.get(j))
|
||||
assert currentDeletes.get(j);
|
||||
else {
|
||||
if (currentDeletes.get(j))
|
||||
if (currentDeletes.get(j)) {
|
||||
deletes.set(docUpto);
|
||||
delCount++;
|
||||
}
|
||||
docUpto++;
|
||||
}
|
||||
}
|
||||
|
@ -3406,8 +3440,10 @@ public class IndexWriter {
|
|||
BitVector currentDeletes = new BitVector(directory, currentInfo.getDelFileName());
|
||||
|
||||
for(int j=0;j<docCount;j++) {
|
||||
if (currentDeletes.get(j))
|
||||
if (currentDeletes.get(j)) {
|
||||
deletes.set(docUpto);
|
||||
delCount++;
|
||||
}
|
||||
docUpto++;
|
||||
}
|
||||
|
||||
|
@ -3420,6 +3456,8 @@ public class IndexWriter {
|
|||
merge.info.advanceDelGen();
|
||||
message("commit merge deletes to " + merge.info.getDelFileName());
|
||||
deletes.write(directory, merge.info.getDelFileName());
|
||||
merge.info.setDelCount(delCount);
|
||||
assert delCount == deletes.count();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
@ -73,6 +74,9 @@ final class SegmentInfo {
|
|||
// other segments
|
||||
private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
|
||||
|
||||
private int delCount; // How many deleted docs in this segment, or -1 if not yet known
|
||||
// (if it's an older index)
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir) {
|
||||
this.name = name;
|
||||
this.docCount = docCount;
|
||||
|
@ -84,6 +88,7 @@ final class SegmentInfo {
|
|||
docStoreOffset = -1;
|
||||
docStoreSegment = name;
|
||||
docStoreIsCompoundFile = false;
|
||||
delCount = 0;
|
||||
}
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
|
||||
|
@ -99,6 +104,7 @@ final class SegmentInfo {
|
|||
this.docStoreOffset = docStoreOffset;
|
||||
this.docStoreSegment = docStoreSegment;
|
||||
this.docStoreIsCompoundFile = docStoreIsCompoundFile;
|
||||
delCount = 0;
|
||||
assert docStoreOffset == -1 || docStoreSegment != null;
|
||||
}
|
||||
|
||||
|
@ -122,6 +128,7 @@ final class SegmentInfo {
|
|||
}
|
||||
isCompoundFile = src.isCompoundFile;
|
||||
hasSingleNormFile = src.hasSingleNormFile;
|
||||
delCount = src.delCount;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -168,6 +175,11 @@ final class SegmentInfo {
|
|||
}
|
||||
isCompoundFile = input.readByte();
|
||||
preLockless = (isCompoundFile == CHECK_DIR);
|
||||
if (format <= SegmentInfos.FORMAT_DEL_COUNT) {
|
||||
delCount = input.readInt();
|
||||
assert delCount <= docCount;
|
||||
} else
|
||||
delCount = -1;
|
||||
} else {
|
||||
delGen = CHECK_DIR;
|
||||
normGen = null;
|
||||
|
@ -177,6 +189,7 @@ final class SegmentInfo {
|
|||
docStoreOffset = -1;
|
||||
docStoreIsCompoundFile = false;
|
||||
docStoreSegment = null;
|
||||
delCount = -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -263,6 +276,7 @@ final class SegmentInfo {
|
|||
SegmentInfo si = new SegmentInfo(name, docCount, dir);
|
||||
si.isCompoundFile = isCompoundFile;
|
||||
si.delGen = delGen;
|
||||
si.delCount = delCount;
|
||||
si.preLockless = preLockless;
|
||||
si.hasSingleNormFile = hasSingleNormFile;
|
||||
if (normGen != null) {
|
||||
|
@ -429,6 +443,23 @@ final class SegmentInfo {
|
|||
}
|
||||
}
|
||||
|
||||
int getDelCount() throws IOException {
|
||||
if (delCount == -1) {
|
||||
if (hasDeletions()) {
|
||||
final String delFileName = getDelFileName();
|
||||
delCount = new BitVector(dir, delFileName).count();
|
||||
} else
|
||||
delCount = 0;
|
||||
}
|
||||
assert delCount <= docCount;
|
||||
return delCount;
|
||||
}
|
||||
|
||||
void setDelCount(int delCount) {
|
||||
this.delCount = delCount;
|
||||
assert delCount <= docCount;
|
||||
}
|
||||
|
||||
int getDocStoreOffset() {
|
||||
return docStoreOffset;
|
||||
}
|
||||
|
@ -475,6 +506,7 @@ final class SegmentInfo {
|
|||
}
|
||||
}
|
||||
output.writeByte(isCompoundFile);
|
||||
output.writeInt(delCount);
|
||||
}
|
||||
|
||||
private void addIfExists(List files, String fileName) throws IOException {
|
||||
|
|
|
@ -61,8 +61,12 @@ final class SegmentInfos extends Vector {
|
|||
* ensure all bytes were successfully written. */
|
||||
public static final int FORMAT_CHECKSUM = -5;
|
||||
|
||||
/** This format adds the deletion count for each segment.
|
||||
* This way IndexWriter can efficiently report numDocs(). */
|
||||
public static final int FORMAT_DEL_COUNT = -6;
|
||||
|
||||
/* This must always point to the most recent file format. */
|
||||
private static final int CURRENT_FORMAT = FORMAT_CHECKSUM;
|
||||
static final int CURRENT_FORMAT = FORMAT_DEL_COUNT;
|
||||
|
||||
public int counter = 0; // used to name new segments
|
||||
/**
|
||||
|
|
|
@ -55,10 +55,12 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
private boolean deletedDocsDirty = false;
|
||||
private boolean normsDirty = false;
|
||||
private boolean undeleteAll = false;
|
||||
private int pendingDeleteCount;
|
||||
|
||||
private boolean rollbackDeletedDocsDirty = false;
|
||||
private boolean rollbackNormsDirty = false;
|
||||
private boolean rollbackUndeleteAll = false;
|
||||
private int rollbackPendingDeleteCount;
|
||||
|
||||
IndexInput freqStream;
|
||||
IndexInput proxStream;
|
||||
|
@ -351,11 +353,16 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
if (hasDeletions(si)) {
|
||||
deletedDocs = new BitVector(directory(), si.getDelFileName());
|
||||
|
||||
// Verify # deletes does not exceed maxDoc for this segment:
|
||||
if (deletedDocs.count() > maxDoc()) {
|
||||
throw new CorruptIndexException("number of deletes (" + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name);
|
||||
}
|
||||
}
|
||||
assert si.getDelCount() == deletedDocs.count() :
|
||||
"delete count mismatch: info=" + si.getDelCount() + " vs BitVector=" + deletedDocs.count();
|
||||
|
||||
// Verify # deletes does not exceed maxDoc for this
|
||||
// segment:
|
||||
assert si.getDelCount() <= maxDoc() :
|
||||
"delete count mismatch: " + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name;
|
||||
|
||||
} else
|
||||
assert si.getDelCount() == 0;
|
||||
}
|
||||
|
||||
protected synchronized DirectoryIndexReader doReopen(SegmentInfos infos) throws CorruptIndexException, IOException {
|
||||
|
@ -525,9 +532,12 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
// .tmp & renaming it) because the file is not live
|
||||
// until segments file is written:
|
||||
deletedDocs.write(directory(), si.getDelFileName());
|
||||
|
||||
si.setDelCount(si.getDelCount()+pendingDeleteCount);
|
||||
}
|
||||
if (undeleteAll && si.hasDeletions()) {
|
||||
si.clearDelGen();
|
||||
si.setDelCount(0);
|
||||
}
|
||||
if (normsDirty) { // re-write norms
|
||||
si.setNumFields(fieldInfos.size());
|
||||
|
@ -620,7 +630,8 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
deletedDocs = new BitVector(maxDoc());
|
||||
deletedDocsDirty = true;
|
||||
undeleteAll = false;
|
||||
deletedDocs.set(docNum);
|
||||
if (!deletedDocs.getAndSet(docNum))
|
||||
pendingDeleteCount++;
|
||||
}
|
||||
|
||||
protected void doUndeleteAll() {
|
||||
|
@ -1009,6 +1020,7 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
rollbackDeletedDocsDirty = deletedDocsDirty;
|
||||
rollbackNormsDirty = normsDirty;
|
||||
rollbackUndeleteAll = undeleteAll;
|
||||
rollbackPendingDeleteCount = pendingDeleteCount;
|
||||
Iterator it = norms.values().iterator();
|
||||
while (it.hasNext()) {
|
||||
Norm norm = (Norm) it.next();
|
||||
|
@ -1021,6 +1033,7 @@ class SegmentReader extends DirectoryIndexReader {
|
|||
deletedDocsDirty = rollbackDeletedDocsDirty;
|
||||
normsDirty = rollbackNormsDirty;
|
||||
undeleteAll = rollbackUndeleteAll;
|
||||
pendingDeleteCount = rollbackPendingDeleteCount;
|
||||
Iterator it = norms.values().iterator();
|
||||
while (it.hasNext()) {
|
||||
Norm norm = (Norm) it.next();
|
||||
|
|
|
@ -56,6 +56,25 @@ public final class BitVector {
|
|||
count = -1;
|
||||
}
|
||||
|
||||
/** Sets the value of <code>bit</code> to true, and
|
||||
* returns true if bit was already set */
|
||||
public final boolean getAndSet(int bit) {
|
||||
if (bit >= size) {
|
||||
throw new ArrayIndexOutOfBoundsException(bit);
|
||||
}
|
||||
final int pos = bit >> 3;
|
||||
final int v = bits[pos];
|
||||
final int flag = 1 << (bit & 7);
|
||||
if ((flag & v) != 0)
|
||||
return true;
|
||||
else {
|
||||
bits[pos] = (byte) (v | flag);
|
||||
if (count != -1)
|
||||
count++;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Sets the value of <code>bit</code> to zero. */
|
||||
public final void clear(int bit) {
|
||||
if (bit >= size) {
|
||||
|
|
|
@ -49,7 +49,11 @@ public class TestCheckIndex extends LuceneTestCase {
|
|||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
|
||||
CheckIndex.out = new PrintStream(bos);
|
||||
assertTrue(CheckIndex.check(dir, false, null));
|
||||
if (!CheckIndex.check(dir, false, null)) {
|
||||
System.out.println("CheckIndex failed");
|
||||
System.out.println(bos.toString());
|
||||
fail();
|
||||
}
|
||||
final List onlySegments = new ArrayList();
|
||||
onlySegments.add("_0");
|
||||
assertTrue(CheckIndex.check(dir, false, onlySegments));
|
||||
|
|
|
@ -104,8 +104,11 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
|
||||
// optimize the index and check that the new doc count is correct
|
||||
writer = new IndexWriter(dir, true, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
|
||||
assertEquals(100, writer.maxDoc());
|
||||
assertEquals(60, writer.numDocs());
|
||||
writer.optimize();
|
||||
assertEquals(60, writer.docCount());
|
||||
assertEquals(60, writer.maxDoc());
|
||||
assertEquals(60, writer.numDocs());
|
||||
writer.close();
|
||||
|
||||
// check that the index reader gives the same numbers.
|
||||
|
@ -117,7 +120,8 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
// make sure opening a new index for create over
|
||||
// this existing one works correctly:
|
||||
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
assertEquals(0, writer.docCount());
|
||||
assertEquals(0, writer.maxDoc());
|
||||
assertEquals(0, writer.numDocs());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
|
@ -3030,7 +3034,10 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
writer = new IndexWriter(dir,
|
||||
false, new StandardAnalyzer(),
|
||||
IndexWriter.MaxFieldLength.LIMITED);
|
||||
assertEquals(8, writer.numDocs());
|
||||
assertEquals(10, writer.maxDoc());
|
||||
writer.expungeDeletes();
|
||||
assertEquals(8, writer.numDocs());
|
||||
writer.close();
|
||||
ir = IndexReader.open(dir);
|
||||
assertEquals(8, ir.maxDoc());
|
||||
|
@ -3075,6 +3082,7 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
false, new StandardAnalyzer(),
|
||||
IndexWriter.MaxFieldLength.LIMITED);
|
||||
writer.setMergeFactor(3);
|
||||
assertEquals(49, writer.numDocs());
|
||||
writer.expungeDeletes();
|
||||
writer.close();
|
||||
ir = IndexReader.open(dir);
|
||||
|
|
Loading…
Reference in New Issue