LUCENE-3423: add Terms.getDocCount statistic

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1169452 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-09-11 13:25:22 +00:00
parent ffb3cbee57
commit 7b2548df48
23 changed files with 225 additions and 17 deletions

View File

@ -492,6 +492,9 @@ New features
* LUCENE-3376: ReusableAnalyzerBase has been moved from modules/analysis/common * LUCENE-3376: ReusableAnalyzerBase has been moved from modules/analysis/common
into lucene/src/java/org/apache/lucene/analysis (Chris Male) into lucene/src/java/org/apache/lucene/analysis (Chris Male)
* LUCENE-3423: add Terms.getDocCount(), which returns the number of documents
that have at least one term for a field. (Yonik Seeley, Robert Muir)
Optimizations Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms * LUCENE-2588: Don't store unnecessary suffixes when writing the terms

View File

@ -401,6 +401,12 @@ public class InstantiatedIndexReader extends IndexReader {
return -1; return -1;
} }
// TODO: support this?
@Override
public int getDocCount() throws IOException {
return -1;
}
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator(); return BytesRef.getUTF8SortedAsUnicodeComparator();

View File

@ -846,6 +846,13 @@ public class MemoryIndex {
// each term has df=1 // each term has df=1
return info.sortedTerms.length; return info.sortedTerms.length;
} }
@Override
public int getDocCount() throws IOException {
return info.sortedTerms.length > 0 ? 1 : 0;
}
}; };
} }
} }

View File

@ -45,6 +45,7 @@ import org.apache.lucene.index.values.ValuesEnum;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
/** /**
@ -705,6 +706,7 @@ public class CheckIndex {
long sumTotalTermFreq = 0; long sumTotalTermFreq = 0;
long sumDocFreq = 0; long sumDocFreq = 0;
FixedBitSet visitedDocs = new FixedBitSet(reader.maxDoc());
while(true) { while(true) {
final BytesRef term = terms.next(); final BytesRef term = terms.next();
@ -766,6 +768,7 @@ public class CheckIndex {
if (doc == DocIdSetIterator.NO_MORE_DOCS) { if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break; break;
} }
visitedDocs.set(doc);
final int freq = docs2.freq(); final int freq = docs2.freq();
status.totPos += freq; status.totPos += freq;
totalTermFreq += freq; totalTermFreq += freq;
@ -810,6 +813,7 @@ public class CheckIndex {
docCount = 0; docCount = 0;
totalTermFreq = 0; totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visitedDocs.set(docsNoDel.docID());
docCount++; docCount++;
totalTermFreq += docsNoDel.freq(); totalTermFreq += docsNoDel.freq();
} }
@ -919,6 +923,13 @@ public class CheckIndex {
throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
} }
} }
if (fieldTerms != null) {
final int v = fieldTerms.getDocCount();
if (v != -1 && visitedDocs.cardinality() != v) {
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
}
}
// Test seek to last term: // Test seek to last term:
if (lastTerm != null) { if (lastTerm != null) {

View File

@ -109,6 +109,11 @@ public class FilterIndexReader extends IndexReader {
public long getSumDocFreq() throws IOException { public long getSumDocFreq() throws IOException {
return in.getSumDocFreq(); return in.getSumDocFreq();
} }
@Override
public int getDocCount() throws IOException {
return in.getDocCount();
}
} }
/** Base class for filtering {@link TermsEnum} implementations. */ /** Base class for filtering {@link TermsEnum} implementations. */

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BitVector; import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
// TODO: break into separate freq and prox writers as // TODO: break into separate freq and prox writers as
@ -261,6 +262,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final ByteSliceReader freq = new ByteSliceReader(); final ByteSliceReader freq = new ByteSliceReader();
final ByteSliceReader prox = new ByteSliceReader(); final ByteSliceReader prox = new ByteSliceReader();
FixedBitSet visitedDocs = new FixedBitSet(state.numDocs);
long sumTotalTermFreq = 0; long sumTotalTermFreq = 0;
long sumDocFreq = 0; long sumDocFreq = 0;
@ -347,6 +349,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// passes, ie first sweep marks all del docs, and // passes, ie first sweep marks all del docs, and
// 2nd sweep does the real flush, but I suspect // 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush. // that'd add too much time to flush.
visitedDocs.set(docID);
postingsConsumer.startDoc(docID, termDocFreq); postingsConsumer.startDoc(docID, termDocFreq);
if (docID < delDocLimit) { if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip // Mark it deleted. TODO: we could also skip
@ -408,7 +411,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
sumDocFreq += numDocs; sumDocFreq += numDocs;
} }
termsConsumer.finish(sumTotalTermFreq, sumDocFreq); termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
} }
} }

View File

@ -120,6 +120,19 @@ public final class MultiTerms extends Terms {
} }
return sum; return sum;
} }
@Override
public int getDocCount() throws IOException {
int sum = 0;
for(Terms terms : subs) {
final int v = terms.getDocCount();
if (v == -1) {
return -1;
}
sum += v;
}
return sum;
}
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {

View File

@ -174,6 +174,13 @@ public abstract class Terms {
* into account. */ * into account. */
public abstract long getSumDocFreq() throws IOException; public abstract long getSumDocFreq() throws IOException;
/** Returns the number of documents that have at least one
* term for this field, or -1 if this measure isn't
* stored by the codec. Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract int getDocCount() throws IOException;
/** /**
* Returns a thread-private {@link TermsEnum} instance. Obtaining * Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using * {@link TermsEnum} from this method might be more efficient than using

View File

@ -139,8 +139,9 @@ public class BlockTermsReader extends FieldsProducer {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong(); final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
assert !fields.containsKey(fieldInfo.name); assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq)); fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount));
} }
success = true; success = true;
} finally { } finally {
@ -243,14 +244,16 @@ public class BlockTermsReader extends FieldsProducer {
final long termsStartPointer; final long termsStartPointer;
final long sumTotalTermFreq; final long sumTotalTermFreq;
final long sumDocFreq; final long sumDocFreq;
final int docCount;
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) { FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
assert numTerms > 0; assert numTerms > 0;
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.numTerms = numTerms; this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer; this.termsStartPointer = termsStartPointer;
this.sumTotalTermFreq = sumTotalTermFreq; this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
} }
@Override @Override
@ -283,6 +286,11 @@ public class BlockTermsReader extends FieldsProducer {
return sumDocFreq; return sumDocFreq;
} }
@Override
public int getDocCount() throws IOException {
return docCount;
}
// Iterates through terms in this field // Iterates through terms in this field
private final class SegmentTermsEnum extends TermsEnum { private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in; private final IndexInput in;

View File

@ -134,6 +134,7 @@ public class BlockTermsWriter extends FieldsConsumer {
out.writeVLong(field.sumTotalTermFreq); out.writeVLong(field.sumTotalTermFreq);
} }
out.writeVLong(field.sumDocFreq); out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
} }
} }
writeTrailer(dirStart); writeTrailer(dirStart);
@ -160,6 +161,7 @@ public class BlockTermsWriter extends FieldsConsumer {
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq; long sumTotalTermFreq;
long sumDocFreq; long sumDocFreq;
int docCount;
private TermEntry[] pendingTerms; private TermEntry[] pendingTerms;
@ -234,7 +236,7 @@ public class BlockTermsWriter extends FieldsConsumer {
// Finishes all terms in this field // Finishes all terms in this field
@Override @Override
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (pendingCount > 0) { if (pendingCount > 0) {
flushBlock(); flushBlock();
} }
@ -243,6 +245,7 @@ public class BlockTermsWriter extends FieldsConsumer {
this.sumTotalTermFreq = sumTotalTermFreq; this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
fieldIndexWriter.finish(out.getFilePointer()); fieldIndexWriter.finish(out.getFilePointer());
} }

View File

@ -153,9 +153,10 @@ public class BlockTreeTermsReader extends FieldsProducer {
assert fieldInfo != null: "field=" + field; assert fieldInfo != null: "field=" + field;
final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong(); final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0; final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0;
assert !fields.containsKey(fieldInfo.name); assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, indexStartFP, indexIn)); fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn));
} }
success = true; success = true;
} finally { } finally {
@ -399,6 +400,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final FieldInfo fieldInfo; final FieldInfo fieldInfo;
final long sumTotalTermFreq; final long sumTotalTermFreq;
final long sumDocFreq; final long sumDocFreq;
final int docCount;
final long indexStartFP; final long indexStartFP;
final long rootBlockFP; final long rootBlockFP;
final BytesRef rootCode; final BytesRef rootCode;
@ -406,13 +408,14 @@ public class BlockTreeTermsReader extends FieldsProducer {
//private boolean DEBUG; //private boolean DEBUG;
FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, long indexStartFP, IndexInput indexIn) throws IOException { FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
assert numTerms > 0; assert numTerms > 0;
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
this.numTerms = numTerms; this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq; this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.indexStartFP = indexStartFP; this.indexStartFP = indexStartFP;
this.rootCode = rootCode; this.rootCode = rootCode;
// if (DEBUG) { // if (DEBUG) {
@ -475,6 +478,11 @@ public class BlockTreeTermsReader extends FieldsProducer {
return sumDocFreq; return sumDocFreq;
} }
@Override
public int getDocCount() throws IOException {
return docCount;
}
@Override @Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {

View File

@ -350,6 +350,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
private long numTerms; private long numTerms;
long sumTotalTermFreq; long sumTotalTermFreq;
long sumDocFreq; long sumDocFreq;
int docCount;
long indexStartFP; long indexStartFP;
// Used only to partition terms into the block tree; we // Used only to partition terms into the block tree; we
@ -866,7 +867,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// Finishes all terms in this field // Finishes all terms in this field
@Override @Override
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (numTerms > 0) { if (numTerms > 0) {
blockBuilder.finish(); blockBuilder.finish();
@ -878,6 +879,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
this.sumTotalTermFreq = sumTotalTermFreq; this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
// Write FST to index // Write FST to index
indexStartFP = indexOut.getFilePointer(); indexStartFP = indexOut.getFilePointer();
@ -929,6 +931,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
out.writeVLong(field.sumTotalTermFreq); out.writeVLong(field.sumTotalTermFreq);
} }
out.writeVLong(field.sumDocFreq); out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP); indexOut.writeVLong(field.indexStartFP);
} }
} }

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
/** /**
* @lucene.experimental * @lucene.experimental
@ -56,7 +57,7 @@ public abstract class PostingsConsumer {
/** Default merge impl: append documents, mapping around /** Default merge impl: append documents, mapping around
* deletes */ * deletes */
public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException { public TermStats merge(final MergeState mergeState, final DocsEnum postings, final FixedBitSet visitedDocs) throws IOException {
int df = 0; int df = 0;
long totTF = 0; long totTF = 0;
@ -67,6 +68,7 @@ public abstract class PostingsConsumer {
if (doc == DocIdSetIterator.NO_MORE_DOCS) { if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break; break;
} }
visitedDocs.set(doc);
final int freq = postings.freq(); final int freq = postings.freq();
this.startDoc(doc, freq); this.startDoc(doc, freq);
this.finishDoc(); this.finishDoc();
@ -80,6 +82,7 @@ public abstract class PostingsConsumer {
if (doc == DocIdSetIterator.NO_MORE_DOCS) { if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break; break;
} }
visitedDocs.set(doc);
final int freq = postingsEnum.freq(); final int freq = postingsEnum.freq();
this.startDoc(doc, freq); this.startDoc(doc, freq);
totTF += freq; totTF += freq;

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.MultiDocsEnum;
import org.apache.lucene.index.MultiDocsAndPositionsEnum; import org.apache.lucene.index.MultiDocsAndPositionsEnum;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
/** /**
* @lucene.experimental * @lucene.experimental
@ -42,7 +43,7 @@ public abstract class TermsConsumer {
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */ /** Called when we are done adding terms to this field */
public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException; public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
/** Return the BytesRef Comparator used to sort terms /** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */ * before feeding to this API. */
@ -59,6 +60,7 @@ public abstract class TermsConsumer {
long sumTotalTermFreq = 0; long sumTotalTermFreq = 0;
long sumDocFreq = 0; long sumDocFreq = 0;
long sumDFsinceLastAbortCheck = 0; long sumDFsinceLastAbortCheck = 0;
FixedBitSet visitedDocs = new FixedBitSet(mergeState.mergedDocCount);
if (mergeState.fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (mergeState.fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (docsEnum == null) { if (docsEnum == null) {
@ -75,7 +77,7 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) { if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn); docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term); final PostingsConsumer postingsConsumer = startTerm(term);
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); final TermStats stats = postingsConsumer.merge(mergeState, docsEnum, visitedDocs);
if (stats.docFreq > 0) { if (stats.docFreq > 0) {
finishTerm(term, stats); finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq; sumTotalTermFreq += stats.totalTermFreq;
@ -109,7 +111,7 @@ public abstract class TermsConsumer {
} }
} }
final PostingsConsumer postingsConsumer = startTerm(term); final PostingsConsumer postingsConsumer = startTerm(term);
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum); final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum, visitedDocs);
if (stats.docFreq > 0) { if (stats.docFreq > 0) {
finishTerm(term, stats); finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq; sumTotalTermFreq += stats.totalTermFreq;
@ -124,6 +126,6 @@ public abstract class TermsConsumer {
} }
} }
finish(sumTotalTermFreq, sumDocFreq); finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
} }
} }

View File

@ -220,7 +220,7 @@ public class MemoryCodec extends Codec {
} }
@Override @Override
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (termCount > 0) { if (termCount > 0) {
out.writeVInt(termCount); out.writeVInt(termCount);
out.writeVInt(field.number); out.writeVInt(field.number);
@ -228,6 +228,7 @@ public class MemoryCodec extends Codec {
out.writeVLong(sumTotalTermFreq); out.writeVLong(sumTotalTermFreq);
} }
out.writeVLong(sumDocFreq); out.writeVLong(sumDocFreq);
out.writeVInt(docCount);
builder.finish().save(out); builder.finish().save(out);
if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
} }
@ -682,6 +683,7 @@ public class MemoryCodec extends Codec {
private final long sumTotalTermFreq; private final long sumTotalTermFreq;
private final long sumDocFreq; private final long sumDocFreq;
private final int docCount;
private FST<BytesRef> fst; private FST<BytesRef> fst;
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
private final FieldInfo field; private final FieldInfo field;
@ -695,6 +697,7 @@ public class MemoryCodec extends Codec {
sumTotalTermFreq = -1; sumTotalTermFreq = -1;
} }
sumDocFreq = in.readVLong(); sumDocFreq = in.readVLong();
docCount = in.readVInt();
fst = new FST<BytesRef>(in, outputs); fst = new FST<BytesRef>(in, outputs);
} }
@ -709,6 +712,11 @@ public class MemoryCodec extends Codec {
return sumDocFreq; return sumDocFreq;
} }
@Override
public int getDocCount() throws IOException {
return docCount;
}
@Override @Override
public TermsEnum iterator() { public TermsEnum iterator() {
return new FSTTermsEnum(field, fst); return new FSTTermsEnum(field, fst);

View File

@ -249,6 +249,11 @@ public class PreFlexFields extends FieldsProducer {
public long getSumDocFreq() throws IOException { public long getSumDocFreq() throws IOException {
return -1; return -1;
} }
@Override
public int getDocCount() throws IOException {
return -1;
}
} }
private class PreTermsEnum extends TermsEnum { private class PreTermsEnum extends TermsEnum {

View File

@ -31,6 +31,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
@ -472,9 +473,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexOptions indexOptions; private final IndexOptions indexOptions;
private long sumTotalTermFreq; private long sumTotalTermFreq;
private long sumDocFreq; private long sumDocFreq;
private int docCount;
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst; private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
private int termCount; private int termCount;
private final BytesRef scratch = new BytesRef(10); private final BytesRef scratch = new BytesRef(10);
private final CharsRef scratchUTF16 = new CharsRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException { public SimpleTextTerms(String field, long termsStart) throws IOException {
this.termsStart = termsStart; this.termsStart = termsStart;
@ -494,6 +497,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
long lastDocsStart = -1; long lastDocsStart = -1;
int docFreq = 0; int docFreq = 0;
long totalTermFreq = 0; long totalTermFreq = 0;
OpenBitSet visitedDocs = new OpenBitSet();
while(true) { while(true) {
readLine(in, scratch); readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) { if (scratch.equals(END) || scratch.startsWith(FIELD)) {
@ -507,6 +511,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
} else if (scratch.startsWith(DOC)) { } else if (scratch.startsWith(DOC)) {
docFreq++; docFreq++;
sumDocFreq++; sumDocFreq++;
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
visitedDocs.set(docID);
} else if (scratch.startsWith(POS)) { } else if (scratch.startsWith(POS)) {
totalTermFreq++; totalTermFreq++;
} else if (scratch.startsWith(TERM)) { } else if (scratch.startsWith(TERM)) {
@ -528,6 +535,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
termCount++; termCount++;
} }
} }
docCount = (int) visitedDocs.cardinality();
fst = b.finish(); fst = b.finish();
/* /*
PrintStream ps = new PrintStream("out.dot"); PrintStream ps = new PrintStream("out.dot");
@ -566,6 +574,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
public long getSumDocFreq() throws IOException { public long getSumDocFreq() throws IOException {
return sumDocFreq; return sumDocFreq;
} }
@Override
public int getDocCount() throws IOException {
return docCount;
}
} }
@Override @Override

View File

@ -95,7 +95,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
} }
@Override @Override
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
} }
@Override @Override

View File

@ -196,7 +196,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
} }
@Override @Override
public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException { public void finish(long sumTotalTermCount, long sumDocFreq, int docCount) throws IOException {
} }
@Override @Override

View File

@ -99,6 +99,7 @@ public class TestExternalCodecs extends LuceneTestCase {
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>(); final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
long sumTotalTermFreq; long sumTotalTermFreq;
long sumDocFreq; long sumDocFreq;
int docCount;
RAMField(String field) { RAMField(String field) {
this.field = field; this.field = field;
@ -118,6 +119,11 @@ public class TestExternalCodecs extends LuceneTestCase {
public long getSumDocFreq() throws IOException { public long getSumDocFreq() throws IOException {
return sumDocFreq; return sumDocFreq;
} }
@Override
public int getDocCount() throws IOException {
return docCount;
}
@Override @Override
public TermsEnum iterator() { public TermsEnum iterator() {
@ -206,9 +212,10 @@ public class TestExternalCodecs extends LuceneTestCase {
} }
@Override @Override
public void finish(long sumTotalTermFreq, long sumDocFreq) { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) {
field.sumTotalTermFreq = sumTotalTermFreq; field.sumTotalTermFreq = sumTotalTermFreq;
field.sumDocFreq = sumDocFreq; field.sumDocFreq = sumDocFreq;
field.docCount = docCount;
} }
} }

View File

@ -42,6 +42,7 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -108,11 +109,15 @@ public class TestCodecs extends LuceneTestCase {
final TermsConsumer termsConsumer = consumer.addField(fieldInfo); final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
long sumTotalTermCount = 0; long sumTotalTermCount = 0;
long sumDF = 0; long sumDF = 0;
OpenBitSet visitedDocs = new OpenBitSet();
for (final TermData term : terms) { for (final TermData term : terms) {
for (int i = 0; i < term.docs.length; i++) {
visitedDocs.set(term.docs[i]);
}
sumDF += term.docs.length; sumDF += term.docs.length;
sumTotalTermCount += term.write(termsConsumer); sumTotalTermCount += term.write(termsConsumer);
} }
termsConsumer.finish(sumTotalTermCount, sumDF); termsConsumer.finish(sumTotalTermCount, sumDF, (int) visitedDocs.cardinality());
} }
} }

View File

@ -0,0 +1,83 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Tests the Terms.docCount statistic
*/
// Verifies Terms.getDocCount() (new in LUCENE-3423): for each field, the
// stored statistic must equal the number of distinct documents that contain
// at least one term for that field, recomputed here by brute force.
public class TestDocCount extends LuceneTestCase {
public void testSimple() throws Exception {
// The PreFlex (3.x) codec predates this statistic and reports -1,
// so this test is meaningless there.
assumeFalse("PreFlex codec does not support docCount statistic!",
"PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
int numDocs = atLeast(100);
for (int i = 0; i < numDocs; i++) {
iw.addDocument(doc());
}
// Check the multi-segment case first...
IndexReader ir = iw.getReader();
verifyCount(ir);
ir.close();
// ...then merge down and check again: the merged segment's docCount must
// be recomputed during merge, not naively summed across segments.
iw.optimize();
ir = iw.getReader();
verifyCount(ir);
ir.close();
iw.close();
dir.close();
}
// Builds a document with 1..10 fields whose names and values are random
// single letters 'a'..'z', so fields deliberately collide across documents
// and a given doc may or may not contribute to any particular field.
private Document doc() {
Document doc = new Document();
int numFields = _TestUtil.nextInt(random, 1, 10);
for (int i = 0; i < numFields; i++) {
doc.add(newField("" + _TestUtil.nextInt(random, 'a', 'z'), "" + _TestUtil.nextInt(random, 'a', 'z'), StringField.TYPE_UNSTORED));
}
return doc;
}
// Recomputes the per-field doc count by walking every term's postings and
// marking each visited docID in a bitset, then asserts the cardinality
// matches what the codec reported via Terms.getDocCount().
private void verifyCount(IndexReader ir) throws Exception {
Fields fields = MultiFields.getFields(ir);
if (fields == null) {
// Reader has no indexed fields; nothing to verify.
return;
}
FieldsEnum e = fields.iterator();
String field;
while ((field = e.next()) != null) {
Terms terms = fields.terms(field);
int docCount = terms.getDocCount();
FixedBitSet visited = new FixedBitSet(ir.maxDoc());
TermsEnum te = terms.iterator();
while (te.next() != null) {
DocsEnum de = te.docs(null, null);
while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visited.set(de.docID());
}
}
assertEquals(visited.cardinality(), docCount);
}
}
}

View File

@ -134,6 +134,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
public long getSumDocFreq() throws IOException { public long getSumDocFreq() throws IOException {
return -1; return -1;
} }
@Override
public int getDocCount() throws IOException {
return -1;
}
}); });
assert termsEnum != null; assert termsEnum != null;