diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6a37b135976..2d577d09af9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -492,6 +492,9 @@ New features * LUCENE-3376: ReusableAnalyzerBase has been moved from modules/analysis/common into lucene/src/java/org/apache/lucene/analysis (Chris Male) +* LUCENE-3423: add Terms.getDocCount(), which returns the number of documents + that have at least one term for a field. (Yonik Seeley, Robert Muir) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 817fbcea108..02e31190e80 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -401,6 +401,12 @@ public class InstantiatedIndexReader extends IndexReader { return -1; } + // TODO: support this? + @Override + public int getDocCount() throws IOException { + return -1; + } + @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index daa2c889f2e..7d6e050b60c 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -846,6 +846,13 @@ public class MemoryIndex { // each term has df=1 return info.sortedTerms.length; } + + @Override + public int getDocCount() throws IOException { + return info.sortedTerms.length > 0 ? 1 : 0; + } + + }; } } diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index ae2062ba717..29822034e67 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -45,6 +45,7 @@ import org.apache.lucene.index.values.ValuesEnum; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.StringHelper; /** @@ -705,6 +706,7 @@ public class CheckIndex { long sumTotalTermFreq = 0; long sumDocFreq = 0; + FixedBitSet visitedDocs = new FixedBitSet(reader.maxDoc()); while(true) { final BytesRef term = terms.next(); @@ -766,6 +768,7 @@ public class CheckIndex { if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } + visitedDocs.set(doc); final int freq = docs2.freq(); status.totPos += freq; totalTermFreq += freq; @@ -810,6 +813,7 @@ public class CheckIndex { docCount = 0; totalTermFreq = 0; while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + visitedDocs.set(docsNoDel.docID()); docCount++; totalTermFreq += docsNoDel.freq(); } @@ -919,6 +923,13 @@ public class CheckIndex { throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); } } + + if (fieldTerms != null) { + final int v = fieldTerms.getDocCount(); + if (v != -1 && visitedDocs.cardinality() != v) { + throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality()); + } + } // Test seek to last term: if (lastTerm != null) { diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java index 5d7f0a50270..7feccebe7c6 100644 --- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -109,6 +109,11 @@ public class FilterIndexReader extends IndexReader { public long getSumDocFreq() throws IOException { return in.getSumDocFreq(); } + + @Override + public int getDocCount() throws IOException { + return in.getDocCount(); + } } /** Base class for filtering {@link TermsEnum} implementations. */ diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 842f3c407dc..2705de82a84 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; // TODO: break into separate freq and prox writers as @@ -261,6 +262,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final ByteSliceReader freq = new ByteSliceReader(); final ByteSliceReader prox = new ByteSliceReader(); + FixedBitSet visitedDocs = new FixedBitSet(state.numDocs); long sumTotalTermFreq = 0; long sumDocFreq = 0; @@ -347,6 +349,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem // passes, ie first sweep marks all del docs, and // 2nd sweep does the real flush, but I suspect // that'd add too much time to flush. + visitedDocs.set(docID); postingsConsumer.startDoc(docID, termDocFreq); if (docID < delDocLimit) { // Mark it deleted. TODO: we could also skip @@ -408,7 +411,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem sumDocFreq += numDocs; } - termsConsumer.finish(sumTotalTermFreq, sumDocFreq); + termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); } } diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java index 168a142dd65..3d296f1cb8c 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -120,6 +120,19 @@ public final class MultiTerms extends Terms { } return sum; } + + @Override + public int getDocCount() throws IOException { + int sum = 0; + for(Terms terms : subs) { + final int v = terms.getDocCount(); + if (v == -1) { + return -1; + } + sum += v; + } + return sum; + } @Override public Comparator getComparator() { diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java index 6cdf1f353db..96c5c8f5660 100644 --- a/lucene/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/src/java/org/apache/lucene/index/Terms.java @@ -174,6 +174,13 @@ public abstract class Terms { * into account. */ public abstract long getSumDocFreq() throws IOException; + /** Returns the number of documents that have at least one + * term for this field, or -1 if this measure isn't + * stored by the codec. Note that, just like other term + * measures, this measure does not take deleted documents + * into account. */ + public abstract int getDocCount() throws IOException; + /** * Returns a thread-private {@link TermsEnum} instance. Obtaining * {@link TermsEnum} from this method might be more efficient than using diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java index ffd685c7c2e..6534109490b 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java @@ -139,8 +139,9 @@ public class BlockTermsReader extends FieldsProducer { final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); final long sumDocFreq = in.readVLong(); + final int docCount = in.readVInt(); assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq)); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount)); } success = true; } finally { @@ -243,14 +244,16 @@ public class BlockTermsReader extends FieldsProducer { final long termsStartPointer; final long sumTotalTermFreq; final long sumDocFreq; + final int docCount; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.termsStartPointer = termsStartPointer; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; + this.docCount = docCount; } @Override @@ -283,6 +286,11 @@ public class BlockTermsReader extends FieldsProducer { return sumDocFreq; } + @Override + public int getDocCount() throws IOException { + return docCount; + } + // Iterates through terms in this field private final class SegmentTermsEnum extends TermsEnum { private final IndexInput in; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java index a3ebed27a78..a07b251ed63 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java @@ -134,6 +134,7 @@ public class BlockTermsWriter extends FieldsConsumer { out.writeVLong(field.sumTotalTermFreq); } out.writeVLong(field.sumDocFreq); + out.writeVInt(field.docCount); } } writeTrailer(dirStart); @@ -160,6 +161,7 @@ public class BlockTermsWriter extends FieldsConsumer { private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; long sumTotalTermFreq; long sumDocFreq; + int docCount; private TermEntry[] pendingTerms; @@ -234,7 +236,7 @@ public class BlockTermsWriter extends FieldsConsumer { // Finishes all terms in this field @Override - public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { if (pendingCount > 0) { flushBlock(); } @@ -243,6 +245,7 @@ public class BlockTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; + this.docCount = docCount; fieldIndexWriter.finish(out.getFilePointer()); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java index b8610609c51..fc66d1ee4b7 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java @@ -153,9 +153,10 @@ public class BlockTreeTermsReader extends FieldsProducer { assert fieldInfo != null: "field=" + field; final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); final long sumDocFreq = in.readVLong(); + final int docCount = in.readVInt(); final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0; assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, indexStartFP, indexIn)); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn)); } success = true; } finally { @@ -399,6 +400,7 @@ public class BlockTreeTermsReader extends FieldsProducer { final FieldInfo fieldInfo; final long sumTotalTermFreq; final long sumDocFreq; + final int docCount; final long indexStartFP; final long rootBlockFP; final BytesRef rootCode; @@ -406,13 +408,14 @@ public class BlockTreeTermsReader extends FieldsProducer { //private boolean DEBUG; - FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, long indexStartFP, IndexInput indexIn) throws IOException { + FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); this.numTerms = numTerms; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; + this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; // if (DEBUG) { @@ -475,6 +478,11 @@ public class BlockTreeTermsReader extends FieldsProducer { return sumDocFreq; } + @Override + public int getDocCount() throws IOException { + return docCount; + } + @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java index d1536c41221..bee12eb5958 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java @@ -350,6 +350,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { private long numTerms; long sumTotalTermFreq; long sumDocFreq; + int docCount; long indexStartFP; // Used only to partition terms into the block tree; we @@ -866,7 +867,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // Finishes all terms in this field @Override - public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { if (numTerms > 0) { blockBuilder.finish(); @@ -878,6 +879,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; + this.docCount = docCount; // Write FST to index indexStartFP = indexOut.getFilePointer(); @@ -929,6 +931,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { out.writeVLong(field.sumTotalTermFreq); } out.writeVLong(field.sumDocFreq); + out.writeVInt(field.docCount); indexOut.writeVLong(field.indexStartFP); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java index 9d32183ae24..d1ab39aad66 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; /** * @lucene.experimental @@ -56,7 +57,7 @@ public abstract class PostingsConsumer { /** Default merge impl: append documents, mapping around * deletes */ - public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException { + public TermStats merge(final MergeState mergeState, final DocsEnum postings, final FixedBitSet visitedDocs) throws IOException { int df = 0; long totTF = 0; @@ -67,6 +68,7 @@ public abstract class PostingsConsumer { if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } + visitedDocs.set(doc); final int freq = postings.freq(); this.startDoc(doc, freq); this.finishDoc(); @@ -80,6 +82,7 @@ public abstract class PostingsConsumer { if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } + visitedDocs.set(doc); final int freq = postingsEnum.freq(); this.startDoc(doc, freq); totTF += freq; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java index 97bd4f427ae..99a38708bf4 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.MultiDocsEnum; import org.apache.lucene.index.MultiDocsAndPositionsEnum; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; /** * @lucene.experimental @@ -42,7 +43,7 @@ public abstract class TermsConsumer { public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; /** Called when we are done adding terms to this field */ - public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException; + public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException; /** Return the BytesRef Comparator used to sort terms * before feeding to this API. */ @@ -59,6 +60,7 @@ public abstract class TermsConsumer { long sumTotalTermFreq = 0; long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; + FixedBitSet visitedDocs = new FixedBitSet(mergeState.mergedDocCount); if (mergeState.fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (docsEnum == null) { @@ -75,7 +77,7 @@ public abstract class TermsConsumer { if (docsEnumIn != null) { docsEnum.reset(docsEnumIn); final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); + final TermStats stats = postingsConsumer.merge(mergeState, docsEnum, visitedDocs); if (stats.docFreq > 0) { finishTerm(term, stats); sumTotalTermFreq += stats.totalTermFreq; @@ -109,7 +111,7 @@ public abstract class TermsConsumer { } } final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum); + final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum, visitedDocs); if (stats.docFreq > 0) { finishTerm(term, stats); sumTotalTermFreq += stats.totalTermFreq; @@ -124,6 +126,6 @@ public abstract class TermsConsumer { } } - finish(sumTotalTermFreq, sumDocFreq); + finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index 029f6cac08d..a3a9407e42a 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -220,7 +220,7 @@ public class MemoryCodec extends Codec { } @Override - public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { if (termCount > 0) { out.writeVInt(termCount); out.writeVInt(field.number); @@ -228,6 +228,7 @@ public class MemoryCodec extends Codec { out.writeVLong(sumTotalTermFreq); } out.writeVLong(sumDocFreq); + out.writeVInt(docCount); builder.finish().save(out); if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); } @@ -682,6 +683,7 @@ public class MemoryCodec extends Codec { private final long sumTotalTermFreq; private final long sumDocFreq; + private final int docCount; private FST fst; private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); private final FieldInfo field; @@ -695,6 +697,7 @@ public class MemoryCodec extends Codec { sumTotalTermFreq = -1; } sumDocFreq = in.readVLong(); + docCount = in.readVInt(); fst = new FST(in, outputs); } @@ -709,6 +712,11 @@ public class MemoryCodec extends Codec { return sumDocFreq; } + @Override + public int getDocCount() throws IOException { + return docCount; + } + @Override public TermsEnum iterator() { return new FSTTermsEnum(field, fst); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index d9e5133d07c..1b16915a9ab 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -249,6 +249,11 @@ public class PreFlexFields extends FieldsProducer { public long getSumDocFreq() throws IOException { return -1; } + + @Override + public int getDocCount() throws IOException { + return -1; + } } private class PreTermsEnum extends TermsEnum { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 676b277e1b4..7919c4ea93e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -31,6 +31,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.BytesRefFSTEnum; @@ -472,9 +473,11 @@ class SimpleTextFieldsReader extends FieldsProducer { private final IndexOptions indexOptions; private long sumTotalTermFreq; private long sumDocFreq; + private int docCount; private FST>> fst; private int termCount; private final BytesRef scratch = new BytesRef(10); + private final CharsRef scratchUTF16 = new CharsRef(10); public SimpleTextTerms(String field, long termsStart) throws IOException { this.termsStart = termsStart; @@ -494,6 +497,7 @@ class SimpleTextFieldsReader extends FieldsProducer { long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; + OpenBitSet visitedDocs = new OpenBitSet(); while(true) { readLine(in, scratch); if (scratch.equals(END) || scratch.startsWith(FIELD)) { @@ -507,6 +511,9 @@ class SimpleTextFieldsReader extends FieldsProducer { } else if (scratch.startsWith(DOC)) { docFreq++; sumDocFreq++; + UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16); + int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); + visitedDocs.set(docID); } else if (scratch.startsWith(POS)) { totalTermFreq++; } else if (scratch.startsWith(TERM)) { @@ -528,6 +535,7 @@ class SimpleTextFieldsReader extends FieldsProducer { termCount++; } } + docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); @@ -566,6 +574,11 @@ class SimpleTextFieldsReader extends FieldsProducer { public long getSumDocFreq() throws IOException { return sumDocFreq; } + + @Override + public int getDocCount() throws IOException { + return docCount; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java index cb0a06d29f6..9427da66dbd 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java @@ -95,7 +95,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer { } @Override - public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { } @Override diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java index e80a5be0745..05bbbfbc3e0 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java @@ -196,7 +196,7 @@ class PreFlexFieldsWriter extends FieldsConsumer { } @Override - public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException { + public void finish(long sumTotalTermCount, long sumDocFreq, int docCount) throws IOException { } @Override diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index 7029c425346..b7389768ac7 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -99,6 +99,7 @@ public class TestExternalCodecs extends LuceneTestCase { final SortedMap termToDocs = new TreeMap(); long sumTotalTermFreq; long sumDocFreq; + int docCount; RAMField(String field) { this.field = field; @@ -118,6 +119,11 @@ public class TestExternalCodecs extends LuceneTestCase { public long getSumDocFreq() throws IOException { return sumDocFreq; } + + @Override + public int getDocCount() throws IOException { + return docCount; + } @Override public TermsEnum iterator() { @@ -206,9 +212,10 @@ public class TestExternalCodecs extends LuceneTestCase { } @Override - public void finish(long sumTotalTermFreq, long sumDocFreq) { + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) { field.sumTotalTermFreq = sumTotalTermFreq; field.sumDocFreq = sumDocFreq; + field.docCount = docCount; } } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index 8f2f84447e8..8db99ab0b9d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -42,6 +42,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.Version; import org.apache.lucene.util._TestUtil; import org.junit.BeforeClass; @@ -108,11 +109,15 @@ public class TestCodecs extends LuceneTestCase { final TermsConsumer termsConsumer = consumer.addField(fieldInfo); long sumTotalTermCount = 0; long sumDF = 0; + OpenBitSet visitedDocs = new OpenBitSet(); for (final TermData term : terms) { + for (int i = 0; i < term.docs.length; i++) { + visitedDocs.set(term.docs[i]); + } sumDF += term.docs.length; sumTotalTermCount += term.write(termsConsumer); } - termsConsumer.finish(sumTotalTermCount, sumDF); + termsConsumer.finish(sumTotalTermCount, sumDF, (int) visitedDocs.cardinality()); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestDocCount.java b/lucene/src/test/org/apache/lucene/index/TestDocCount.java new file mode 100644 index 00000000000..4c12aa03857 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/TestDocCount.java @@ -0,0 +1,83 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +/** + * Tests the Terms.docCount statistic + */ +public class TestDocCount extends LuceneTestCase { + public void testSimple() throws Exception { + assumeFalse("PreFlex codec does not support docCount statistic!", + "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec())); + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + int numDocs = atLeast(100); + for (int i = 0; i < numDocs; i++) { + iw.addDocument(doc()); + } + IndexReader ir = iw.getReader(); + verifyCount(ir); + ir.close(); + iw.optimize(); + ir = iw.getReader(); + verifyCount(ir); + ir.close(); + iw.close(); + dir.close(); + } + + private Document doc() { + Document doc = new Document(); + int numFields = _TestUtil.nextInt(random, 1, 10); + for (int i = 0; i < numFields; i++) { + doc.add(newField("" + _TestUtil.nextInt(random, 'a', 'z'), "" + _TestUtil.nextInt(random, 'a', 'z'), StringField.TYPE_UNSTORED)); + } + return doc; + } + + private void verifyCount(IndexReader ir) throws Exception { + Fields fields = MultiFields.getFields(ir); + if (fields == null) { + return; + } + FieldsEnum e = fields.iterator(); + String field; + while ((field = e.next()) != null) { + Terms terms = fields.terms(field); + int docCount = terms.getDocCount(); + FixedBitSet visited = new FixedBitSet(ir.maxDoc()); + TermsEnum te = terms.iterator(); + while (te.next() != null) { + DocsEnum de = te.docs(null, null); + while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + visited.set(de.docID()); + } + } + assertEquals(visited.cardinality(), docCount); + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/FieldCacheRewriteMethod.java b/lucene/src/test/org/apache/lucene/search/FieldCacheRewriteMethod.java index f0863b6eaf6..6da853497cf 100644 --- a/lucene/src/test/org/apache/lucene/search/FieldCacheRewriteMethod.java +++ b/lucene/src/test/org/apache/lucene/search/FieldCacheRewriteMethod.java @@ -134,6 +134,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod public long getSumDocFreq() throws IOException { return -1; } + + @Override + public int getDocCount() throws IOException { + return -1; + } }); assert termsEnum != null;