From 1ae1d6b4fab04fb41f23fa99834f74543229a478 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 8 Jul 2011 21:03:43 +0000 Subject: [PATCH] LUCENE-3290: add FieldInvertState.numUniqueTerms, Terms.sumDocFreq git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144513 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 7 ++ .../instantiated/InstantiatedIndexReader.java | 6 + .../lucene/index/memory/MemoryIndex.java | 6 + .../search/FieldCacheRewriteMethod.java | 5 + .../org/apache/lucene/index/CheckIndex.java | 10 +- .../apache/lucene/index/FieldInvertState.java | 9 ++ .../lucene/index/FilterIndexReader.java | 5 + .../index/FreqProxTermsWriterPerField.java | 8 +- .../org/apache/lucene/index/MultiTerms.java | 13 +++ .../java/org/apache/lucene/index/Terms.java | 7 ++ .../lucene/index/codecs/BlockTermsReader.java | 12 +- .../lucene/index/codecs/BlockTermsWriter.java | 5 +- .../lucene/index/codecs/TermsConsumer.java | 25 ++-- .../index/codecs/memory/MemoryCodec.java | 10 +- .../index/codecs/preflex/PreFlexFields.java | 5 + .../simpletext/SimpleTextFieldsReader.java | 7 ++ .../simpletext/SimpleTextFieldsWriter.java | 2 +- .../codecs/preflexrw/PreFlexFieldsWriter.java | 2 +- .../org/apache/lucene/TestExternalCodecs.java | 9 +- .../org/apache/lucene/index/TestCodecs.java | 4 +- .../apache/lucene/index/TestSumDocFreq.java | 101 ++++++++++++++++ .../lucene/index/TestUniqueTermCount.java | 108 ++++++++++++++++++ 22 files changed, 345 insertions(+), 21 deletions(-) create mode 100644 lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7d76af0b813..a4c3d2556f4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -421,6 +421,8 @@ New features * LUCENE-2862: Added TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir) +* LUCENE-3290: Added Terms.getSumDocFreq() (Mike McCandless, Robert Muir) + * LUCENE-3003: Added new expert class oal.index.DocTermsOrd, refactored from Solr's UnInvertedField, for accessing term ords for multi-valued fields, per document. This is similar to FieldCache in @@ -512,6 +514,11 @@ Bug fixes causing the file to sometimes be larger than it needed to be. (Mike McCandless) +New Features + +* LUCENE-3290: Added FieldInvertState.numUniqueTerms + (Mike McCandless, Robert Muir) + Optimizations * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 57aca7a109a..b8b10477a34 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -426,6 +426,12 @@ public class InstantiatedIndexReader extends IndexReader { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + // TODO: support this? 
+ @Override + public long getSumDocFreq() { + return -1; + } @Override public Comparator getComparator() { diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 0c93f3feedb..d6e7e09e50d 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -842,6 +842,12 @@ public class MemoryIndex { public long getSumTotalTermFreq() { return info.getSumTotalTermFreq(); } + + @Override + public long getSumDocFreq() throws IOException { + // each term has df=1 + return info.sortedTerms.length; + } }; } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java index 8539f855f9f..8a999d628db 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java @@ -128,6 +128,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod public long getSumTotalTermFreq() { return -1; } + + @Override + public long getSumDocFreq() throws IOException { + return -1; + } }); assert termsEnum != null; diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index a3fec4d2f58..4b0164f549d 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -691,7 +691,7 @@ public class CheckIndex { Comparator termComp = terms.getComparator(); long sumTotalTermFreq = 0; - + long sumDocFreq = 0; while(true) { final BytesRef term = terms.next(); @@ -712,6 +712,7 @@ public class CheckIndex { final int docFreq = terms.docFreq(); status.totFreq += docFreq; + sumDocFreq += docFreq; docs = terms.docs(liveDocs, docs); postings = terms.docsAndPositions(liveDocs, postings); @@ -879,6 +880,13 @@ public class CheckIndex { throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq); } } + + if (sumDocFreq != 0) { + final long v = fields.terms(field).getSumDocFreq(); + if (v != -1 && sumDocFreq != v) { + throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); + } + } // Test seek to last term: if (lastTerm != null) { diff --git a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java index 8c4e92ad4ea..0b54500c71d 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java @@ -31,6 +31,7 @@ public final class FieldInvertState { int numOverlap; int offset; int maxTermFrequency; + int uniqueTermCount; float boost; AttributeSource attributeSource; @@ -55,6 +56,7 @@ public final class FieldInvertState { numOverlap = 0; offset = 0; maxTermFrequency = 0; + uniqueTermCount = 0; boost = docBoost; attributeSource = null; } @@ -122,6 +124,13 @@ public final class FieldInvertState { return maxTermFrequency; } + /** + * Return the number of unique terms encountered in this field. 
+ */ + public int getUniqueTermCount() { + return uniqueTermCount; + } + public AttributeSource getAttributeSource() { return attributeSource; } diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java index e7d2190774a..d57591a6b15 100644 --- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -105,6 +105,11 @@ public class FilterIndexReader extends IndexReader { public long getSumTotalTermFreq() throws IOException { return in.getSumTotalTermFreq(); } + + @Override + public long getSumDocFreq() throws IOException { + return in.getSumDocFreq(); + } } /** Base class for filtering {@link TermsEnum} implementations. */ diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index a770c9011bf..ddc4e441e71 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -134,6 +134,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem writeProx(termID, fieldState.position); } fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); + fieldState.uniqueTermCount++; } @Override @@ -151,6 +152,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID]; postings.lastDocIDs[termID] = docState.docID; + fieldState.uniqueTermCount++; } } else { if (docState.docID != postings.lastDocIDs[termID]) { @@ -171,6 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocIDs[termID] = docState.docID; writeProx(termID, fieldState.position); + fieldState.uniqueTermCount++; } else { fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]); writeProx(termID, fieldState.position-postings.lastPositions[termID]); @@ -251,6 +254,8 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final ByteSliceReader prox = new ByteSliceReader(); long sumTotalTermFreq = 0; + long sumDocFreq = 0; + for (int i = 0; i < numTerms; i++) { final int termID = termIDs[i]; // Get BytesRef @@ -389,9 +394,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem } termsConsumer.finishTerm(text, new TermStats(numDocs, totTF)); sumTotalTermFreq += totTF; + sumDocFreq += numDocs; } - termsConsumer.finish(sumTotalTermFreq); + termsConsumer.finish(sumTotalTermFreq, sumDocFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java index 2da5db54df8..2785170dfb3 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -88,6 +88,19 @@ public final class MultiTerms extends Terms { } return sum; } + + @Override + public long getSumDocFreq() throws IOException { + long sum = 0; + for(Terms terms : subs) { + final long v = terms.getSumDocFreq(); + if (v == -1) { + return -1; + } + sum += v; + } + return sum; + } @Override public Comparator getComparator() { diff --git 
a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java index b7600c8a784..b7a27e0d0c9 100644 --- a/lucene/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/src/java/org/apache/lucene/index/Terms.java @@ -132,6 +132,13 @@ public abstract class Terms { * into account. */ public abstract long getSumTotalTermFreq() throws IOException; + /** Returns the sum of {@link #docFreq(BytesRef)} for + * all terms in this field, or -1 if this measure isn't + * stored by the codec. Note that, just like other term + * measures, this measure does not take deleted documents + * into account. */ + public abstract long getSumDocFreq() throws IOException; + /** * Returns a thread-private {@link TermsEnum} instance. Obtaining * {@link TermsEnum} from this method might be more efficient than using diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java index 19c280b0c7a..a970d57966e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java @@ -137,8 +137,9 @@ public class BlockTermsReader extends FieldsProducer { final long termsStartPointer = in.readVLong(); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong(); + final long sumDocFreq = in.readVLong(); assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq)); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq)); } success = true; } finally { @@ -245,13 +246,15 @@ public class BlockTermsReader extends FieldsProducer { final FieldInfo fieldInfo; final long termsStartPointer; final long sumTotalTermFreq; + final long sumDocFreq; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.termsStartPointer = termsStartPointer; this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; } @Override @@ -279,6 +282,11 @@ public class BlockTermsReader extends FieldsProducer { return sumTotalTermFreq; } + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + // Iterates through terms in this field private final class SegmentTermsEnum extends TermsEnum { private final IndexInput in; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java index 926a6af6301..89ab114919e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java @@ -132,6 +132,7 @@ public class BlockTermsWriter extends FieldsConsumer { if (!field.fieldInfo.omitTermFreqAndPositions) { out.writeVLong(field.sumTotalTermFreq); } + out.writeVLong(field.sumDocFreq); } } writeTrailer(dirStart); @@ -157,6 +158,7 @@ public class BlockTermsWriter extends FieldsConsumer { private long numTerms; private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; long sumTotalTermFreq; + long sumDocFreq; private TermEntry[] pendingTerms; @@ -231,7 +233,7 @@ public 
class BlockTermsWriter extends FieldsConsumer { // Finishes all terms in this field @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { if (pendingCount > 0) { flushBlock(); } @@ -239,6 +241,7 @@ public class BlockTermsWriter extends FieldsConsumer { out.writeVInt(0); this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; fieldIndexWriter.finish(out.getFilePointer()); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java index 875da3bbb63..01280154cac 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -41,7 +41,7 @@ public abstract class TermsConsumer { public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; /** Called when we are done adding terms to this field */ - public abstract void finish(long sumTotalTermFreq) throws IOException; + public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException; /** Return the BytesRef Comparator used to sort terms * before feeding to this API. */ @@ -56,7 +56,8 @@ public abstract class TermsConsumer { BytesRef term; assert termsEnum != null; long sumTotalTermFreq = 0; - long sumDF = 0; + long sumDocFreq = 0; + long sumDFsinceLastAbortCheck = 0; if (mergeState.fieldInfo.omitTermFreqAndPositions) { if (docsEnum == null) { @@ -74,10 +75,11 @@ public abstract class TermsConsumer { final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); if (stats.docFreq > 0) { finishTerm(term, stats); - sumDF += stats.docFreq; - if (sumDF > 60000) { - mergeState.checkAbort.work(sumDF/5.0); - sumDF = 0; + sumDFsinceLastAbortCheck += stats.docFreq; + sumDocFreq += stats.docFreq; + if (sumDFsinceLastAbortCheck > 60000) { + mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); + sumDFsinceLastAbortCheck = 0; } } } @@ -105,16 +107,17 @@ public abstract class TermsConsumer { if (stats.docFreq > 0) { finishTerm(term, stats); sumTotalTermFreq += stats.totalTermFreq; - sumDF += stats.docFreq; - if (sumDF > 60000) { - mergeState.checkAbort.work(sumDF/5.0); - sumDF = 0; + sumDFsinceLastAbortCheck += stats.docFreq; + sumDocFreq += stats.docFreq; + if (sumDFsinceLastAbortCheck > 60000) { + mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); + sumDFsinceLastAbortCheck = 0; } } } } } - finish(sumTotalTermFreq); + finish(sumTotalTermFreq, sumDocFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index dbe6265900a..e36994fe990 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -219,13 +219,14 @@ public class MemoryCodec extends Codec { } @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { if (termCount > 0) { out.writeVInt(termCount); out.writeVInt(field.number); if (!field.omitTermFreqAndPositions) { out.writeVLong(sumTotalTermFreq); } + out.writeVLong(sumDocFreq); builder.finish().save(out); if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); } @@ -683,6 +684,7 @@ public class MemoryCodec extends Codec { private final static 
class TermsReader extends Terms { private final long sumTotalTermFreq; + private final long sumDocFreq; private FST fst; private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); private final FieldInfo field; @@ -695,6 +697,7 @@ public class MemoryCodec extends Codec { } else { sumTotalTermFreq = 0; } + sumDocFreq = in.readVLong(); fst = new FST(in, outputs); } @@ -704,6 +707,11 @@ public class MemoryCodec extends Codec { return sumTotalTermFreq; } + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + @Override public TermsEnum iterator() { return new FSTTermsEnum(field, fst); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index ac3962d50e4..a994b468527 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -266,6 +266,11 @@ public class PreFlexFields extends FieldsProducer { public long getSumTotalTermFreq() { return -1; } + + @Override + public long getSumDocFreq() throws IOException { + return -1; + } } private class PreTermsEnum extends TermsEnum { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 34451939966..e2a37f6b199 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -463,6 +463,7 @@ class SimpleTextFieldsReader extends FieldsProducer { private final long termsStart; private final boolean omitTF; private long sumTotalTermFreq; + private long sumDocFreq; private FST>> fst; private int termCount; private final BytesRef scratch = new BytesRef(10); @@ -500,6 +501,7 @@ class SimpleTextFieldsReader extends FieldsProducer { break; } else if (scratch.startsWith(DOC)) { docFreq++; + sumDocFreq++; } else if (scratch.startsWith(POS)) { totalTermFreq++; } else if (scratch.startsWith(TERM)) { @@ -554,6 +556,11 @@ class SimpleTextFieldsReader extends FieldsProducer { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java index f822ec6a72c..49ed1f5c786 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java @@ -89,7 +89,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer { } @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { } @Override diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java index 4950cf97ea9..e10323146dd 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java @@ -195,7 +195,7 @@ class 
PreFlexFieldsWriter extends FieldsConsumer { } @Override - public void finish(long sumTotalTermCount) throws IOException { + public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException { } @Override diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index 50485853819..b3cebe05567 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -102,6 +102,7 @@ public class TestExternalCodecs extends LuceneTestCase { final String field; final SortedMap termToDocs = new TreeMap(); long sumTotalTermFreq; + long sumDocFreq; RAMField(String field) { this.field = field; @@ -116,6 +117,11 @@ public class TestExternalCodecs extends LuceneTestCase { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } @Override public TermsEnum iterator() { @@ -204,8 +210,9 @@ public class TestExternalCodecs extends LuceneTestCase { } @Override - public void finish(long sumTotalTermFreq) { + public void finish(long sumTotalTermFreq, long sumDocFreq) { field.sumTotalTermFreq = sumTotalTermFreq; + field.sumDocFreq = sumDocFreq; } } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index fb702291507..c655d8a6b03 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -101,10 +101,12 @@ public class TestCodecs extends LuceneTestCase { Arrays.sort(terms); final TermsConsumer termsConsumer = consumer.addField(fieldInfo); long sumTotalTermCount = 0; + long sumDF = 0; for (final TermData term : terms) { + sumDF += term.docs.length; sumTotalTermCount += term.write(termsConsumer); } - termsConsumer.finish(sumTotalTermCount); + termsConsumer.finish(sumTotalTermCount, sumDF); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java b/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java new file mode 100644 index 00000000000..92d579f70df --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java @@ -0,0 +1,101 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests {@link Terms#getSumDocFreq()}
+ * @lucene.experimental
+ */
+public class TestSumDocFreq extends LuceneTestCase {
+
+  public void testSumDocFreq() throws Exception {
+    final int numDocs = atLeast(500);
+
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir);
+
+    Document doc = new Document();
+    Field field1 = newField("foo", "", Field.Index.ANALYZED);
+    Field field2 = newField("bar", "", Field.Index.ANALYZED);
+    doc.add(field1);
+    doc.add(field2);
+    for (int i = 0; i < numDocs; i++) {
+      char ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
+      char ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
+      field1.setValue("" + ch1 + " " + ch2);
+      ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
+      ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
+      field2.setValue("" + ch1 + " " + ch2);
+      writer.addDocument(doc);
+    }
+
+    IndexReader ir = writer.getReader();
+    writer.close();
+
+    assertSumDocFreq(ir);
+    ir.close();
+
+    ir = IndexReader.open(dir, false);
+    int numDeletions = atLeast(20);
+    for (int i = 0; i < numDeletions; i++) {
+      ir.deleteDocument(random.nextInt(ir.maxDoc()));
+    }
+    ir.close();
+
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+    w.optimize();
+    w.close();
+
+    ir = IndexReader.open(dir, true);
+    assertSumDocFreq(ir);
+    ir.close();
+
+    dir.close();
+  }
+
+  private void assertSumDocFreq(IndexReader ir) throws Exception {
+    // compute sumDocFreq across all fields
+    Fields fields = MultiFields.getFields(ir);
+    FieldsEnum fieldEnum = fields.iterator();
+    String f = null;
+    while ((f = fieldEnum.next()) != null) {
+      Terms terms = fields.terms(f);
+      long sumDocFreq = terms.getSumDocFreq();
+      if (sumDocFreq == -1) {
+        if (VERBOSE) {
+          System.out.println("skipping field: " + f + ", codec does not support sumDocFreq");
+        }
+        continue;
+      }
+
+      long computedSumDocFreq = 0;
+      TermsEnum termsEnum = terms.iterator();
+      while (termsEnum.next() != null) {
+        computedSumDocFreq += termsEnum.docFreq();
+      }
+      assertEquals(computedSumDocFreq, sumDocFreq);
+    }
+  }
+}
diff --git a/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java
new file mode 100644
index 00000000000..3fd64fa4926
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java
@@ -0,0 +1,108 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashSet;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DefaultSimilarityProvider;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests the uniqueTermCount statistic in FieldInvertState
+ */
+public class TestUniqueTermCount extends LuceneTestCase {
+  Directory dir;
+  IndexReader reader;
+  /* expected uniqueTermCount values for our documents */
+  ArrayList<Integer> expected = new ArrayList<Integer>();
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = newDirectory();
+    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
+        new MockAnalyzer(random, MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy());
+    config.setSimilarityProvider(new DefaultSimilarityProvider() {
+      @Override
+      public Similarity get(String field) {
+        return new TestSimilarity();
+      }
+    });
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
+    Document doc = new Document();
+    Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(foo);
+    for (int i = 0; i < 100; i++) {
+      foo.setValue(addValue());
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    writer.close();
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    reader.close();
+    dir.close();
+    super.tearDown();
+  }
+
+  public void test() throws Exception {
+    byte fooNorms[] = MultiNorms.norms(reader, "foo");
+    for (int i = 0; i < reader.maxDoc(); i++)
+      assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
+  }
+
+  /**
+   * Makes a bunch of single-char tokens (so the number of unique terms is at most 26)
+   * and records the number of unique terms in expected, to be checked against the norm.
+   */
+  private String addValue() {
+    StringBuilder sb = new StringBuilder();
+    HashSet<String> terms = new HashSet<String>();
+    int num = _TestUtil.nextInt(random, 0, 255);
+    for (int i = 0; i < num; i++) {
+      sb.append(' ');
+      char term = (char) _TestUtil.nextInt(random, 'a', 'z');
+      sb.append(term);
+      terms.add("" + term);
+    }
+    expected.add(terms.size());
+    return sb.toString();
+  }
+
+  /**
+   * Simple similarity that encodes the unique term count directly as a byte
+   */
+  class TestSimilarity extends DefaultSimilarity {
+
+    @Override
+    public byte computeNorm(FieldInvertState state) {
+      return (byte) state.getUniqueTermCount();
+    }
+  }
+}
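
A minimal usage sketch for the new per-field statistic, mirroring what assertSumDocFreq() in TestSumDocFreq does: open a reader, look up the field's Terms, and read getSumDocFreq() next to getSumTotalTermFreq(). The Directory variable dir and the field name "body" are illustrative assumptions, not taken from the patch; a codec that does not store the statistic returns -1, and deleted documents are not taken into account.

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;

public class SumDocFreqExample {
  /** Prints the aggregated term statistics for the "body" field. */
  public static void printFieldStats(Directory dir) throws Exception {
    IndexReader reader = IndexReader.open(dir, true);
    try {
      Fields fields = MultiFields.getFields(reader);      // merged view over all segments
      Terms terms = fields == null ? null : fields.terms("body");
      if (terms != null) {
        long sumDocFreq = terms.getSumDocFreq();             // -1 if the codec does not store it
        long sumTotalTermFreq = terms.getSumTotalTermFreq(); // -1 if term frequencies are omitted
        System.out.println("body: sumDocFreq=" + sumDocFreq +
            ", sumTotalTermFreq=" + sumTotalTermFreq);
      }
    } finally {
      reader.close();
    }
  }
}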
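
FieldInvertState.getUniqueTermCount() can be read from Similarity.computeNorm() at indexing time, which is exactly what TestUniqueTermCount above exercises. A minimal sketch along the same lines, wiring a custom Similarity in through a SimilarityProvider on IndexWriterConfig; the class name UniqueTermCountSimilarity and the matchVersion and analyzer parameters are illustrative assumptions, not part of the patch.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.Version;

public class UniqueTermCountSimilarity extends DefaultSimilarity {

  @Override
  public byte computeNorm(FieldInvertState state) {
    // uniqueTermCount is reset for each (document, field) pair during inversion;
    // counts above 255 would overflow the single norm byte, so this is only a sketch.
    return (byte) state.getUniqueTermCount();
  }

  /** Builds a writer config that applies this similarity to every field. */
  public static IndexWriterConfig newConfig(Version matchVersion, Analyzer analyzer) {
    IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
    config.setSimilarityProvider(new DefaultSimilarityProvider() {
      @Override
      public Similarity get(String field) {
        return new UniqueTermCountSimilarity();
      }
    });
    return config;
  }
}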