diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a38c96aa61d..d51a1936ea0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -359,6 +359,9 @@ New features terms dict. This impl stores the indexed terms in an FST, which is much more RAM efficient than FixedGapTermsIndex. (Mike McCandless) +* LUCENE-2862: Added TermsEnum.totalTermFreq() and + Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir) + Optimizations * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching. diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java index 179d9c42c80..76faa4602bd 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java @@ -238,6 +238,10 @@ public class InstantiatedIndex while((text = termsEnum.next()) != null) { String termText = text.utf8ToString(); InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText); + final long totalTermFreq = termsEnum.totalTermFreq(); + if (totalTermFreq != -1) { + instantiatedTerm.addPositionsCount(totalTermFreq); + } getTermsByFieldAndText().get(field).put(termText, instantiatedTerm); instantiatedTerm.setTermIndex(terms.size()); terms.add(instantiatedTerm); diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 3b191b83fc5..7cece688d33 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends 
IndexReader { if (i < 0) { i = -i - 1; } - if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) { + if (i >= orderedTerms.length || orderedTerms[i].field() != field) { // field does not exist return null; } final int startLoc = i; + // TODO: heavy to do this here; would be better to + // do it up front & cache + long sum = 0; + int upto = i; + while(upto < orderedTerms.length && orderedTerms[upto].field() == field) { + sum += orderedTerms[upto].getTotalTermFreq(); + upto++; + } + final long sumTotalTermFreq = sum; + return new Terms() { @Override public TermsEnum iterator() { return new InstantiatedTermsEnum(orderedTerms, startLoc, field); } + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index f5cd26dfe31..95bb1f01cd4 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable { } associatedDocuments[associatedDocuments.length - 1] = info; term.setAssociatedDocuments(associatedDocuments); + term.addPositionsCount(positions.length); // todo optimize, only if term vector? 
informationByTermOfCurrentDocument.put(term, info); diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java index b893e04cd86..cdd2197c89c 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java @@ -45,6 +45,8 @@ public class InstantiatedTerm private Term term; + private long totalTermFreq; + /** * index of term in InstantiatedIndex * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */ @@ -92,6 +94,14 @@ public class InstantiatedTerm this.associatedDocuments = associatedDocuments; } + void addPositionsCount(long count) { + totalTermFreq += count; + } + + public long getTotalTermFreq() { + return totalTermFreq; + } + /** * Finds index to the first beyond the current whose document number is * greater than or equal to target, -1 if there is no such element. 
diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java index 0dd4ffc2803..8628d005b60 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java @@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.codecs.PrefixCodedTermState; import java.io.IOException; import java.util.Arrays; @@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum { return terms[upto].getAssociatedDocuments().length; } + @Override + public long totalTermFreq() { + final long v = terms[upto].getTotalTermFreq(); + return v == 0 ? 
-1 : v; + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) { diff --git a/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java b/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java index 1ad0743b635..ae091b5ec6e 100644 --- a/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java +++ b/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java @@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase { // create dir data IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer())); + for (int i = 0; i < 20; i++) { Document document = new Document(); assembleDocument(document, i); @@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase { } assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq()); + final long totalTermFreq = aprioriTermEnum.totalTermFreq(); + if (totalTermFreq != -1) { + assertEquals(totalTermFreq, testTermEnum.totalTermFreq()); + } // compare termDocs seeking diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index cd1df94a9c7..88049654f46 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable { /** Term for this field's fieldName, lazily computed on demand */ public transient Term template; + private final long sumTotalTermFreq; + private static final long serialVersionUID = 2882195016849084649L; public Info(HashMap terms, int numTokens, int numOverlapTokens, float boost) { @@ -617,6 +619,15 @@ public 
class MemoryIndex implements Serializable { this.numTokens = numTokens; this.numOverlapTokens = numOverlapTokens; this.boost = boost; + long sum = 0; + for(Map.Entry ent : terms.entrySet()) { + sum += ent.getValue().size(); + } + sumTotalTermFreq = sum; + } + + public long getSumTotalTermFreq() { + return sumTotalTermFreq; } /** @@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable { public long getUniqueTermCount() { return info.sortedTerms.length; } + + @Override + public long getSumTotalTermFreq() { + return info.getSumTotalTermFreq(); + } }; } } @@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable { return 1; } + @Override + public long totalTermFreq() { + return info.sortedTerms[termUpto].getValue().size(); + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { if (reuse == null || !(reuse instanceof MemoryDocsEnum)) { diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java index c0ac082f293..77d29820660 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java @@ -176,15 +176,34 @@ public class HighFreqTerms { return ts; } - public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception { - BytesRef br = termtext; + public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception { + long totalTF = 0; - Bits skipDocs = MultiFields.getDeletedDocs(reader); - DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br); - // if term is not in index return totalTF of 0 - if (de == null) { + + Terms terms = MultiFields.getTerms(reader, field); + if (terms == null) { return 0; } + + TermsEnum termsEnum = terms.iterator(); + if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) { + return 0; + } + + Bits skipDocs = 
MultiFields.getDeletedDocs(reader); + if (skipDocs == null) { + // TODO: we could do this up front, during the scan + // (next()), instead of after-the-fact here w/ seek, + // if the codec supports it and there are no del + // docs... + final long totTF = termsEnum.totalTermFreq(); + if (totTF != -1) { + return totTF; + } + } + + DocsEnum de = termsEnum.docs(skipDocs, null); + // use DocsEnum.read() and BulkResult api final DocsEnum.BulkReadResult bulkresult = de.getBulkResult(); int count; diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java index b28fbdde436..08c4a808367 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java @@ -41,4 +41,9 @@ public final class TermStats { String getTermText() { return termtext.utf8ToString(); } + + @Override + public String toString() { + return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq); + } } \ No newline at end of file diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java index 4f164e72b7b..cb33cfa8be9 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java @@ -17,15 +17,16 @@ package org.apache.lucene.misc; * limitations under the License. 
*/ -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase { writer = new IndexWriter(dir, newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)) .setMaxBufferedDocs(2)); + writer.setInfoStream(VERBOSE ? System.out : null); indexDocs(writer); reader = IndexReader.open(dir, true); + _TestUtil.checkIndex(dir); } @AfterClass @@ -75,8 +78,8 @@ public class TestHighFreqTerms extends LuceneTestCase { String field="FIELD_1"; TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); for (int i = 0; i < terms.length; i++) { - if (i >0){ - assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq); + if (i > 0) { + assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq); } } } @@ -134,11 +137,12 @@ public class TestHighFreqTerms extends LuceneTestCase { TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms); - for (int i = 0; i < termsWithTF.length; i++) { - // check that they are sorted by descending termfreq order - if (i >0){ - assertTrue ("out of 
order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq); - } + for (int i = 0; i < termsWithTF.length; i++) { + // check that they are sorted by descending termfreq + // order + if (i > 0) { + assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq); + } } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java index 5e83c75bc9e..8539f855f9f 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java @@ -123,7 +123,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod public TermsEnum iterator() throws IOException { return fcsi.getTermsEnum(); } - + + @Override + public long getSumTotalTermFreq() { + return -1; + } }); assert termsEnum != null; diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index e825c670e29..b91ec32ffd5 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -610,6 +610,8 @@ public class CheckIndex { Comparator termComp = terms.getComparator(); + long sumTotalTermFreq = 0; + while(true) { final BytesRef term = terms.next(); @@ -660,6 +662,8 @@ public class CheckIndex { } int lastDoc = -1; + int docCount = 0; + long totalTermFreq = 0; while(true) { final int doc = docs2.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { @@ -667,6 +671,8 @@ public class CheckIndex { } final int freq = docs2.freq(); status.totPos += freq; + totalTermFreq += freq; + docCount++; if (doc <= lastDoc) { throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); @@ 
-697,22 +703,39 @@ public class CheckIndex { } } } + + final long totalTermFreq2 = terms.totalTermFreq(); + final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1; - // Now count how many deleted docs occurred in - // this term: - + // Re-count if there are deleted docs: if (reader.hasDeletions()) { final DocsEnum docsNoDel = terms.docs(null, docs); - int count = 0; + docCount = 0; + totalTermFreq = 0; while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - count++; + docCount++; + totalTermFreq += docsNoDel.freq(); } - if (count != docFreq) { - throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count); + } + + if (docCount != docFreq) { + throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount); + } + if (hasTotalTermFreq) { + sumTotalTermFreq += totalTermFreq; + if (totalTermFreq != totalTermFreq2) { + throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq); } } } + if (sumTotalTermFreq != 0) { + final long v = fields.terms(field).getSumTotalTermFreq(); + if (v != -1 && sumTotalTermFreq != v) { + throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq); + } + } + // Test seek to last term: if (lastTerm != null) { if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) { diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java index ffe82b017dc..a5e830fdeaf 100644 --- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader { public long getUniqueTermCount() throws IOException { return in.getUniqueTermCount(); } + + @Override + public long getSumTotalTermFreq() 
throws IOException { + return in.getSumTotalTermFreq(); + } } /** Base class for filtering {@link TermsEnum} implementations. */ @@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader { return in.docFreq(); } + @Override + public long totalTermFreq() { + return in.totalTermFreq(); + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { return in.docs(skipDocs, reuse); diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index d749d24b555..7d9df0a661d 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -20,13 +20,14 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.List; import java.util.Map; -import java.util.Comparator; -import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; @@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer { // multiple threads and interacting with the // TermsConsumer, only calling out to us (passing us the // DocsConsumer) to handle delivery of docs/positions + long sumTotalTermFreq = 0; while(numFields > 0) { // Get the next term to merge @@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer { // which all share the same term. Now we must // interleave the docID streams. 
int numDocs = 0; + long totTF = 0; while(numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer { // omitTermFreqAndPositions == false so we do write positions & // payload int position = 0; + totTF += termDocFreq; for(int j=0;j> 1; @@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer { } assert numDocs > 0; - termsConsumer.finishTerm(text, numDocs); + termsConsumer.finishTerm(text, new TermStats(numDocs, totTF)); + sumTotalTermFreq += totTF; } - termsConsumer.finish(); + termsConsumer.finish(sumTotalTermFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index 95f4977f177..eb953c687ba 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable { return terms.docFreq(term); } + /** Returns the total number of occurrences of the given + * term across all documents. This method returns 0 if the term or + * field does not exist. This method does not take into + * account deleted documents that have not yet been merged + * away. 
*/ + public long totalTermFreq(String field, BytesRef term) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return 0; + } + final Terms terms = fields.terms(field); + if (terms == null) { + return 0; + } + return terms.totalTermFreq(term); + } + /** This may return null if the field does not exist.*/ public Terms terms(String field) throws IOException { final Fields fields = fields(); diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java index 4e265c056e6..2da5db54df8 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -76,6 +76,19 @@ public final class MultiTerms extends Terms { } } + @Override + public long getSumTotalTermFreq() throws IOException { + long sum = 0; + for(Terms terms : subs) { + final long v = terms.getSumTotalTermFreq(); + if (v == -1) { + return -1; + } + sum += v; + } + return sum; + } + @Override public Comparator getComparator() { return termComp; diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java index 1f4794f3f9f..ded70270901 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum { return sum; } + @Override + public long totalTermFreq() { + long sum = 0; + for(int i=0;i= 0; - final long termsStartPointer = in.readLong(); + final long termsStartPointer = in.readVLong(); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - if (numTerms > 0) { - assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer)); - } + final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? 
-1 : in.readVLong(); + assert !fields.containsKey(fieldInfo.name); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq)); } success = true; } finally { @@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer { final long numTerms; final FieldInfo fieldInfo; final long termsStartPointer; + final long sumTotalTermFreq; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.termsStartPointer = termsStartPointer; + this.sumTotalTermFreq = sumTotalTermFreq; } @Override @@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer { return numTerms; } + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + // Iterates through terms in this field, not supporting ord() private final class SegmentTermsEnum extends TermsEnum { private final IndexInput in; @@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer { bytesReader = new DeltaBytesReader(in); fieldTerm.field = fieldInfo.name; state = postingsReader.newTermState(); + state.totalTermFreq = -1; state.ord = -1; } @@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer { state.docFreq = (in.readVInt() << 6) | (b & 0x3F); } + if (!fieldInfo.omitTermFreqAndPositions) { + state.totalTermFreq = state.docFreq + in.readVLong(); + } + postingsReader.readTerm(in, fieldInfo, state, isIndexTerm); @@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer { return state.docFreq; } + @Override + public long totalTermFreq() { + return state.totalTermFreq; + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); diff --git 
a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java index fa87f7f19d3..2be4ce1834d 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java @@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { final FieldInfos fieldInfos; FieldInfo currentField; private final TermsIndexWriterBase termsIndexWriter; - private final List fields = new ArrayList(); + private final List fields = new ArrayList(); private final Comparator termComp; public PrefixCodedTermsWriter( @@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { assert currentField == null || currentField.name.compareTo(field.name) < 0; currentField = field; TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field); - TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); + final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); fields.add(terms); return terms; } @@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { public void close() throws IOException { try { - final int fieldCount = fields.size(); + + int nonZeroCount = 0; + for(TermsWriter field : fields) { + if (field.numTerms > 0) { + nonZeroCount++; + } + } final long dirStart = out.getFilePointer(); - out.writeInt(fieldCount); - for(int i=0;i 0) { + out.writeVInt(field.fieldInfo.number); + out.writeVLong(field.numTerms); + out.writeVLong(field.termsStartPointer); + if (!field.fieldInfo.omitTermFreqAndPositions) { + out.writeVLong(field.sumTotalTermFreq); + } + } } writeTrailer(dirStart); } finally { @@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { private final long termsStartPointer; private long numTerms; private final TermsIndexWriterBase.FieldWriter 
fieldIndexWriter; + long sumTotalTermFreq; TermsWriter( TermsIndexWriterBase.FieldWriter fieldIndexWriter, @@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { } @Override - public void finishTerm(BytesRef text, int numDocs) throws IOException { + public void finishTerm(BytesRef text, TermStats stats) throws IOException { - assert numDocs > 0; + assert stats.docFreq > 0; //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer()); - final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs); + final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats); termWriter.write(text); final int highBit = isIndexTerm ? 0x80 : 0; @@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer { // This is a vInt, except, we steal top bit to record // whether this was an indexed term: - if ((numDocs & ~0x3F) == 0) { + if ((stats.docFreq & ~0x3F) == 0) { // Fast case -- docFreq fits in 6 bits - out.writeByte((byte) (highBit | numDocs)); + out.writeByte((byte) (highBit | stats.docFreq)); } else { // Write bottom 6 bits of docFreq, then write the // remainder as vInt: - out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F))); - out.writeVInt(numDocs >>> 6); + out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F))); + out.writeVInt(stats.docFreq >>> 6); } - postingsWriter.finishTerm(numDocs, isIndexTerm); + if (!fieldInfo.omitTermFreqAndPositions) { + assert stats.totalTermFreq >= stats.docFreq; + out.writeVLong(stats.totalTermFreq - stats.docFreq); + } + postingsWriter.finishTerm(stats, isIndexTerm); numTerms++; } // Finishes all terms in this field @Override - public void finish() throws IOException { + public void finish(long sumTotalTermFreq) throws IOException { // EOF marker: + this.sumTotalTermFreq = sumTotalTermFreq; out.writeVInt(DeltaBytesWriter.TERM_EOF); fieldIndexWriter.finish(); } diff --git 
a/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java b/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java new file mode 100644 index 00000000000..bb2b6f34d27 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java @@ -0,0 +1,28 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TermStats { + public final int docFreq; + public final long totalTermFreq; + + public TermStats(int docFreq, long totalTermFreq) { + this.docFreq = docFreq; + this.totalTermFreq = totalTermFreq; + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java index 48fc7e01660..7c9095ec756 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -38,10 +38,10 @@ public abstract class TermsConsumer { public abstract PostingsConsumer startTerm(BytesRef text) throws IOException; /** Finishes the current term; numDocs must be > 0. 
*/ - public abstract void finishTerm(BytesRef text, int numDocs) throws IOException; + public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; /** Called when we are done adding terms to this field */ - public abstract void finish() throws IOException; + public abstract void finish(long sumTotalTermFreq) throws IOException; /** Return the BytesRef Comparator used to sort terms * before feeding to this API. */ @@ -55,6 +55,7 @@ public abstract class TermsConsumer { BytesRef term; assert termsEnum != null; + long sumTotalTermFreq = 0; if (mergeState.fieldInfo.omitTermFreqAndPositions) { if (docsEnum == null) { @@ -69,9 +70,9 @@ public abstract class TermsConsumer { if (docsEnumIn != null) { docsEnum.reset(docsEnumIn); final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, docsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); + if (stats.docFreq > 0) { + finishTerm(term, stats); } } } @@ -94,14 +95,15 @@ public abstract class TermsConsumer { } } final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, postingsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum); + if (stats.docFreq > 0) { + finishTerm(term, stats); + sumTotalTermFreq += stats.totalTermFreq; } } } } - finish(); + finish(sumTotalTermFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java index e74cd1a52d0..37985013951 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java @@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase { public abstract void setTermsOutput(IndexOutput 
out); public abstract class FieldWriter { - public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; + public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException; public abstract void finish() throws IOException; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java index 12195e813ae..059bf573bec 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java @@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { public static abstract class IndexTermSelector { // Called sequentially on every term being written, // returning true if this term should be indexed - public abstract boolean isIndexTerm(BytesRef term, int docFreq); + public abstract boolean isIndexTerm(BytesRef term, TermStats stats); } /** Same policy as {@link FixedGapTermsIndexWriter} */ @@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { } @Override - public boolean isIndexTerm(BytesRef term, int docFreq) { + public boolean isIndexTerm(BytesRef term, TermStats stats) { if (count >= interval) { count = 0; return true; @@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { } @Override - public boolean isIndexTerm(BytesRef term, int docFreq) { - if (docFreq >= docFreqThresh || count >= interval) { + public boolean isIndexTerm(BytesRef term, TermStats stats) { + if (stats.docFreq >= docFreqThresh || count >= interval) { count = 0; return true; } else { @@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { } @Override - public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { - if (policy.isIndexTerm(text, docFreq) || first) { + public boolean 
checkIndexTerm(BytesRef text, TermStats stats) throws IOException { + if (policy.isIndexTerm(text, stats) || first) { first = false; //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer()); final int lengthSave = text.length; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index 3f54b0b9037..fe90eac93be 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.CompoundFileReader; @@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer { return BytesRef.getUTF8SortedAsUTF16Comparator(); } } + + @Override + public long getSumTotalTermFreq() { + return -1; + } } private class PreTermsEnum extends TermsEnum { @@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer { return termEnum.docFreq(); } + @Override + public long totalTermFreq() { + return -1; + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { PreDocsEnum docsEnum; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java index 32f647d0adf..a018122c567 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java @@ -21,6 +21,7 @@ import java.io.IOException; import 
org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; @@ -177,7 +178,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { /** Called when we are done adding docs to this term */ @Override - public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException { //System.out.println("PW finishTerm docCount=" + docCount); assert pendingCount > 0 || pendingCount == -1; @@ -186,7 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { if (pendingCount == -1) { termsOut.writeByte((byte) 0); - wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm); + wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm); pendingIsIndexTerm = false; } else { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java index 6be97d22f9d..7e1fb17787c 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; @@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { /** Called when we are done adding docs to this term */ @Override - public void finishTerm(int docCount, boolean 
isIndexTerm) throws IOException { + public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException { // TODO: -- wasteful we are counting this in two places? - assert docCount > 0; - assert docCount == df; + assert stats.docFreq > 0; + assert stats.docFreq == df; docIndex.write(termsOut, isIndexTerm); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 673f2261335..e40fba2f44e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.FieldsEnum; -import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; @@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer { private final IndexInput in; private final boolean omitTF; private int docFreq; + private long totalTermFreq; private long docsStart; private boolean ended; - private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum; + private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum; - public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException { + public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException { this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); this.omitTF = omitTF; - fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst); + fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst); } public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { //System.out.println("seek to text=" + text.utf8ToString()); - final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text); + final 
BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text); if (result == null) { //System.out.println(" end"); return SeekStatus.END; } else { //System.out.println(" got text=" + term.utf8ToString()); - PairOutputs.Pair<Long,Long> pair = result.output; - docsStart = pair.output1; - docFreq = pair.output2.intValue(); + PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output; + PairOutputs.Pair<Long,Long> pair2 = pair1.output2; + docsStart = pair1.output1; + docFreq = pair2.output1.intValue(); + totalTermFreq = pair2.output2; if (result.input.equals(text)) { //System.out.println(" match docsStart=" + docsStart); @@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer { @Override public BytesRef next() throws IOException { assert !ended; - final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next(); + final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next(); if (result != null) { - final PairOutputs.Pair<Long,Long> pair = result.output; - docsStart = pair.output1; - docFreq = pair.output2.intValue(); + PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output; + PairOutputs.Pair<Long,Long> pair2 = pair1.output2; + docsStart = pair1.output1; + docFreq = pair2.output1.intValue(); + totalTermFreq = pair2.output2; return result.input; } else { return null; @@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer { return docFreq; } + @Override + public long totalTermFreq() { + return totalTermFreq; + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { SimpleTextDocsEnum docsEnum; @@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer { private class SimpleTextTerms extends Terms { private final long termsStart; private final boolean omitTF; - private FST<PairOutputs.Pair<Long,Long>> fst; - + private long sumTotalTermFreq; + private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst; + private int termCount; private final BytesRef scratch = new BytesRef(10); public SimpleTextTerms(String field, long termsStart) throws IOException { @@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer { private void 
loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); - Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)); + final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b; + b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, + 0, + 0, + true, + new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs, + new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs))); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; + long totalTermFreq = 0; while(true) { readLine(in, scratch); if (scratch.equals(END) || scratch.startsWith(FIELD)) { if (lastDocsStart != -1) { - b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq))); + b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, + new PairOutputs.Pair<Long,Long>((long) docFreq, + posIntOutputs.get(totalTermFreq)))); + sumTotalTermFreq += totalTermFreq; } break; } else if (scratch.startsWith(DOC)) { docFreq++; + } else if (scratch.startsWith(POS)) { + totalTermFreq++; } else if (scratch.startsWith(TERM)) { if (lastDocsStart != -1) { - b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq))); + b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, + new PairOutputs.Pair<Long,Long>((long) docFreq, + posIntOutputs.get(totalTermFreq)))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; @@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer { System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; + sumTotalTermFreq += totalTermFreq; + totalTermFreq = 0; + termCount++; } } fst = b.finish(); @@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer { public Comparator<BytesRef> getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } + + @Override + public long getUniqueTermCount() { + return (long) termCount; + } + + @Override + public long 
getSumTotalTermFreq() { + return sumTotalTermFreq; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java index ae6338943e0..128da45c9b7 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java @@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.IndexOutput; @@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer { } @Override - public void finishTerm(BytesRef term, int numDocs) throws IOException { + public void finishTerm(BytesRef term, TermStats stats) throws IOException { } @Override - public void finish() throws IOException { + public void finish(long sumTotalTermFreq) throws IOException { } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java index 6dafdcda728..8f580e57282 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.util.BytesRef; import 
org.apache.lucene.util.CodecUtil; @@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase { /** Called when we are done adding docs to this term */ @Override - public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { - assert docCount > 0; + public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException { + assert stats.docFreq > 0; // TODO: wasteful we are counting this (counting # docs // for this term) in two places? - assert docCount == df; + assert stats.docFreq == df; if (isIndexTerm) { // Write absolute at seek points diff --git a/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java index 7353b9c3cdf..3415c9057d0 100644 --- a/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java @@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum { return tenum.docFreq(); } + @Override + public long totalTermFreq() { + return tenum.totalTermFreq(); + } + /** This enum does not support seeking! 
* @throws UnsupportedOperationException */ diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index f8e67b267f3..c18d7643470 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -244,6 +244,11 @@ public final class FuzzyTermsEnum extends TermsEnum { public int docFreq() { return actualEnum.docFreq(); } + + @Override + public long totalTermFreq() { + return actualEnum.totalTermFreq(); + } @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { diff --git a/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java b/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java index 314a3a2da72..61e80998b57 100644 --- a/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java +++ b/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java @@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.codecs.PrefixCodedTermState; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache.DocTermsIndex; import org.apache.lucene.util.ArrayUtil; @@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions throw new UnsupportedOperationException(); } + @Override + public long totalTermFreq() { + return -1; + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { throw new UnsupportedOperationException(); diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index 0b97c1bef31..a1ecf79ac68 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ 
b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase { static class RAMField extends Terms { final String field; final SortedMap termToDocs = new TreeMap(); + long sumTotalTermFreq; + RAMField(String field) { this.field = field; } @@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase { return termToDocs.size(); } + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + @Override public TermsEnum iterator() { return new RAMTermsEnum(RAMOnlyCodec.RAMField.this); @@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase { static class RAMTerm { final String term; + long totalTermFreq; final List docs = new ArrayList(); public RAMTerm(String term) { this.term = term; @@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase { } @Override - public void finishTerm(BytesRef text, int numDocs) { - assert numDocs > 0; - assert numDocs == current.docs.size(); + public void finishTerm(BytesRef text, TermStats stats) { + assert stats.docFreq > 0; + assert stats.docFreq == current.docs.size(); + current.totalTermFreq = stats.totalTermFreq; field.termToDocs.put(current.term, current); } @Override - public void finish() { + public void finish(long sumTotalTermFreq) { + field.sumTotalTermFreq = sumTotalTermFreq; } } @@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase { } @Override + public long totalTermFreq() { + return ramField.termToDocs.get(current).totalTermFreq; + } + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index f21baadca49..cabf1d330d7 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -30,6 +30,7 
@@ import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.TermStats; import org.apache.lucene.index.codecs.mocksep.MockSepCodec; import org.apache.lucene.index.codecs.preflex.PreFlexCodec; import org.apache.lucene.search.DocIdSetIterator; @@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase { public void write(final FieldsConsumer consumer) throws Throwable { Arrays.sort(terms); final TermsConsumer termsConsumer = consumer.addField(fieldInfo); - for (final TermData term : terms) - term.write(termsConsumer); - termsConsumer.finish(); + long sumTotalTermCount = 0; + for (final TermData term : terms) { + sumTotalTermCount += term.write(termsConsumer); + } + termsConsumer.finish(sumTotalTermCount); } } @@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase { return text.compareTo(((TermData) o).text); } - public void write(final TermsConsumer termsConsumer) throws Throwable { + public long write(final TermsConsumer termsConsumer) throws Throwable { final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); + long totTF = 0; for(int i=0;i 0) { + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + if (stats.docFreq > 0) { long skipPointer = skipListWriter.writeSkip(freqOut); - termInfo.docFreq = numDocs; + termInfo.docFreq = stats.docFreq; termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer); //System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number); termsOut.add(fieldInfo.number, @@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer { } @Override - public void finish() throws IOException { + public void finish(long sumTotalTermCount) throws IOException { } @Override diff --git a/solr/src/java/org/apache/solr/request/UnInvertedField.java 
b/solr/src/java/org/apache/solr/request/UnInvertedField.java index d724961cb22..79c909bea21 100755 --- a/solr/src/java/org/apache/solr/request/UnInvertedField.java +++ b/solr/src/java/org/apache/solr/request/UnInvertedField.java @@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum { return tenum.docFreq(); } + @Override + public long totalTermFreq() { + return tenum.totalTermFreq(); + } public BytesRef skipTo(BytesRef target) throws IOException {