LUCENE-2862: add TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq()

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059344 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-01-15 14:42:37 +00:00
parent f5d5dda6c6
commit a0c82b0f41
45 changed files with 511 additions and 126 deletions

View File

@ -359,6 +359,9 @@ New features
terms dict. This impl stores the indexed terms in an FST, which is terms dict. This impl stores the indexed terms in an FST, which is
much more RAM efficient than FixedGapTermsIndex. (Mike McCandless) much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
* LUCENE-2862: Added TermsEnum.totalTermFreq() and
Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
Optimizations Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching. * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.

View File

@ -238,6 +238,10 @@ public class InstantiatedIndex
while((text = termsEnum.next()) != null) { while((text = termsEnum.next()) != null) {
String termText = text.utf8ToString(); String termText = text.utf8ToString();
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText); InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
final long totalTermFreq = termsEnum.totalTermFreq();
if (totalTermFreq != -1) {
instantiatedTerm.addPositionsCount(totalTermFreq);
}
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm); getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
instantiatedTerm.setTermIndex(terms.size()); instantiatedTerm.setTermIndex(terms.size());
terms.add(instantiatedTerm); terms.add(instantiatedTerm);

View File

@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends IndexReader {
if (i < 0) { if (i < 0) {
i = -i - 1; i = -i - 1;
} }
if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) { if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
// field does not exist // field does not exist
return null; return null;
} }
final int startLoc = i; final int startLoc = i;
// TODO: heavy to do this here; would be better to
// do it up front & cache
long sum = 0;
int upto = i;
while(upto < orderedTerms.length && orderedTerms[i].field() == field) {
sum += orderedTerms[i].getTotalTermFreq();
upto++;
}
final long sumTotalTermFreq = sum;
return new Terms() { return new Terms() {
@Override @Override
public TermsEnum iterator() { public TermsEnum iterator() {
return new InstantiatedTermsEnum(orderedTerms, startLoc, field); return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
} }
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator(); return BytesRef.getUTF8SortedAsUnicodeComparator();

View File

@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable {
} }
associatedDocuments[associatedDocuments.length - 1] = info; associatedDocuments[associatedDocuments.length - 1] = info;
term.setAssociatedDocuments(associatedDocuments); term.setAssociatedDocuments(associatedDocuments);
term.addPositionsCount(positions.length);
// todo optimize, only if term vector? // todo optimize, only if term vector?
informationByTermOfCurrentDocument.put(term, info); informationByTermOfCurrentDocument.put(term, info);

View File

@ -45,6 +45,8 @@ public class InstantiatedTerm
private Term term; private Term term;
private long totalTermFreq;
/** /**
* index of term in InstantiatedIndex * index of term in InstantiatedIndex
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */ * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
@ -92,6 +94,14 @@ public class InstantiatedTerm
this.associatedDocuments = associatedDocuments; this.associatedDocuments = associatedDocuments;
} }
void addPositionsCount(long count) {
totalTermFreq += count;
}
public long getTotalTermFreq() {
return totalTermFreq;
}
/** /**
* Finds index to the first beyond the current whose document number is * Finds index to the first beyond the current whose document number is
* greater than or equal to <i>target</i>, -1 if there is no such element. * greater than or equal to <i>target</i>, -1 if there is no such element.

View File

@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum {
return terms[upto].getAssociatedDocuments().length; return terms[upto].getAssociatedDocuments().length;
} }
@Override
public long totalTermFreq() {
final long v = terms[upto].getTotalTermFreq();
return v == 0 ? -1 : v;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) { if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {

View File

@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase {
// create dir data // create dir data
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig( IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer())); TEST_VERSION_CURRENT, new MockAnalyzer()));
for (int i = 0; i < 20; i++) { for (int i = 0; i < 20; i++) {
Document document = new Document(); Document document = new Document();
assembleDocument(document, i); assembleDocument(document, i);
@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase {
} }
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq()); assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
final long totalTermFreq = aprioriTermEnum.totalTermFreq();
if (totalTermFreq != -1) {
assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
}
// compare termDocs seeking // compare termDocs seeking

View File

@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable {
/** Term for this field's fieldName, lazily computed on demand */ /** Term for this field's fieldName, lazily computed on demand */
public transient Term template; public transient Term template;
private final long sumTotalTermFreq;
private static final long serialVersionUID = 2882195016849084649L; private static final long serialVersionUID = 2882195016849084649L;
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) { public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
@ -617,6 +619,15 @@ public class MemoryIndex implements Serializable {
this.numTokens = numTokens; this.numTokens = numTokens;
this.numOverlapTokens = numOverlapTokens; this.numOverlapTokens = numOverlapTokens;
this.boost = boost; this.boost = boost;
long sum = 0;
for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
sum += ent.getValue().size();
}
sumTotalTermFreq = sum;
}
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
} }
/** /**
@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable {
public long getUniqueTermCount() { public long getUniqueTermCount() {
return info.sortedTerms.length; return info.sortedTerms.length;
} }
@Override
public long getSumTotalTermFreq() {
return info.getSumTotalTermFreq();
}
}; };
} }
} }
@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable {
return 1; return 1;
} }
@Override
public long totalTermFreq() {
return info.sortedTerms[termUpto].getValue().size();
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) { if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {

View File

@ -176,15 +176,34 @@ public class HighFreqTerms {
return ts; return ts;
} }
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception { public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {
BytesRef br = termtext;
long totalTF = 0; long totalTF = 0;
Bits skipDocs = MultiFields.getDeletedDocs(reader);
DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br); Terms terms = MultiFields.getTerms(reader, field);
// if term is not in index return totalTF of 0 if (terms == null) {
if (de == null) {
return 0; return 0;
} }
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
return 0;
}
Bits skipDocs = MultiFields.getDeletedDocs(reader);
if (skipDocs == null) {
// TODO: we could do this up front, during the scan
// (next()), instead of after-the-fact here w/ seek,
// if the codec supports it and there are no del
// docs...
final long totTF = termsEnum.totalTermFreq();
if (totTF != -1) {
return totTF;
}
}
DocsEnum de = termsEnum.docs(skipDocs, null);
// use DocsEnum.read() and BulkResult api // use DocsEnum.read() and BulkResult api
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult(); final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
int count; int count;

View File

@ -41,4 +41,9 @@ public final class TermStats {
String getTermText() { String getTermText() {
return termtext.utf8ToString(); return termtext.utf8ToString();
} }
@Override
public String toString() {
return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
}
} }

View File

@ -17,15 +17,16 @@ package org.apache.lucene.misc;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
writer = new IndexWriter(dir, newIndexWriterConfig(random, writer = new IndexWriter(dir, newIndexWriterConfig(random,
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)) TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
.setMaxBufferedDocs(2)); .setMaxBufferedDocs(2));
writer.setInfoStream(VERBOSE ? System.out : null);
indexDocs(writer); indexDocs(writer);
reader = IndexReader.open(dir, true); reader = IndexReader.open(dir, true);
_TestUtil.checkIndex(dir);
} }
@AfterClass @AfterClass
@ -75,8 +78,8 @@ public class TestHighFreqTerms extends LuceneTestCase {
String field="FIELD_1"; String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
for (int i = 0; i < terms.length; i++) { for (int i = 0; i < terms.length; i++) {
if (i >0){ if (i > 0) {
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq); assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
} }
} }
} }
@ -134,11 +137,12 @@ public class TestHighFreqTerms extends LuceneTestCase {
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms); TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
for (int i = 0; i < termsWithTF.length; i++) { for (int i = 0; i < termsWithTF.length; i++) {
// check that they are sorted by descending termfreq order // check that they are sorted by descending termfreq
if (i >0){ // order
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq); if (i > 0) {
} assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
}
} }
} }

View File

@ -124,6 +124,10 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
return fcsi.getTermsEnum(); return fcsi.getTermsEnum();
} }
@Override
public long getSumTotalTermFreq() {
return -1;
}
}); });
assert termsEnum != null; assert termsEnum != null;

View File

@ -610,6 +610,8 @@ public class CheckIndex {
Comparator<BytesRef> termComp = terms.getComparator(); Comparator<BytesRef> termComp = terms.getComparator();
long sumTotalTermFreq = 0;
while(true) { while(true) {
final BytesRef term = terms.next(); final BytesRef term = terms.next();
@ -660,6 +662,8 @@ public class CheckIndex {
} }
int lastDoc = -1; int lastDoc = -1;
int docCount = 0;
long totalTermFreq = 0;
while(true) { while(true) {
final int doc = docs2.nextDoc(); final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) { if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@ -667,6 +671,8 @@ public class CheckIndex {
} }
final int freq = docs2.freq(); final int freq = docs2.freq();
status.totPos += freq; status.totPos += freq;
totalTermFreq += freq;
docCount++;
if (doc <= lastDoc) { if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@ -698,21 +704,38 @@ public class CheckIndex {
} }
} }
// Now count how many deleted docs occurred in final long totalTermFreq2 = terms.totalTermFreq();
// this term: final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
// Re-count if there are deleted docs:
if (reader.hasDeletions()) { if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs); final DocsEnum docsNoDel = terms.docs(null, docs);
int count = 0; docCount = 0;
totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
count++; docCount++;
totalTermFreq += docsNoDel.freq();
} }
if (count != docFreq) { }
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
if (docCount != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
}
if (hasTotalTermFreq) {
sumTotalTermFreq += totalTermFreq;
if (totalTermFreq != totalTermFreq2) {
throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
} }
} }
} }
if (sumTotalTermFreq != 0) {
final long v = fields.terms(field).getSumTotalTermFreq();
if (v != -1 && sumTotalTermFreq != v) {
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
}
}
// Test seek to last term: // Test seek to last term:
if (lastTerm != null) { if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) { if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {

View File

@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader {
public long getUniqueTermCount() throws IOException { public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount(); return in.getUniqueTermCount();
} }
@Override
public long getSumTotalTermFreq() throws IOException {
return in.getSumTotalTermFreq();
}
} }
/** Base class for filtering {@link TermsEnum} implementations. */ /** Base class for filtering {@link TermsEnum} implementations. */
@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader {
return in.docFreq(); return in.docFreq();
} }
@Override
public long totalTermFreq() {
return in.totalTermFreq();
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse); return in.docs(skipDocs, reuse);

View File

@ -20,13 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Comparator;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.CollectionUtil;
@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// multiple threads and interacting with the // multiple threads and interacting with the
// TermsConsumer, only calling out to us (passing us the // TermsConsumer, only calling out to us (passing us the
// DocsConsumer) to handle delivery of docs/positions // DocsConsumer) to handle delivery of docs/positions
long sumTotalTermFreq = 0;
while(numFields > 0) { while(numFields > 0) {
// Get the next term to merge // Get the next term to merge
@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// which all share the same term. Now we must // which all share the same term. Now we must
// interleave the docID streams. // interleave the docID streams.
int numDocs = 0; int numDocs = 0;
long totTF = 0;
while(numToMerge > 0) { while(numToMerge > 0) {
FreqProxFieldMergeState minState = termStates[0]; FreqProxFieldMergeState minState = termStates[0];
@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// omitTermFreqAndPositions == false so we do write positions & // omitTermFreqAndPositions == false so we do write positions &
// payload // payload
int position = 0; int position = 0;
totTF += termDocFreq;
for(int j=0;j<termDocFreq;j++) { for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt(); final int code = prox.readVInt();
position += code >> 1; position += code >> 1;
@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
} }
assert numDocs > 0; assert numDocs > 0;
termsConsumer.finishTerm(text, numDocs); termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
sumTotalTermFreq += totTF;
} }
termsConsumer.finish(); termsConsumer.finish(sumTotalTermFreq);
} }
} }

View File

@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
return terms.docFreq(term); return terms.docFreq(term);
} }
/** Returns the total number of occurrences of the term
 * <code>term</code> across all documents (the sum of the
 * freq() for each doc that has this term). This method
 * returns 0 if the term or field does not exist. This
 * method does not take into account deleted documents
 * that have not yet been merged away. */
public long totalTermFreq(String field, BytesRef term) throws IOException {
final Fields fields = fields();
if (fields == null) {
return 0;
}
final Terms terms = fields.terms(field);
if (terms == null) {
return 0;
}
return terms.totalTermFreq(term);
}
/** This may return null if the field does not exist.*/ /** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException { public Terms terms(String field) throws IOException {
final Fields fields = fields(); final Fields fields = fields();

View File

@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
} }
} }
@Override
public long getSumTotalTermFreq() throws IOException {
long sum = 0;
for(Terms terms : subs) {
final long v = terms.getSumTotalTermFreq();
if (v == -1) {
return -1;
}
sum += v;
}
return sum;
}
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return termComp; return termComp;

View File

@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum {
return sum; return sum;
} }
@Override
public long totalTermFreq() {
long sum = 0;
for(int i=0;i<numTop;i++) {
final long v = top[i].terms.totalTermFreq();
if (v == -1) {
return v;
}
sum += v;
}
return sum;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final MultiDocsEnum docsEnum; final MultiDocsEnum docsEnum;

View File

@ -57,6 +57,18 @@ public abstract class Terms {
} }
} }
/** Returns the total number of occurrences of the
 * specified term text across all documents. Returns 0
 * if the term does not exist. */
public long totalTermFreq(BytesRef text) throws IOException {
final TermsEnum termsEnum = getThreadTermsEnum();
if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
return termsEnum.totalTermFreq();
} else {
return 0;
}
}
/** Get {@link DocsEnum} for the specified term. This /** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */ * method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@ -115,6 +127,14 @@ public abstract class Terms {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
} }
/** Returns the sum of {@link TermsEnum#totalTermFreq} for
* all terms in this field, or -1 if this measure isn't
* stored by the codec (or if this fields omits term freq
* and positions). Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract long getSumTotalTermFreq() throws IOException;
/** /**
* Returns a thread-private {@link TermsEnum} instance. Obtaining * Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using * {@link TermsEnum} from this method might be more efficient than using

View File

@ -126,6 +126,14 @@ public abstract class TermsEnum {
* {@link SeekStatus#END}.*/ * {@link SeekStatus#END}.*/
public abstract int docFreq(); public abstract int docFreq();
/** Returns the total number of occurrences of this term
* across all documents (the sum of the freq() for each
* doc that has this term). This will be -1 if the
* codec doesn't support this measure. Note that, like
* other term measures, this measure does not take
* deleted documents into account. */
public abstract long totalTermFreq();
/** Get {@link DocsEnum} for the current term. Do not /** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link * call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not * #seek} for the first time. This method will not
@ -198,6 +206,11 @@ public abstract class TermsEnum {
throw new IllegalStateException("this method should never be called"); throw new IllegalStateException("this method should never be called");
} }
@Override
public long totalTermFreq() {
throw new IllegalStateException("this method should never be called");
}
@Override @Override
public long ord() { public long ord() {
throw new IllegalStateException("this method should never be called"); throw new IllegalStateException("this method should never be called");

View File

@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
} }
@Override @Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term: // First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) { if (0 == (numTerms++ % termIndexInterval)) {

View File

@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
/** Default merge impl: append documents, mapping around /** Default merge impl: append documents, mapping around
* deletes */ * deletes */
public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException { public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0; int df = 0;
long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) { if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) { while(true) {
@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq()); this.startDoc(doc, postings.freq());
this.finishDoc(); this.finishDoc();
df++; df++;
totTF++;
} }
} else { } else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings; final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
} }
final int freq = postingsEnum.freq(); final int freq = postingsEnum.freq();
this.startDoc(doc, freq); this.startDoc(doc, freq);
totTF += freq;
for(int i=0;i<freq;i++) { for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition(); final int position = postingsEnum.nextPosition();
final BytesRef payload; final BytesRef payload;
@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
df++; df++;
} }
} }
return df; return new TermStats(df, totTF);
} }
} }

View File

@ -34,7 +34,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
public abstract void startTerm() throws IOException; public abstract void startTerm() throws IOException;
/** Finishes the current term */ /** Finishes the current term */
public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
public abstract void setField(FieldInfo fieldInfo); public abstract void setField(FieldInfo fieldInfo);

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.TermState;
public class PrefixCodedTermState extends OrdTermState { public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis) public long filePointer; // fp into the terms dict primary file (_X.tis)
public long totalTermFreq; // total number of occurrences of this term
@Override @Override
public void copyFrom(TermState _other) { public void copyFrom(TermState _other) {
@ -35,11 +36,12 @@ public class PrefixCodedTermState extends OrdTermState {
super.copyFrom(_other); super.copyFrom(_other);
filePointer = other.filePointer; filePointer = other.filePointer;
docFreq = other.docFreq; docFreq = other.docFreq;
totalTermFreq = other.totalTermFreq;
} }
@Override @Override
public String toString() { public String toString() {
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]"; return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
} }
} }

View File

@ -129,18 +129,17 @@ public class PrefixCodedTermsReader extends FieldsProducer {
// Read per-field details // Read per-field details
seekDir(in, dirOffset); seekDir(in, dirOffset);
final int numFields = in.readInt(); final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) { for(int i=0;i<numFields;i++) {
final int field = in.readInt(); final int field = in.readVInt();
final long numTerms = in.readLong(); final long numTerms = in.readVLong();
assert numTerms >= 0; assert numTerms >= 0;
final long termsStartPointer = in.readLong(); final long termsStartPointer = in.readVLong();
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (numTerms > 0) { final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
assert !fields.containsKey(fieldInfo.name); assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer)); fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
}
} }
success = true; success = true;
} finally { } finally {
@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
final long numTerms; final long numTerms;
final FieldInfo fieldInfo; final FieldInfo fieldInfo;
final long termsStartPointer; final long termsStartPointer;
final long sumTotalTermFreq;
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) { FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
assert numTerms > 0; assert numTerms > 0;
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.numTerms = numTerms; this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer; this.termsStartPointer = termsStartPointer;
this.sumTotalTermFreq = sumTotalTermFreq;
} }
@Override @Override
@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return numTerms; return numTerms;
} }
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
// Iterates through terms in this field, not supporting ord() // Iterates through terms in this field, not supporting ord()
private final class SegmentTermsEnum extends TermsEnum { private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in; private final IndexInput in;
@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
bytesReader = new DeltaBytesReader(in); bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name; fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState(); state = postingsReader.newTermState();
state.totalTermFreq = -1;
state.ord = -1; state.ord = -1;
} }
@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
state.docFreq = (in.readVInt() << 6) | (b & 0x3F); state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
} }
if (!fieldInfo.omitTermFreqAndPositions) {
state.totalTermFreq = state.docFreq + in.readVLong();
}
postingsReader.readTerm(in, postingsReader.readTerm(in,
fieldInfo, state, fieldInfo, state,
isIndexTerm); isIndexTerm);
@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return state.docFreq; return state.docFreq;
} }
@Override
public long totalTermFreq() {
return state.totalTermFreq;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);

View File

@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
final FieldInfos fieldInfos; final FieldInfos fieldInfos;
FieldInfo currentField; FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter; private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>(); private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp; private final Comparator<BytesRef> termComp;
public PrefixCodedTermsWriter( public PrefixCodedTermsWriter(
@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
assert currentField == null || currentField.name.compareTo(field.name) < 0; assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field; currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field); TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms); fields.add(terms);
return terms; return terms;
} }
@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
public void close() throws IOException { public void close() throws IOException {
try { try {
final int fieldCount = fields.size();
int nonZeroCount = 0;
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
nonZeroCount++;
}
}
final long dirStart = out.getFilePointer(); final long dirStart = out.getFilePointer();
out.writeInt(fieldCount); out.writeVInt(nonZeroCount);
for(int i=0;i<fieldCount;i++) { for(TermsWriter field : fields) {
TermsWriter field = (TermsWriter) fields.get(i); if (field.numTerms > 0) {
out.writeInt(field.fieldInfo.number); out.writeVInt(field.fieldInfo.number);
out.writeLong(field.numTerms); out.writeVLong(field.numTerms);
out.writeLong(field.termsStartPointer); out.writeVLong(field.termsStartPointer);
if (!field.fieldInfo.omitTermFreqAndPositions) {
out.writeVLong(field.sumTotalTermFreq);
}
}
} }
writeTrailer(dirStart); writeTrailer(dirStart);
} finally { } finally {
@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
private final long termsStartPointer; private final long termsStartPointer;
private long numTerms; private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq;
TermsWriter( TermsWriter(
TermsIndexWriterBase.FieldWriter fieldIndexWriter, TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
} }
@Override @Override
public void finishTerm(BytesRef text, int numDocs) throws IOException { public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert numDocs > 0; assert stats.docFreq > 0;
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer()); //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs); final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
termWriter.write(text); termWriter.write(text);
final int highBit = isIndexTerm ? 0x80 : 0; final int highBit = isIndexTerm ? 0x80 : 0;
@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
// This is a vInt, except, we steal top bit to record // This is a vInt, except, we steal top bit to record
// whether this was an indexed term: // whether this was an indexed term:
if ((numDocs & ~0x3F) == 0) { if ((stats.docFreq & ~0x3F) == 0) {
// Fast case -- docFreq fits in 6 bits // Fast case -- docFreq fits in 6 bits
out.writeByte((byte) (highBit | numDocs)); out.writeByte((byte) (highBit | stats.docFreq));
} else { } else {
// Write bottom 6 bits of docFreq, then write the // Write bottom 6 bits of docFreq, then write the
// remainder as vInt: // remainder as vInt:
out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F))); out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
out.writeVInt(numDocs >>> 6); out.writeVInt(stats.docFreq >>> 6);
} }
postingsWriter.finishTerm(numDocs, isIndexTerm); if (!fieldInfo.omitTermFreqAndPositions) {
assert stats.totalTermFreq >= stats.docFreq;
out.writeVLong(stats.totalTermFreq - stats.docFreq);
}
postingsWriter.finishTerm(stats, isIndexTerm);
numTerms++; numTerms++;
} }
// Finishes all terms in this field // Finishes all terms in this field
@Override @Override
public void finish() throws IOException { public void finish(long sumTotalTermFreq) throws IOException {
// EOF marker: // EOF marker:
this.sumTotalTermFreq = sumTotalTermFreq;
out.writeVInt(DeltaBytesWriter.TERM_EOF); out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish(); fieldIndexWriter.finish();
} }

View File

@ -0,0 +1,28 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TermStats {
public final int docFreq;
public final long totalTermFreq;
public TermStats(int docFreq, long totalTermFreq) {
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
}

View File

@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException; public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */ /** Finishes the current term; numDocs must be > 0. */
public abstract void finishTerm(BytesRef text, int numDocs) throws IOException; public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */ /** Called when we are done adding terms to this field */
public abstract void finish() throws IOException; public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms /** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */ * before feeding to this API. */
@ -55,6 +55,7 @@ public abstract class TermsConsumer {
BytesRef term; BytesRef term;
assert termsEnum != null; assert termsEnum != null;
long sumTotalTermFreq = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) { if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) { if (docsEnum == null) {
@ -69,9 +70,9 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) { if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn); docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term); final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, docsEnum); final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
if (numDocs > 0) { if (stats.docFreq > 0) {
finishTerm(term, numDocs); finishTerm(term, stats);
} }
} }
} }
@ -94,14 +95,15 @@ public abstract class TermsConsumer {
} }
} }
final PostingsConsumer postingsConsumer = startTerm(term); final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, postingsEnum); final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
if (numDocs > 0) { if (stats.docFreq > 0) {
finishTerm(term, numDocs); finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq;
} }
} }
} }
} }
finish(); finish(sumTotalTermFreq);
} }
} }

View File

@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase {
public abstract void setTermsOutput(IndexOutput out); public abstract void setTermsOutput(IndexOutput out);
public abstract class FieldWriter { public abstract class FieldWriter {
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
public abstract void finish() throws IOException; public abstract void finish() throws IOException;
} }

View File

@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public static abstract class IndexTermSelector { public static abstract class IndexTermSelector {
// Called sequentially on every term being written, // Called sequentially on every term being written,
// returning true if this term should be indexed // returning true if this term should be indexed
public abstract boolean isIndexTerm(BytesRef term, int docFreq); public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
} }
/** Same policy as {@link FixedGapTermsIndexWriter} */ /** Same policy as {@link FixedGapTermsIndexWriter} */
@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
} }
@Override @Override
public boolean isIndexTerm(BytesRef term, int docFreq) { public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) { if (count >= interval) {
count = 0; count = 0;
return true; return true;
@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
} }
@Override @Override
public boolean isIndexTerm(BytesRef term, int docFreq) { public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (docFreq >= docFreqThresh || count >= interval) { if (stats.docFreq >= docFreqThresh || count >= interval) {
count = 0; count = 0;
return true; return true;
} else { } else {
@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
} }
@Override @Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
if (policy.isIndexTerm(text, docFreq) || first) { if (policy.isIndexTerm(text, stats) || first) {
first = false; first = false;
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer()); //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
final int lengthSave = text.length; final int lengthSave = text.length;

View File

@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader; import org.apache.lucene.index.CompoundFileReader;
@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer {
return BytesRef.getUTF8SortedAsUTF16Comparator(); return BytesRef.getUTF8SortedAsUTF16Comparator();
} }
} }
@Override
public long getSumTotalTermFreq() {
return -1;
}
} }
private class PreTermsEnum extends TermsEnum { private class PreTermsEnum extends TermsEnum {
@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer {
return termEnum.docFreq(); return termEnum.docFreq();
} }
@Override
public long totalTermFreq() {
return -1;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum; PreDocsEnum docsEnum;

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -177,7 +178,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
/** Called when we are done adding docs to this term */ /** Called when we are done adding docs to this term */
@Override @Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
//System.out.println("PW finishTerm docCount=" + docCount); //System.out.println("PW finishTerm docCount=" + docCount);
assert pendingCount > 0 || pendingCount == -1; assert pendingCount > 0 || pendingCount == -1;
@ -186,7 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
if (pendingCount == -1) { if (pendingCount == -1) {
termsOut.writeByte((byte) 0); termsOut.writeByte((byte) 0);
wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm); wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
pendingIsIndexTerm = false; pendingIsIndexTerm = false;
} else { } else {

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.CodecUtil;
@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
/** Called when we are done adding docs to this term */ /** Called when we are done adding docs to this term */
@Override @Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
// TODO: -- wasteful we are counting this in two places? // TODO: -- wasteful we are counting this in two places?
assert docCount > 0; assert stats.docFreq > 0;
assert docCount == df; assert stats.docFreq == df;
docIndex.write(termsOut, isIndexTerm); docIndex.write(termsOut, isIndexTerm);

View File

@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in; private final IndexInput in;
private final boolean omitTF; private final boolean omitTF;
private int docFreq; private int docFreq;
private long totalTermFreq;
private long docsStart; private long docsStart;
private boolean ended; private boolean ended;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum; private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException { public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF; this.omitTF = omitTF;
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst); fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
} }
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
//System.out.println("seek to text=" + text.utf8ToString()); //System.out.println("seek to text=" + text.utf8ToString());
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text); final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) { if (result == null) {
//System.out.println(" end"); //System.out.println(" end");
return SeekStatus.END; return SeekStatus.END;
} else { } else {
//System.out.println(" got text=" + term.utf8ToString()); //System.out.println(" got text=" + term.utf8ToString());
PairOutputs.Pair<Long,Long> pair = result.output; PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
docsStart = pair.output1; PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docFreq = pair.output2.intValue(); docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;
if (result.input.equals(text)) { if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart); //System.out.println(" match docsStart=" + docsStart);
@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
assert !ended; assert !ended;
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next(); final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) { if (result != null) {
final PairOutputs.Pair<Long,Long> pair = result.output; PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
docsStart = pair.output1; PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docFreq = pair.output2.intValue(); docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;
return result.input; return result.input;
} else { } else {
return null; return null;
@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docFreq; return docFreq;
} }
@Override
public long totalTermFreq() {
return totalTermFreq;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum; SimpleTextDocsEnum docsEnum;
@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
private class SimpleTextTerms extends Terms { private class SimpleTextTerms extends Terms {
private final long termsStart; private final long termsStart;
private final boolean omitTF; private final boolean omitTF;
private FST<PairOutputs.Pair<Long,Long>> fst; private long sumTotalTermFreq;
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
private int termCount;
private final BytesRef scratch = new BytesRef(10); private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException { public SimpleTextTerms(String field, long termsStart) throws IOException {
@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException { private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)); final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
0,
0,
true,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart); in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10); final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1; long lastDocsStart = -1;
int docFreq = 0; int docFreq = 0;
long totalTermFreq = 0;
while(true) { while(true) {
readLine(in, scratch); readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) { if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq))); b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
sumTotalTermFreq += totalTermFreq;
} }
break; break;
} else if (scratch.startsWith(DOC)) { } else if (scratch.startsWith(DOC)) {
docFreq++; docFreq++;
} else if (scratch.startsWith(POS)) {
totalTermFreq++;
} else if (scratch.startsWith(TERM)) { } else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq))); b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
} }
lastDocsStart = in.getFilePointer(); lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length; final int len = scratch.length - TERM.length;
@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len; lastTerm.length = len;
docFreq = 0; docFreq = 0;
sumTotalTermFreq += totalTermFreq;
totalTermFreq = 0;
termCount++;
} }
} }
fst = b.finish(); fst = b.finish();
@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator(); return BytesRef.getUTF8SortedAsUnicodeComparator();
} }
@Override
public long getUniqueTermCount() {
return (long) termCount;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
} }
@Override @Override

View File

@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
} }
@Override @Override
public void finishTerm(BytesRef term, int numDocs) throws IOException { public void finishTerm(BytesRef term, TermStats stats) throws IOException {
} }
@Override @Override
public void finish() throws IOException { public void finish(long sumTotalTermFreq) throws IOException {
} }
@Override @Override

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.CodecUtil;
@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
/** Called when we are done adding docs to this term */ /** Called when we are done adding docs to this term */
@Override @Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
assert docCount > 0; assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs // TODO: wasteful we are counting this (counting # docs
// for this term) in two places? // for this term) in two places?
assert docCount == df; assert stats.docFreq == df;
if (isIndexTerm) { if (isIndexTerm) {
// Write absolute at seek points // Write absolute at seek points

View File

@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docFreq(); return tenum.docFreq();
} }
@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}
/** This enum does not support seeking! /** This enum does not support seeking!
* @throws UnsupportedOperationException * @throws UnsupportedOperationException
*/ */

View File

@ -245,6 +245,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq(); return actualEnum.docFreq();
} }
@Override
public long totalTermFreq() {
return actualEnum.totalTermFreq();
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse); return actualEnum.docs(skipDocs, reuse);

View File

@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex; import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public long totalTermFreq() {
return -1;
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();

View File

@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMField extends Terms { static class RAMField extends Terms {
final String field; final String field;
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>(); final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
long sumTotalTermFreq;
RAMField(String field) { RAMField(String field) {
this.field = field; this.field = field;
} }
@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase {
return termToDocs.size(); return termToDocs.size();
} }
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override @Override
public TermsEnum iterator() { public TermsEnum iterator() {
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this); return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMTerm { static class RAMTerm {
final String term; final String term;
long totalTermFreq;
final List<RAMDoc> docs = new ArrayList<RAMDoc>(); final List<RAMDoc> docs = new ArrayList<RAMDoc>();
public RAMTerm(String term) { public RAMTerm(String term) {
this.term = term; this.term = term;
@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase {
} }
@Override @Override
public void finishTerm(BytesRef text, int numDocs) { public void finishTerm(BytesRef text, TermStats stats) {
assert numDocs > 0; assert stats.docFreq > 0;
assert numDocs == current.docs.size(); assert stats.docFreq == current.docs.size();
current.totalTermFreq = stats.totalTermFreq;
field.termToDocs.put(current.term, current); field.termToDocs.put(current.term, current);
} }
@Override @Override
public void finish() { public void finish(long sumTotalTermFreq) {
field.sumTotalTermFreq = sumTotalTermFreq;
} }
} }
@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase {
} }
@Override @Override
public long totalTermFreq() {
return ramField.termToDocs.get(current).totalTermFreq;
}
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
} }

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mocksep.MockSepCodec; import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec; import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase {
public void write(final FieldsConsumer consumer) throws Throwable { public void write(final FieldsConsumer consumer) throws Throwable {
Arrays.sort(terms); Arrays.sort(terms);
final TermsConsumer termsConsumer = consumer.addField(fieldInfo); final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
for (final TermData term : terms) long sumTotalTermCount = 0;
term.write(termsConsumer); for (final TermData term : terms) {
termsConsumer.finish(); sumTotalTermCount += term.write(termsConsumer);
}
termsConsumer.finish(sumTotalTermCount);
} }
} }
@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase {
return text.compareTo(((TermData) o).text); return text.compareTo(((TermData) o).text);
} }
public void write(final TermsConsumer termsConsumer) throws Throwable { public long write(final TermsConsumer termsConsumer) throws Throwable {
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
long totTF = 0;
for(int i=0;i<docs.length;i++) { for(int i=0;i<docs.length;i++) {
final int termDocFreq; final int termDocFreq;
if (field.omitTF) { if (field.omitTF) {
@ -142,6 +146,7 @@ public class TestCodecs extends LuceneTestCase {
} }
postingsConsumer.startDoc(docs[i], termDocFreq); postingsConsumer.startDoc(docs[i], termDocFreq);
if (!field.omitTF) { if (!field.omitTF) {
totTF += positions[i].length;
for(int j=0;j<positions[i].length;j++) { for(int j=0;j<positions[i].length;j++) {
final PositionData pos = positions[i][j]; final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload); postingsConsumer.addPosition(pos.pos, pos.payload);
@ -149,7 +154,8 @@ public class TestCodecs extends LuceneTestCase {
postingsConsumer.finishDoc(); postingsConsumer.finishDoc();
} }
} }
termsConsumer.finishTerm(text, docs.length); termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
return totTF;
} }
} }

View File

@ -1865,4 +1865,22 @@ public class TestIndexReader extends LuceneTestCase
assertTrue(IndexReader.indexExists(dir)); assertTrue(IndexReader.indexExists(dir));
dir.close(); dir.close();
} }
// Make sure totalTermFreq works correctly in the terms
// dict cache
public void testTotalTermFreqCached() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
Document d = new Document();
d.add(newField("f", "a a b", Field.Index.ANALYZED));
writer.addDocument(d);
IndexReader r = writer.getReader();
writer.close();
Terms terms = MultiFields.getTerms(r, "f");
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
assertEquals(2, terms.totalTermFreq(new BytesRef("a")));
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
r.close();
dir.close();
}
} }

View File

@ -39,6 +39,7 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader; import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter; import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec; import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec; import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory; import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
@ -66,7 +67,7 @@ public class MockRandomCodec extends Codec {
public MockRandomCodec(Random random) { public MockRandomCodec(Random random) {
name = "MockRandom"; name = "MockRandom";
this.seedRandom = random; this.seedRandom = new Random(random.nextLong());
} }
@Override @Override
@ -148,7 +149,7 @@ public class MockRandomCodec extends Codec {
final Random rand = new Random(seed2); final Random rand = new Random(seed2);
@Override @Override
public boolean isIndexTerm(BytesRef term, int docFreq) { public boolean isIndexTerm(BytesRef term, TermStats stats) {
return random.nextInt(gap) == 17; return random.nextInt(gap) == 17;
} }
}; };

View File

@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer; import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter; import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec; import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
@ -184,10 +185,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
} }
@Override @Override
public void finishTerm(BytesRef text, int numDocs) throws IOException { public void finishTerm(BytesRef text, TermStats stats) throws IOException {
if (numDocs > 0) { if (stats.docFreq > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut); long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs; termInfo.docFreq = stats.docFreq;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer); termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number); //System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number, termsOut.add(fieldInfo.number,
@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
} }
@Override @Override
public void finish() throws IOException { public void finish(long sumTotalTermCount) throws IOException {
} }
@Override @Override

View File

@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq(); return tenum.docFreq();
} }
@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}
public BytesRef skipTo(BytesRef target) throws IOException { public BytesRef skipTo(BytesRef target) throws IOException {