mirror of https://github.com/apache/lucene.git
LUCENE-2862: add TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq()
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059344 13f79535-47bb-0310-9956-ffa450edef68
parent f5d5dda6c6
commit a0c82b0f41
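In short, this change adds two statistics to the flex API: TermsEnum.totalTermFreq() reports how many times the current term occurs across all documents (-1 when the codec does not store it), and Terms.getSumTotalTermFreq() reports the sum of that value over every term in a field. A minimal usage sketch against the API as it appears in this diff (reader setup omitted; the field and term values are illustrative, not part of the commit):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TotalTermFreqDemo {
  public static void show(IndexReader reader) throws Exception {
    Terms terms = MultiFields.getTerms(reader, "body");
    if (terms == null) {
      return; // field does not exist
    }
    // Sum of totalTermFreq over all terms in the field; -1 if the codec
    // doesn't store it (e.g. the field omits term freq and positions):
    System.out.println("sumTotalTermFreq=" + terms.getSumTotalTermFreq());
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seek(new BytesRef("lucene")) == TermsEnum.SeekStatus.FOUND) {
      // Total occurrences of this term across all docs; -1 if unsupported.
      // Like docFreq(), it does not account for deleted documents:
      System.out.println("totalTermFreq=" + termsEnum.totalTermFreq());
    }
  }
}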
@@ -359,6 +359,9 @@ New features
  terms dict. This impl stores the indexed terms in an FST, which is
  much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)

* LUCENE-2862: Added TermsEnum.totalTermFreq() and
  Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)

Optimizations

* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
@@ -238,6 +238,10 @@ public class InstantiatedIndex
while((text = termsEnum.next()) != null) {
String termText = text.utf8ToString();
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
final long totalTermFreq = termsEnum.totalTermFreq();
if (totalTermFreq != -1) {
instantiatedTerm.addPositionsCount(totalTermFreq);
}
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
instantiatedTerm.setTermIndex(terms.size());
terms.add(instantiatedTerm);
@@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends IndexReader {
if (i < 0) {
i = -i - 1;
}
if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
// field does not exist
return null;
}
final int startLoc = i;

// TODO: heavy to do this here; would be better to
// do it up front & cache
long sum = 0;
int upto = i;
while(upto < orderedTerms.length && orderedTerms[i].field() == field) {
sum += orderedTerms[i].getTotalTermFreq();
upto++;
}
final long sumTotalTermFreq = sum;

return new Terms() {
@Override
public TermsEnum iterator() {
return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
}

@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}

@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
@@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable {
}
associatedDocuments[associatedDocuments.length - 1] = info;
term.setAssociatedDocuments(associatedDocuments);
term.addPositionsCount(positions.length);

// todo optimize, only if term vector?
informationByTermOfCurrentDocument.put(term, info);
@@ -45,6 +45,8 @@ public class InstantiatedTerm

private Term term;

private long totalTermFreq;

/**
* index of term in InstantiatedIndex
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
@@ -92,6 +94,14 @@ public class InstantiatedTerm
this.associatedDocuments = associatedDocuments;
}

void addPositionsCount(long count) {
totalTermFreq += count;
}

public long getTotalTermFreq() {
return totalTermFreq;
}

/**
* Finds index to the first beyond the current whose document number is
* greater than or equal to <i>target</i>, -1 if there is no such element.
@@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;

import java.io.IOException;
import java.util.Arrays;
@@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum {
return terms[upto].getAssociatedDocuments().length;
}

@Override
public long totalTermFreq() {
final long v = terms[upto].getTotalTermFreq();
return v == 0 ? -1 : v;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {
@@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase {
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer()));

for (int i = 0; i < 20; i++) {
Document document = new Document();
assembleDocument(document, i);
@@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase {
}

assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
final long totalTermFreq = aprioriTermEnum.totalTermFreq();
if (totalTermFreq != -1) {
assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
}

// compare termDocs seeking
@@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable {
/** Term for this field's fieldName, lazily computed on demand */
public transient Term template;

private final long sumTotalTermFreq;

private static final long serialVersionUID = 2882195016849084649L;

public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
@@ -617,6 +619,15 @@ public class MemoryIndex implements Serializable {
this.numTokens = numTokens;
this.numOverlapTokens = numOverlapTokens;
this.boost = boost;
long sum = 0;
for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
sum += ent.getValue().size();
}
sumTotalTermFreq = sum;
}

public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}

/**
@@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable {
public long getUniqueTermCount() {
return info.sortedTerms.length;
}

@Override
public long getSumTotalTermFreq() {
return info.getSumTotalTermFreq();
}
};
}
}
@@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable {
return 1;
}

@Override
public long totalTermFreq() {
return info.sortedTerms[termUpto].getValue().size();
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {
@@ -176,15 +176,34 @@ public class HighFreqTerms {
return ts;
}

public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
BytesRef br = termtext;
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {

long totalTF = 0;
Bits skipDocs = MultiFields.getDeletedDocs(reader);
DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
// if term is not in index return totalTF of 0
if (de == null) {

Terms terms = MultiFields.getTerms(reader, field);
if (terms == null) {
return 0;
}

TermsEnum termsEnum = terms.iterator();
if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
return 0;
}

Bits skipDocs = MultiFields.getDeletedDocs(reader);
if (skipDocs == null) {
// TODO: we could do this up front, during the scan
// (next()), instead of after-the-fact here w/ seek,
// if the codec supports it and there are no del
// docs...
final long totTF = termsEnum.totalTermFreq();
if (totTF != -1) {
return totTF;
}
}

DocsEnum de = termsEnum.docs(skipDocs, null);

// use DocsEnum.read() and BulkResult api
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
int count;
@@ -41,4 +41,9 @@ public final class TermStats {
String getTermText() {
return termtext.utf8ToString();
}

@Override
public String toString() {
return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
}
}
@@ -17,15 +17,16 @@ package org.apache.lucene.misc;
* limitations under the License.
*/

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

@@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
writer = new IndexWriter(dir, newIndexWriterConfig(random,
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
.setMaxBufferedDocs(2));
writer.setInfoStream(VERBOSE ? System.out : null);
indexDocs(writer);
reader = IndexReader.open(dir, true);
_TestUtil.checkIndex(dir);
}

@AfterClass
@@ -75,8 +78,8 @@ public class TestHighFreqTerms extends LuceneTestCase {
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
for (int i = 0; i < terms.length; i++) {
if (i >0){
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
if (i > 0) {
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
}
}
}
@@ -134,11 +137,12 @@ public class TestHighFreqTerms extends LuceneTestCase {
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);

for (int i = 0; i < termsWithTF.length; i++) {
// check that they are sorted by descending termfreq order
if (i >0){
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
}
for (int i = 0; i < termsWithTF.length; i++) {
// check that they are sorted by descending termfreq
// order
if (i > 0) {
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
}
}
}
@@ -124,6 +124,10 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
return fcsi.getTermsEnum();
}

@Override
public long getSumTotalTermFreq() {
return -1;
}
});

assert termsEnum != null;
@@ -610,6 +610,8 @@ public class CheckIndex {

Comparator<BytesRef> termComp = terms.getComparator();

long sumTotalTermFreq = 0;

while(true) {

final BytesRef term = terms.next();
@@ -660,6 +662,8 @@ public class CheckIndex {
}

int lastDoc = -1;
int docCount = 0;
long totalTermFreq = 0;
while(true) {
final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@@ -667,6 +671,8 @@ public class CheckIndex {
}
final int freq = docs2.freq();
status.totPos += freq;
totalTermFreq += freq;
docCount++;

if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@@ -698,21 +704,38 @@ public class CheckIndex {
}
}

// Now count how many deleted docs occurred in
// this term:
final long totalTermFreq2 = terms.totalTermFreq();
final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;

// Re-count if there are deleted docs:
if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs);
int count = 0;
docCount = 0;
totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
count++;
docCount++;
totalTermFreq += docsNoDel.freq();
}
if (count != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
}

if (docCount != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
}
if (hasTotalTermFreq) {
sumTotalTermFreq += totalTermFreq;
if (totalTermFreq != totalTermFreq2) {
throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
}
}
}

if (sumTotalTermFreq != 0) {
final long v = fields.terms(field).getSumTotalTermFreq();
if (v != -1 && sumTotalTermFreq != v) {
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
}
}

// Test seek to last term:
if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
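The CheckIndex changes above encode the two invariants the new statistics must satisfy: a term's stored totalTermFreq equals the recomputed sum of freq() over its postings, and a field's stored getSumTotalTermFreq() equals the sum of totalTermFreq over its terms. A condensed sketch of that cross-check for one field, assuming a Terms instance (the method name is made up for illustration):

static void verifySumTotalTermFreq(Terms terms) throws IOException {
  TermsEnum termsEnum = terms.iterator();
  long sum = 0;
  boolean allStored = true;
  while (termsEnum.next() != null) {
    final long ttf = termsEnum.totalTermFreq();
    if (ttf == -1) { // codec doesn't record the stat for this field
      allStored = false;
      break;
    }
    sum += ttf;
  }
  final long stored = terms.getSumTotalTermFreq();
  if (allStored && stored != -1 && stored != sum) {
    throw new RuntimeException("sumTotalTermFreq " + stored + " != recomputed " + sum);
  }
}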
@@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader {
public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount();
}

@Override
public long getSumTotalTermFreq() throws IOException {
return in.getSumTotalTermFreq();
}
}

/** Base class for filtering {@link TermsEnum} implementations. */
@@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader {
return in.docFreq();
}

@Override
public long totalTermFreq() {
return in.totalTermFreq();
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse);
@@ -20,13 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Comparator;

import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;

@@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// multiple threads and interacting with the
// TermsConsumer, only calling out to us (passing us the
// DocsConsumer) to handle delivery of docs/positions
long sumTotalTermFreq = 0;
while(numFields > 0) {

// Get the next term to merge
@@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
long totTF = 0;
while(numToMerge > 0) {

FreqProxFieldMergeState minState = termStates[0];
@@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// omitTermFreqAndPositions == false so we do write positions &
// payload
int position = 0;
totTF += termDocFreq;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
@@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
}

assert numDocs > 0;
termsConsumer.finishTerm(text, numDocs);
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
sumTotalTermFreq += totTF;
}

termsConsumer.finish();
termsConsumer.finish(sumTotalTermFreq);
}
}
@@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
return terms.docFreq(term);
}

/** Returns the total number of occurrences of the term
* <code>term</code> across all documents. This method
* returns 0 if the term or field does not exist. This
* method does not take into account deleted documents
* that have not yet been merged away. */
public long totalTermFreq(String field, BytesRef term) throws IOException {
final Fields fields = fields();
if (fields == null) {
return 0;
}
final Terms terms = fields.terms(field);
if (terms == null) {
return 0;
}
return terms.totalTermFreq(term);
}

/** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException {
final Fields fields = fields();
@@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
}
}

@Override
public long getSumTotalTermFreq() throws IOException {
long sum = 0;
for(Terms terms : subs) {
final long v = terms.getSumTotalTermFreq();
if (v == -1) {
return -1;
}
sum += v;
}
return sum;
}

@Override
public Comparator<BytesRef> getComparator() {
return termComp;
@@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum {
return sum;
}

@Override
public long totalTermFreq() {
long sum = 0;
for(int i=0;i<numTop;i++) {
final long v = top[i].terms.totalTermFreq();
if (v == -1) {
return v;
}
sum += v;
}
return sum;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final MultiDocsEnum docsEnum;
@@ -57,6 +57,18 @@ public abstract class Terms {
}
}

/** Returns the total number of occurrences of the
* specified term text across all documents. Returns 0
* if the term does not exist. */
public long totalTermFreq(BytesRef text) throws IOException {
final TermsEnum termsEnum = getThreadTermsEnum();
if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
return termsEnum.totalTermFreq();
} else {
return 0;
}
}

/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -115,6 +127,14 @@ public abstract class Terms {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}

/** Returns the sum of {@link TermsEnum#totalTermFreq} for
* all terms in this field, or -1 if this measure isn't
* stored by the codec (or if this field omits term freq
* and positions). Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract long getSumTotalTermFreq() throws IOException;

/**
* Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using
@@ -126,6 +126,14 @@ public abstract class TermsEnum {
* {@link SeekStatus#END}.*/
public abstract int docFreq();

/** Returns the total number of occurrences of this term
* across all documents (the sum of the freq() for each
* doc that has this term). This will be -1 if the
* codec doesn't support this measure. Note that, like
* other term measures, this measure does not take
* deleted documents into account. */
public abstract long totalTermFreq();

/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not
@@ -198,6 +206,11 @@ public abstract class TermsEnum {
throw new IllegalStateException("this method should never be called");
}

@Override
public long totalTermFreq() {
throw new IllegalStateException("this method should never be called");
}

@Override
public long ord() {
throw new IllegalStateException("this method should never be called");
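One consequence of the -1 convention documented above shows up in the MultiTerms and MultiTermsEnum hunks earlier in this diff: when statistics are aggregated across segments, a single sub-reader that lacks the stat makes the whole aggregate unknown. The pattern, reduced to a sketch (the helper name is illustrative, not from the commit):

// Sum per-segment statistics, letting -1 ("not stored") poison the total,
// so a composite reader never reports an undercount as if it were exact.
static long sumStats(long[] perSegment) {
  long sum = 0;
  for (long v : perSegment) {
    if (v == -1) {
      return -1;
    }
    sum += v;
  }
  return sum;
}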
@@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
}

@Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
@@ -55,9 +55,10 @@ public abstract class PostingsConsumer {

/** Default merge impl: append documents, mapping around
* deletes */
public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {

int df = 0;
long totTF = 0;

if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
@@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
df++;
}
}
return df;
return new TermStats(df, totTF);
}
}
@@ -34,7 +34,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Closeable {
public abstract void startTerm() throws IOException;

/** Finishes the current term */
public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;

public abstract void setField(FieldInfo fieldInfo);
@@ -27,6 +27,7 @@ import org.apache.lucene.index.TermState;
public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
public long totalTermFreq; // total number of occurrences of this term

@Override
public void copyFrom(TermState _other) {
@@ -35,11 +36,12 @@ public class PrefixCodedTermState extends OrdTermState {
super.copyFrom(_other);
filePointer = other.filePointer;
docFreq = other.docFreq;
totalTermFreq = other.totalTermFreq;
}

@Override
public String toString() {
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
}

}
@@ -129,18 +129,17 @@ public class PrefixCodedTermsReader extends FieldsProducer {
// Read per-field details
seekDir(in, dirOffset);

final int numFields = in.readInt();
final int numFields = in.readVInt();

for(int i=0;i<numFields;i++) {
final int field = in.readInt();
final long numTerms = in.readLong();
final int field = in.readVInt();
final long numTerms = in.readVLong();
assert numTerms >= 0;
final long termsStartPointer = in.readLong();
final long termsStartPointer = in.readVLong();
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (numTerms > 0) {
assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
}
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
}
success = true;
} finally {
@@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
final long sumTotalTermFreq;

FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
this.sumTotalTermFreq = sumTotalTermFreq;
}

@Override
@@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return numTerms;
}

@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}

// Iterates through terms in this field, not supporting ord()
private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
@@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
state.totalTermFreq = -1;
state.ord = -1;
}

@@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
}

if (!fieldInfo.omitTermFreqAndPositions) {
state.totalTermFreq = state.docFreq + in.readVLong();
}

postingsReader.readTerm(in,
fieldInfo, state,
isIndexTerm);
@@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return state.docFreq;
}

@Override
public long totalTermFreq() {
return state.totalTermFreq;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
@@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;

public PrefixCodedTermsWriter(
@@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms);
return terms;
}
@@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
public void close() throws IOException {

try {
final int fieldCount = fields.size();

int nonZeroCount = 0;
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
nonZeroCount++;
}
}

final long dirStart = out.getFilePointer();

out.writeInt(fieldCount);
for(int i=0;i<fieldCount;i++) {
TermsWriter field = (TermsWriter) fields.get(i);
out.writeInt(field.fieldInfo.number);
out.writeLong(field.numTerms);
out.writeLong(field.termsStartPointer);
out.writeVInt(nonZeroCount);
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
out.writeVLong(field.termsStartPointer);
if (!field.fieldInfo.omitTermFreqAndPositions) {
out.writeVLong(field.sumTotalTermFreq);
}
}
}
writeTrailer(dirStart);
} finally {
@@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq;

TermsWriter(
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
}

@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {

assert numDocs > 0;
assert stats.docFreq > 0;
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());

final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);

termWriter.write(text);
final int highBit = isIndexTerm ? 0x80 : 0;
@@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {

// This is a vInt, except, we steal top bit to record
// whether this was an indexed term:
if ((numDocs & ~0x3F) == 0) {
if ((stats.docFreq & ~0x3F) == 0) {
// Fast case -- docFreq fits in 6 bits
out.writeByte((byte) (highBit | numDocs));
out.writeByte((byte) (highBit | stats.docFreq));
} else {
// Write bottom 6 bits of docFreq, then write the
// remainder as vInt:
out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
out.writeVInt(numDocs >>> 6);
out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
out.writeVInt(stats.docFreq >>> 6);
}
postingsWriter.finishTerm(numDocs, isIndexTerm);
if (!fieldInfo.omitTermFreqAndPositions) {
assert stats.totalTermFreq >= stats.docFreq;
out.writeVLong(stats.totalTermFreq - stats.docFreq);
}
postingsWriter.finishTerm(stats, isIndexTerm);
numTerms++;
}

// Finishes all terms in this field
@Override
public void finish() throws IOException {
public void finish(long sumTotalTermFreq) throws IOException {
// EOF marker:
this.sumTotalTermFreq = sumTotalTermFreq;
out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
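About the encoding in the last hunk: the term's header byte packs the is-index-term flag into bit 7, a continuation flag into bit 6, and docFreq's low six bits into bits 0-5; totalTermFreq is then stored as the vLong delta totalTermFreq - docFreq, which is never negative because every matching document contributes at least one occurrence. A standalone sketch of the header-byte logic (plain arithmetic mirroring the code above, not the IndexOutput API):

static int headerByte(boolean isIndexTerm, int docFreq) {
  final int highBit = isIndexTerm ? 0x80 : 0;
  if ((docFreq & ~0x3F) == 0) {
    return highBit | docFreq; // docFreq fits in 6 bits, single byte
  }
  // Low 6 bits here; the remainder follows as a separate vInt(docFreq >>> 6):
  return highBit | 0x40 | (docFreq & 0x3F);
}
// For fields with freqs/positions the writer then emits
// vLong(totalTermFreq - docFreq); the common case (each doc contains the
// term exactly once) costs a single zero byte.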
@@ -0,0 +1,28 @@
package org.apache.lucene.index.codecs;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TermStats {
public final int docFreq;
public final long totalTermFreq;

public TermStats(int docFreq, long totalTermFreq) {
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
}
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;

/** Finishes the current term; numDocs must be > 0. */
public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;

/** Called when we are done adding terms to this field */
public abstract void finish() throws IOException;
public abstract void finish(long sumTotalTermFreq) throws IOException;

/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@@ -55,6 +55,7 @@ public abstract class TermsConsumer {

BytesRef term;
assert termsEnum != null;
long sumTotalTermFreq = 0;

if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@@ -69,9 +70,9 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
if (numDocs > 0) {
finishTerm(term, numDocs);
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
if (stats.docFreq > 0) {
finishTerm(term, stats);
}
}
}
@@ -94,14 +95,15 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
if (numDocs > 0) {
finishTerm(term, numDocs);
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
if (stats.docFreq > 0) {
finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq;
}
}
}

finish();
finish(sumTotalTermFreq);
}
}
@@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase {
public abstract void setTermsOutput(IndexOutput out);

public abstract class FieldWriter {
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
public abstract void finish() throws IOException;
}
@@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
public abstract boolean isIndexTerm(BytesRef term, int docFreq);
public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
}

/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}

@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
count = 0;
return true;
@@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}

@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
if (docFreq >= docFreqThresh || count >= interval) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (stats.docFreq >= docFreqThresh || count >= interval) {
count = 0;
return true;
} else {
@@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}

@Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
if (policy.isIndexTerm(text, docFreq) || first) {
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
if (policy.isIndexTerm(text, stats) || first) {
first = false;
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
final int lengthSave = text.length;
@@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
@@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}

@Override
public long getSumTotalTermFreq() {
return -1;
}
}

private class PreTermsEnum extends TermsEnum {
@@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer {
return termEnum.docFreq();
}

@Override
public long totalTermFreq() {
return -1;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
@@ -21,6 +21,7 @@ import java.io.IOException;

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
@@ -177,7 +178,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {

/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
//System.out.println("PW   finishTerm docCount=" + docCount);

assert pendingCount > 0 || pendingCount == -1;
@@ -186,7 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {

if (pendingCount == -1) {
termsOut.writeByte((byte) 0);
wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
pendingIsIndexTerm = false;
} else {
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {

/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {

// TODO: -- wasteful we are counting this in two places?
assert docCount > 0;
assert docCount == df;
assert stats.docFreq > 0;
assert stats.docFreq == df;

docIndex.write(termsOut, isIndexTerm);
@@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in;
private final boolean omitTF;
private int docFreq;
private long totalTermFreq;
private long docsStart;
private boolean ended;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;

public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF;
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
}

public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {

//System.out.println("seek to text=" + text.utf8ToString());
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println("  end");
return SeekStatus.END;
} else {
//System.out.println("  got text=" + term.utf8ToString());
PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;

if (result.input.equals(text)) {
//System.out.println("  match docsStart=" + docsStart);
@@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public BytesRef next() throws IOException {
assert !ended;
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) {
final PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;
return result.input;
} else {
return null;
@@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docFreq;
}

@Override
public long totalTermFreq() {
return totalTermFreq;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum;
@@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
private class SimpleTextTerms extends Terms {
private final long termsStart;
private final boolean omitTF;
private FST<PairOutputs.Pair<Long,Long>> fst;

private long sumTotalTermFreq;
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
private int termCount;
private final BytesRef scratch = new BytesRef(10);

public SimpleTextTerms(String field, long termsStart) throws IOException {
@@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer {

private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
0,
0,
true,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
sumTotalTermFreq += totalTermFreq;
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
} else if (scratch.startsWith(POS)) {
totalTermFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
sumTotalTermFreq += totalTermFreq;
totalTermFreq = 0;
termCount++;
}
}
fst = b.finish();
@@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}

@Override
public long getUniqueTermCount() {
return (long) termCount;
}

@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
}

@Override
@@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
}

@Override
public void finishTerm(BytesRef term, int numDocs) throws IOException {
public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}

@Override
public void finish() throws IOException {
public void finish(long sumTotalTermFreq) throws IOException {
}

@Override
@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;

@@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {

/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
assert docCount > 0;
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
assert stats.docFreq > 0;

// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert docCount == df;
assert stats.docFreq == df;

if (isIndexTerm) {
// Write absolute at seek points
@@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docFreq();
}

@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}

/** This enum does not support seeking!
* @throws UnsupportedOperationException
*/
@@ -245,6 +245,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq();
}

@Override
public long totalTermFreq() {
return actualEnum.totalTermFreq();
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse);
@@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil;
@@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
throw new UnsupportedOperationException();
}

@Override
public long totalTermFreq() {
return -1;
}

@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
@@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMField extends Terms {
final String field;
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
long sumTotalTermFreq;

RAMField(String field) {
this.field = field;
}
@@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase {
return termToDocs.size();
}

@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}

@Override
public TermsEnum iterator() {
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
@@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase {

static class RAMTerm {
final String term;
long totalTermFreq;
final List<RAMDoc> docs = new ArrayList<RAMDoc>();
public RAMTerm(String term) {
this.term = term;
@@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase {
}

@Override
public void finishTerm(BytesRef text, int numDocs) {
assert numDocs > 0;
assert numDocs == current.docs.size();
public void finishTerm(BytesRef text, TermStats stats) {
assert stats.docFreq > 0;
assert stats.docFreq == current.docs.size();
current.totalTermFreq = stats.totalTermFreq;
field.termToDocs.put(current.term, current);
}

@Override
public void finish() {
public void finish(long sumTotalTermFreq) {
field.sumTotalTermFreq = sumTotalTermFreq;
}
}

@@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase {
}

@Override
public long totalTermFreq() {
return ramField.termToDocs.get(current).totalTermFreq;
}

public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
}
@@ -30,6 +30,7 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.search.DocIdSetIterator;
@@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase {
public void write(final FieldsConsumer consumer) throws Throwable {
Arrays.sort(terms);
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
for (final TermData term : terms)
term.write(termsConsumer);
termsConsumer.finish();
long sumTotalTermCount = 0;
for (final TermData term : terms) {
sumTotalTermCount += term.write(termsConsumer);
}
termsConsumer.finish(sumTotalTermCount);
}
}

@@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase {
return text.compareTo(((TermData) o).text);
}

public void write(final TermsConsumer termsConsumer) throws Throwable {
public long write(final TermsConsumer termsConsumer) throws Throwable {
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
long totTF = 0;
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
@@ -142,6 +146,7 @@ public class TestCodecs extends LuceneTestCase {
}
postingsConsumer.startDoc(docs[i], termDocFreq);
if (!field.omitTF) {
totTF += positions[i].length;
for(int j=0;j<positions[i].length;j++) {
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload);
@@ -149,7 +154,8 @@ public class TestCodecs extends LuceneTestCase {
postingsConsumer.finishDoc();
}
}
termsConsumer.finishTerm(text, docs.length);
termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
return totTF;
}
}
@@ -1865,4 +1865,22 @@ public class TestIndexReader extends LuceneTestCase
assertTrue(IndexReader.indexExists(dir));
dir.close();
}

// Make sure totalTermFreq works correctly in the terms
// dict cache
public void testTotalTermFreqCached() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
Document d = new Document();
d.add(newField("f", "a a b", Field.Index.ANALYZED));
writer.addDocument(d);
IndexReader r = writer.getReader();
writer.close();
Terms terms = MultiFields.getTerms(r, "f");
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
assertEquals(2, terms.totalTermFreq(new BytesRef("a")));
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
r.close();
dir.close();
}
}
@@ -39,6 +39,7 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
@@ -66,7 +67,7 @@ public class MockRandomCodec extends Codec {

public MockRandomCodec(Random random) {
name = "MockRandom";
this.seedRandom = random;
this.seedRandom = new Random(random.nextLong());
}

@Override
@@ -148,7 +149,7 @@ public class MockRandomCodec extends Codec {
final Random rand = new Random(seed2);

@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
return random.nextInt(gap) == 17;
}
};
@@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException;
@@ -184,10 +185,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}

@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
if (numDocs > 0) {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
if (stats.docFreq > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs;
termInfo.docFreq = stats.docFreq;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println("  w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number,
@@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}

@Override
public void finish() throws IOException {
public void finish(long sumTotalTermCount) throws IOException {
}

@Override
@@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq();
}

@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}

public BytesRef skipTo(BytesRef target) throws IOException {