LUCENE-2862: add TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq()

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059344 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-01-15 14:42:37 +00:00
parent f5d5dda6c6
commit a0c82b0f41
45 changed files with 511 additions and 126 deletions

View File

@ -359,6 +359,9 @@ New features
terms dict. This impl stores the indexed terms in an FST, which is
much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
* LUCENE-2862: Added TermsEnum.totalTermFreq() and
Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
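For orientation, a minimal sketch of how a caller might consume the two new statistics, assuming only the flex API surface visible in this commit (MultiFields, Terms, TermsEnum); both calls return -1 when the codec does not store the measure:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper, not part of this commit:
static void printFieldStats(IndexReader reader, String field) throws Exception {
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    return; // field does not exist
  }
  // Sum of totalTermFreq over all terms in the field, or -1 if not stored:
  System.out.println("sumTotalTermFreq=" + terms.getSumTotalTermFreq());
  TermsEnum termsEnum = terms.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // -1 if the codec (or an omitTermFreqAndPositions field) lacks the stat:
    System.out.println(term.utf8ToString() + " totalTermFreq=" + termsEnum.totalTermFreq());
  }
}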

View File

@ -238,6 +238,10 @@ public class InstantiatedIndex
while((text = termsEnum.next()) != null) {
String termText = text.utf8ToString();
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
final long totalTermFreq = termsEnum.totalTermFreq();
if (totalTermFreq != -1) {
instantiatedTerm.addPositionsCount(totalTermFreq);
}
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
instantiatedTerm.setTermIndex(terms.size());
terms.add(instantiatedTerm);

View File

@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends IndexReader {
if (i < 0) {
i = -i - 1;
}
if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
// field does not exist
return null;
}
final int startLoc = i;
// TODO: heavy to do this here; would be better to
// do it up front & cache
long sum = 0;
int upto = i;
while(upto < orderedTerms.length && orderedTerms[upto].field() == field) {
sum += orderedTerms[upto].getTotalTermFreq();
upto++;
}
final long sumTotalTermFreq = sum;
return new Terms() {
@Override
public TermsEnum iterator() {
return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();

View File

@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable {
}
associatedDocuments[associatedDocuments.length - 1] = info;
term.setAssociatedDocuments(associatedDocuments);
term.addPositionsCount(positions.length);
// todo optimize, only if term vector?
informationByTermOfCurrentDocument.put(term, info);

View File

@ -45,6 +45,8 @@ public class InstantiatedTerm
private Term term;
private long totalTermFreq;
/**
* index of term in InstantiatedIndex
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
@ -92,6 +94,14 @@ public class InstantiatedTerm
this.associatedDocuments = associatedDocuments;
}
void addPositionsCount(long count) {
totalTermFreq += count;
}
public long getTotalTermFreq() {
return totalTermFreq;
}
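A hedged illustration of the accumulation contract these two methods implement (package-level access assumed, as in InstantiatedIndexWriter, which calls addPositionsCount(positions.length) once per document):

// Hypothetical illustration; each document contributes its within-doc freq:
InstantiatedTerm t = new InstantiatedTerm("body", "lucene");
t.addPositionsCount(3); // doc 0: three occurrences
t.addPositionsCount(1); // doc 1: one occurrence
assert t.getTotalTermFreq() == 4; // equals TermsEnum.totalTermFreq() for this term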
/**
* Finds the index of the first element beyond the current whose document number is
* greater than or equal to <i>target</i>; returns -1 if there is no such element.

View File

@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import java.io.IOException;
import java.util.Arrays;
@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum {
return terms[upto].getAssociatedDocuments().length;
}
@Override
public long totalTermFreq() {
final long v = terms[upto].getTotalTermFreq();
return v == 0 ? -1 : v;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {

View File

@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase {
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer()));
for (int i = 0; i < 20; i++) {
Document document = new Document();
assembleDocument(document, i);
@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase {
}
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
final long totalTermFreq = aprioriTermEnum.totalTermFreq();
if (totalTermFreq != -1) {
assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
}
// compare termDocs seeking

View File

@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable {
/** Term for this field's fieldName, lazily computed on demand */
public transient Term template;
private final long sumTotalTermFreq;
private static final long serialVersionUID = 2882195016849084649L;
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
@ -617,6 +619,15 @@ public class MemoryIndex implements Serializable {
this.numTokens = numTokens;
this.numOverlapTokens = numOverlapTokens;
this.boost = boost;
long sum = 0;
for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
sum += ent.getValue().size();
}
sumTotalTermFreq = sum;
}
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
/**
@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable {
public long getUniqueTermCount() {
return info.sortedTerms.length;
}
@Override
public long getSumTotalTermFreq() {
return info.getSumTotalTermFreq();
}
};
}
}
@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable {
return 1;
}
@Override
public long totalTermFreq() {
return info.sortedTerms[termUpto].getValue().size();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {

View File

@ -176,15 +176,34 @@ public class HighFreqTerms {
return ts;
}
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
BytesRef br = termtext;
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {
long totalTF = 0;
Bits skipDocs = MultiFields.getDeletedDocs(reader);
DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
// if term is not in index return totalTF of 0
if (de == null) {
Terms terms = MultiFields.getTerms(reader, field);
if (terms == null) {
return 0;
}
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
return 0;
}
Bits skipDocs = MultiFields.getDeletedDocs(reader);
if (skipDocs == null) {
// TODO: we could do this up front, during the scan
// (next()), instead of after-the-fact here w/ seek,
// if the codec supports it and there are no del
// docs...
final long totTF = termsEnum.totalTermFreq();
if (totTF != -1) {
return totTF;
}
}
DocsEnum de = termsEnum.docs(skipDocs, null);
// use DocsEnum.read() and BulkResult api
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
int count;
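A hedged usage sketch of the rewritten helper (dir is a hypothetical Directory holding an index): when the reader has no deletions the codec-stored totalTermFreq is returned directly, otherwise the freqs are re-counted from the DocsEnum:

// Hypothetical usage:
IndexReader reader = IndexReader.open(dir, true);
long tf = HighFreqTerms.getTotalTermFreq(reader, "FIELD_1", new BytesRef("term"));
System.out.println("totalTermFreq=" + tf); // 0 if the field or term is absent
reader.close();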

View File

@ -41,4 +41,9 @@ public final class TermStats {
String getTermText() {
return termtext.utf8ToString();
}
@Override
public String toString() {
return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
}
}

View File

@ -17,15 +17,16 @@ package org.apache.lucene.misc;
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
writer = new IndexWriter(dir, newIndexWriterConfig(random,
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
.setMaxBufferedDocs(2));
writer.setInfoStream(VERBOSE ? System.out : null);
indexDocs(writer);
reader = IndexReader.open(dir, true);
_TestUtil.checkIndex(dir);
}
@AfterClass
@ -75,8 +78,8 @@ public class TestHighFreqTerms extends LuceneTestCase {
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
for (int i = 0; i < terms.length; i++) {
if (i >0){
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
if (i > 0) {
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
}
}
}
@ -134,11 +137,12 @@ public class TestHighFreqTerms extends LuceneTestCase {
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
for (int i = 0; i < termsWithTF.length; i++) {
// check that they are sorted by descending termfreq order
if (i >0){
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
}
for (int i = 0; i < termsWithTF.length; i++) {
// check that they are sorted by descending termfreq
// order
if (i > 0) {
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
}
}
}

View File

@ -124,6 +124,10 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
return fcsi.getTermsEnum();
}
@Override
public long getSumTotalTermFreq() {
return -1;
}
});
assert termsEnum != null;

View File

@ -610,6 +610,8 @@ public class CheckIndex {
Comparator<BytesRef> termComp = terms.getComparator();
long sumTotalTermFreq = 0;
while(true) {
final BytesRef term = terms.next();
@ -660,6 +662,8 @@ public class CheckIndex {
}
int lastDoc = -1;
int docCount = 0;
long totalTermFreq = 0;
while(true) {
final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@ -667,6 +671,8 @@ public class CheckIndex {
}
final int freq = docs2.freq();
status.totPos += freq;
totalTermFreq += freq;
docCount++;
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@ -698,21 +704,38 @@ public class CheckIndex {
}
}
// Now count how many deleted docs occurred in
// this term:
final long totalTermFreq2 = terms.totalTermFreq();
final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
// Re-count if there are deleted docs:
if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs);
int count = 0;
docCount = 0;
totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
count++;
docCount++;
totalTermFreq += docsNoDel.freq();
}
if (count != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
}
if (docCount != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
}
if (hasTotalTermFreq) {
sumTotalTermFreq += totalTermFreq;
if (totalTermFreq != totalTermFreq2) {
throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
}
}
}
if (sumTotalTermFreq != 0) {
final long v = fields.terms(field).getSumTotalTermFreq();
if (v != -1 && sumTotalTermFreq != v) {
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
}
}
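The invariant CheckIndex enforces here can also be sketched against the public API: when every term stores the stat, the sum of totalTermFreq over a field's TermsEnum must match getSumTotalTermFreq() (assuming no deleted docs, which neither measure accounts for; fields is a hypothetical Fields instance):

// Hypothetical external cross-check:
Terms t = fields.terms(field);
TermsEnum te = t.iterator();
long sum = 0;
boolean allStored = true;
while (te.next() != null) {
  final long ttf = te.totalTermFreq();
  if (ttf == -1) { allStored = false; break; }
  sum += ttf;
}
final long stored = t.getSumTotalTermFreq();
if (allStored && stored != -1 && stored != sum) {
  throw new RuntimeException("sumTotalTermFreq " + stored + " != recomputed " + sum);
}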
// Test seek to last term:
if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {

View File

@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader {
public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount();
}
@Override
public long getSumTotalTermFreq() throws IOException {
return in.getSumTotalTermFreq();
}
}
/** Base class for filtering {@link TermsEnum} implementations. */
@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader {
return in.docFreq();
}
@Override
public long totalTermFreq() {
return in.totalTermFreq();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse);

View File

@ -20,13 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Comparator;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// multiple threads and interacting with the
// TermsConsumer, only calling out to us (passing us the
// DocsConsumer) to handle delivery of docs/positions
long sumTotalTermFreq = 0;
while(numFields > 0) {
// Get the next term to merge
@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
long totTF = 0;
while(numToMerge > 0) {
FreqProxFieldMergeState minState = termStates[0];
@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// omitTermFreqAndPositions == false so we do write positions &
// payload
int position = 0;
totTF += termDocFreq;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
}
assert numDocs > 0;
termsConsumer.finishTerm(text, numDocs);
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
sumTotalTermFreq += totTF;
}
termsConsumer.finish();
termsConsumer.finish(sumTotalTermFreq);
}
}
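Putting the new write-side contract together, a minimal single-term sketch (consumer and fieldInfo are hypothetical; the signatures are the ones used in this commit):

// Hypothetical write sequence under the TermStats contract:
TermsConsumer termsConsumer = consumer.addField(fieldInfo);
PostingsConsumer postings = termsConsumer.startTerm(new BytesRef("lucene"));
postings.startDoc(0, 2);       // docID 0, freq 2
postings.addPosition(1, null); // two positions, no payloads
postings.addPosition(5, null);
postings.finishDoc();
termsConsumer.finishTerm(new BytesRef("lucene"), new TermStats(1, 2)); // df=1, totTF=2
termsConsumer.finish(2);       // sumTotalTermFreq across all terms in the field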

View File

@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
return terms.docFreq(term);
}
/** Returns the total number of occurrences of
* <code>term</code> in <code>field</code> across all
* documents (the sum of the freq() for each document
* containing the term). This method returns 0 if the
* term or field does not exist. It does not take into
* account deleted documents that have not yet been
* merged away. */
public long totalTermFreq(String field, BytesRef term) throws IOException {
final Fields fields = fields();
if (fields == null) {
return 0;
}
final Terms terms = fields.terms(field);
if (terms == null) {
return 0;
}
return terms.totalTermFreq(term);
}
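A hedged usage example of this convenience method (dir hypothetical):

IndexReader r = IndexReader.open(dir, true);
long occurrences = r.totalTermFreq("body", new BytesRef("lucene"));
// 0 if the field or term is absent; deleted-but-unmerged docs still count
r.close();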
/** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException {
final Fields fields = fields();

View File

@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
}
}
@Override
public long getSumTotalTermFreq() throws IOException {
long sum = 0;
for(Terms terms : subs) {
final long v = terms.getSumTotalTermFreq();
if (v == -1) {
return -1;
}
sum += v;
}
return sum;
}
@Override
public Comparator<BytesRef> getComparator() {
return termComp;

View File

@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum {
return sum;
}
@Override
public long totalTermFreq() {
long sum = 0;
for(int i=0;i<numTop;i++) {
final long v = top[i].terms.totalTermFreq();
if (v == -1) {
return v;
}
sum += v;
}
return sum;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final MultiDocsEnum docsEnum;

View File

@ -57,6 +57,18 @@ public abstract class Terms {
}
}
/** Returns the total number of occurrences of the
* specified term text across all documents (the sum of
* the freq() for each document containing it). Returns
* 0 if the term does not exist. */
public long totalTermFreq(BytesRef text) throws IOException {
final TermsEnum termsEnum = getThreadTermsEnum();
if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
return termsEnum.totalTermFreq();
} else {
return 0;
}
}
/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@ -115,6 +127,14 @@ public abstract class Terms {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
/** Returns the sum of {@link TermsEnum#totalTermFreq} for
* all terms in this field, or -1 if this measure isn't
* stored by the codec (or if this field omits term freq
* and positions). Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract long getSumTotalTermFreq() throws IOException;
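One natural consumer of this measure is average field length for length-normalized scoring; a hedged sketch (reader and field name are hypothetical, maxDoc() serving as a rough per-field document count):

Terms terms = reader.terms("body");
final long sum = terms == null ? -1 : terms.getSumTotalTermFreq();
if (sum != -1) {
  double avgFieldLength = (double) sum / reader.maxDoc();
  System.out.println("avg field length=" + avgFieldLength);
} // else: the codec doesn't store it, or the field omits term freq and positions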
/**
* Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using

View File

@ -126,6 +126,14 @@ public abstract class TermsEnum {
* {@link SeekStatus#END}.*/
public abstract int docFreq();
/** Returns the total number of occurrences of this term
* across all documents (the sum of the freq() for each
* doc that has this term). This will be -1 if the
* codec doesn't support this measure. Note that, like
* other term measures, this measure does not take
* deleted documents into account. */
public abstract long totalTermFreq();
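Callers should hedge against -1; a sketch of a fallback that re-counts from the postings when the codec lacks the stat (a positioned TermsEnum is assumed):

long ttf = termsEnum.totalTermFreq();
if (ttf == -1) {
  // Recompute by summing freq() over all docs (ignores deletions, like the stat):
  DocsEnum de = termsEnum.docs(null, null);
  ttf = 0;
  while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ttf += de.freq();
  }
}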
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not
@ -198,6 +206,11 @@ public abstract class TermsEnum {
throw new IllegalStateException("this method should never be called");
}
@Override
public long totalTermFreq() {
throw new IllegalStateException("this method should never be called");
}
@Override
public long ord() {
throw new IllegalStateException("this method should never be called");

View File

@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {

View File

@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
/** Default merge impl: append documents, mapping around
* deletes */
public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0;
long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
df++;
}
}
return df;
return new TermStats(df, totTF);
}
}

View File

@ -34,7 +34,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
public abstract void startTerm() throws IOException;
/** Finishes the current term */
public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
public abstract void setField(FieldInfo fieldInfo);

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.TermState;
public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
public long totalTermFreq; // total number of occurrences of this term
@Override
public void copyFrom(TermState _other) {
@ -35,11 +36,12 @@ public class PrefixCodedTermState extends OrdTermState {
super.copyFrom(_other);
filePointer = other.filePointer;
docFreq = other.docFreq;
totalTermFreq = other.totalTermFreq;
}
@Override
public String toString() {
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
}
}

View File

@ -129,18 +129,17 @@ public class PrefixCodedTermsReader extends FieldsProducer {
// Read per-field details
seekDir(in, dirOffset);
final int numFields = in.readInt();
final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) {
final int field = in.readInt();
final long numTerms = in.readLong();
final int field = in.readVInt();
final long numTerms = in.readVLong();
assert numTerms >= 0;
final long termsStartPointer = in.readLong();
final long termsStartPointer = in.readVLong();
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (numTerms > 0) {
assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
}
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
assert !fields.containsKey(fieldInfo.name);
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
}
success = true;
} finally {
@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
final long sumTotalTermFreq;
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
this.sumTotalTermFreq = sumTotalTermFreq;
}
@Override
@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return numTerms;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
// Iterates through terms in this field, not supporting ord()
private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
state.totalTermFreq = -1;
state.ord = -1;
}
@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
}
if (!fieldInfo.omitTermFreqAndPositions) {
state.totalTermFreq = state.docFreq + in.readVLong();
}
postingsReader.readTerm(in,
fieldInfo, state,
isIndexTerm);
@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return state.docFreq;
}
@Override
public long totalTermFreq() {
return state.totalTermFreq;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);

View File

@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;
public PrefixCodedTermsWriter(
@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
fields.add(terms);
return terms;
}
@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
public void close() throws IOException {
try {
final int fieldCount = fields.size();
int nonZeroCount = 0;
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
nonZeroCount++;
}
}
final long dirStart = out.getFilePointer();
out.writeInt(fieldCount);
for(int i=0;i<fieldCount;i++) {
TermsWriter field = (TermsWriter) fields.get(i);
out.writeInt(field.fieldInfo.number);
out.writeLong(field.numTerms);
out.writeLong(field.termsStartPointer);
out.writeVInt(nonZeroCount);
for(TermsWriter field : fields) {
if (field.numTerms > 0) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
out.writeVLong(field.termsStartPointer);
if (!field.fieldInfo.omitTermFreqAndPositions) {
out.writeVLong(field.sumTotalTermFreq);
}
}
}
writeTrailer(dirStart);
} finally {
@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq;
TermsWriter(
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
}
@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert numDocs > 0;
assert stats.docFreq > 0;
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
termWriter.write(text);
final int highBit = isIndexTerm ? 0x80 : 0;
@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
// This is a vInt, except, we steal top bit to record
// whether this was an indexed term:
if ((numDocs & ~0x3F) == 0) {
if ((stats.docFreq & ~0x3F) == 0) {
// Fast case -- docFreq fits in 6 bits
out.writeByte((byte) (highBit | numDocs));
out.writeByte((byte) (highBit | stats.docFreq));
} else {
// Write bottom 6 bits of docFreq, then write the
// remainder as vInt:
out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
out.writeVInt(numDocs >>> 6);
out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
out.writeVInt(stats.docFreq >>> 6);
}
postingsWriter.finishTerm(numDocs, isIndexTerm);
if (!fieldInfo.omitTermFreqAndPositions) {
assert stats.totalTermFreq >= stats.docFreq;
out.writeVLong(stats.totalTermFreq - stats.docFreq);
}
postingsWriter.finishTerm(stats, isIndexTerm);
numTerms++;
}
// Finishes all terms in this field
@Override
public void finish() throws IOException {
public void finish(long sumTotalTermFreq) throws IOException {
// EOF marker:
this.sumTotalTermFreq = sumTotalTermFreq;
out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
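The on-disk encoding above exploits that totalTermFreq >= docFreq whenever freqs are indexed, so only the non-negative delta is written as a vLong; a hypothetical round-trip (out/in stand for the terms-dict IndexOutput/IndexInput):

// Write side, as in finishTerm() above:
int docFreq = 42;
long totalTermFreq = 100;
out.writeVLong(totalTermFreq - docFreq); // delta 58, a small vLong for most terms
// Read side, as in PrefixCodedTermsReader:
long restored = docFreq + in.readVLong(); // == 100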

View File

@ -0,0 +1,28 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TermStats {
public final int docFreq;
public final long totalTermFreq;
public TermStats(int docFreq, long totalTermFreq) {
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
}

View File

@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; stats.docFreq must be > 0. */
public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */
public abstract void finish() throws IOException;
public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@ -55,6 +55,7 @@ public abstract class TermsConsumer {
BytesRef term;
assert termsEnum != null;
long sumTotalTermFreq = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@ -69,9 +70,9 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
if (numDocs > 0) {
finishTerm(term, numDocs);
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
if (stats.docFreq > 0) {
finishTerm(term, stats);
}
}
}
@ -94,14 +95,15 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
if (numDocs > 0) {
finishTerm(term, numDocs);
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
if (stats.docFreq > 0) {
finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq;
}
}
}
}
finish();
finish(sumTotalTermFreq);
}
}

View File

@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase {
public abstract void setTermsOutput(IndexOutput out);
public abstract class FieldWriter {
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
public abstract void finish() throws IOException;
}

View File

@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
public abstract boolean isIndexTerm(BytesRef term, int docFreq);
public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
count = 0;
return true;
@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
if (docFreq >= docFreqThresh || count >= interval) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (stats.docFreq >= docFreqThresh || count >= interval) {
count = 0;
return true;
} else {
@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
if (policy.isIndexTerm(text, docFreq) || first) {
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
if (policy.isIndexTerm(text, stats) || first) {
first = false;
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
final int lengthSave = text.length;

View File

@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
@Override
public long getSumTotalTermFreq() {
return -1;
}
}
private class PreTermsEnum extends TermsEnum {
@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer {
return termEnum.docFreq();
}
@Override
public long totalTermFreq() {
return -1;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
@ -177,7 +178,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
//System.out.println("PW finishTerm docCount=" + docCount);
assert pendingCount > 0 || pendingCount == -1;
@ -186,7 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
if (pendingCount == -1) {
termsOut.writeByte((byte) 0);
wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
pendingIsIndexTerm = false;
} else {

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
// TODO: -- wasteful we are counting this in two places?
assert docCount > 0;
assert docCount == df;
assert stats.docFreq > 0;
assert stats.docFreq == df;
docIndex.write(termsOut, isIndexTerm);

View File

@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in;
private final boolean omitTF;
private int docFreq;
private long totalTermFreq;
private long docsStart;
private boolean ended;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF;
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
}
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
//System.out.println("seek to text=" + text.utf8ToString());
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
//System.out.println(" got text=" + term.utf8ToString());
PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public BytesRef next() throws IOException {
assert !ended;
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) {
final PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
docsStart = pair1.output1;
docFreq = pair2.output1.intValue();
totalTermFreq = pair2.output2;
return result.input;
} else {
return null;
@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docFreq;
}
@Override
public long totalTermFreq() {
return totalTermFreq;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum;
@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
private class SimpleTextTerms extends Terms {
private final long termsStart;
private final boolean omitTF;
private FST<PairOutputs.Pair<Long,Long>> fst;
private long sumTotalTermFreq;
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
private int termCount;
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
0,
0,
true,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
sumTotalTermFreq += totalTermFreq;
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
} else if (scratch.startsWith(POS)) {
totalTermFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
new PairOutputs.Pair<Long,Long>((long) docFreq,
posIntOutputs.get(totalTermFreq))));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
sumTotalTermFreq += totalTermFreq;
totalTermFreq = 0;
termCount++;
}
}
fst = b.finish();
@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public long getUniqueTermCount() {
return (long) termCount;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
}
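The FST output type here nests PairOutputs so each term carries three values; a hedged round-trip sketch using the same constructors as loadTerms():

PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
long docsStart = 12345L; // file pointer to this term's postings text
int docFreq = 7;
long totalTermFreq = 19;
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> termOutput =
    new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(docsStart,
        new PairOutputs.Pair<Long,Long>((long) docFreq,
                                        posIntOutputs.get(totalTermFreq)));
// Decoding, as in seek()/next():
assert termOutput.output1 == docsStart;
assert termOutput.output2.output1.intValue() == docFreq;
assert termOutput.output2.output2 == totalTermFreq;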
@Override

View File

@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
}
@Override
public void finishTerm(BytesRef term, int numDocs) throws IOException {
public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}
@Override
public void finish() throws IOException {
public void finish(long sumTotalTermFreq) throws IOException {
}
@Override

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
assert docCount > 0;
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert docCount == df;
assert stats.docFreq == df;
if (isIndexTerm) {
// Write absolute at seek points

View File

@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docFreq();
}
@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}
/** This enum does not support seeking!
* @throws UnsupportedOperationException
*/

View File

@ -245,6 +245,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq();
}
@Override
public long totalTermFreq() {
return actualEnum.totalTermFreq();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse);

View File

@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil;
@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
return -1;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();

View File

@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMField extends Terms {
final String field;
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
long sumTotalTermFreq;
RAMField(String field) {
this.field = field;
}
@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase {
return termToDocs.size();
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override
public TermsEnum iterator() {
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase {
static class RAMTerm {
final String term;
long totalTermFreq;
final List<RAMDoc> docs = new ArrayList<RAMDoc>();
public RAMTerm(String term) {
this.term = term;
@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase {
}
@Override
public void finishTerm(BytesRef text, int numDocs) {
assert numDocs > 0;
assert numDocs == current.docs.size();
public void finishTerm(BytesRef text, TermStats stats) {
assert stats.docFreq > 0;
assert stats.docFreq == current.docs.size();
current.totalTermFreq = stats.totalTermFreq;
field.termToDocs.put(current.term, current);
}
@Override
public void finish() {
public void finish(long sumTotalTermFreq) {
field.sumTotalTermFreq = sumTotalTermFreq;
}
}
@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase {
}
@Override
public long totalTermFreq() {
return ramField.termToDocs.get(current).totalTermFreq;
}
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
}

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.search.DocIdSetIterator;
@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase {
public void write(final FieldsConsumer consumer) throws Throwable {
Arrays.sort(terms);
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
for (final TermData term : terms)
term.write(termsConsumer);
termsConsumer.finish();
long sumTotalTermCount = 0;
for (final TermData term : terms) {
sumTotalTermCount += term.write(termsConsumer);
}
termsConsumer.finish(sumTotalTermCount);
}
}
@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase {
return text.compareTo(((TermData) o).text);
}
public void write(final TermsConsumer termsConsumer) throws Throwable {
public long write(final TermsConsumer termsConsumer) throws Throwable {
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
long totTF = 0;
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
@ -142,6 +146,7 @@ public class TestCodecs extends LuceneTestCase {
}
postingsConsumer.startDoc(docs[i], termDocFreq);
if (!field.omitTF) {
totTF += positions[i].length;
for(int j=0;j<positions[i].length;j++) {
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload);
@ -149,7 +154,8 @@ public class TestCodecs extends LuceneTestCase {
postingsConsumer.finishDoc();
}
}
termsConsumer.finishTerm(text, docs.length);
termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
return totTF;
}
}

View File

@ -1865,4 +1865,22 @@ public class TestIndexReader extends LuceneTestCase
assertTrue(IndexReader.indexExists(dir));
dir.close();
}
// Make sure totalTermFreq works correctly in the terms
// dict cache
public void testTotalTermFreqCached() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
Document d = new Document();
d.add(newField("f", "a a b", Field.Index.ANALYZED));
writer.addDocument(d);
IndexReader r = writer.getReader();
writer.close();
Terms terms = MultiFields.getTerms(r, "f");
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
assertEquals(2, terms.totalTermFreq(new BytesRef("a")));
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
r.close();
dir.close();
}
}

View File

@ -39,6 +39,7 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
@ -66,7 +67,7 @@ public class MockRandomCodec extends Codec {
public MockRandomCodec(Random random) {
name = "MockRandom";
this.seedRandom = random;
this.seedRandom = new Random(random.nextLong());
}
@Override
@ -148,7 +149,7 @@ public class MockRandomCodec extends Codec {
final Random rand = new Random(seed2);
@Override
public boolean isIndexTerm(BytesRef term, int docFreq) {
public boolean isIndexTerm(BytesRef term, TermStats stats) {
return rand.nextInt(gap) == 17;
}
};

View File

@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException;
@ -184,10 +185,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}
@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
if (numDocs > 0) {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
if (stats.docFreq > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs;
termInfo.docFreq = stats.docFreq;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number,
@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}
@Override
public void finish() throws IOException {
public void finish(long sumTotalTermCount) throws IOException {
}
@Override

View File

@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq();
}
@Override
public long totalTermFreq() {
return tenum.totalTermFreq();
}
public BytesRef skipTo(BytesRef target) throws IOException {