mirror of https://github.com/apache/lucene.git
LUCENE-2862: add TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq()
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059344 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f5d5dda6c6
commit
a0c82b0f41
|
@ -359,6 +359,9 @@ New features
|
||||||
terms dict. This impl stores the indexed terms in an FST, which is
|
terms dict. This impl stores the indexed terms in an FST, which is
|
||||||
much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
|
much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-2862: Added TermsEnum.totalTermFreq() and
|
||||||
|
Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
|
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
|
||||||
|
|
|
@ -238,6 +238,10 @@ public class InstantiatedIndex
|
||||||
while((text = termsEnum.next()) != null) {
|
while((text = termsEnum.next()) != null) {
|
||||||
String termText = text.utf8ToString();
|
String termText = text.utf8ToString();
|
||||||
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
|
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
|
||||||
|
final long totalTermFreq = termsEnum.totalTermFreq();
|
||||||
|
if (totalTermFreq != -1) {
|
||||||
|
instantiatedTerm.addPositionsCount(totalTermFreq);
|
||||||
|
}
|
||||||
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
|
getTermsByFieldAndText().get(field).put(termText, instantiatedTerm);
|
||||||
instantiatedTerm.setTermIndex(terms.size());
|
instantiatedTerm.setTermIndex(terms.size());
|
||||||
terms.add(instantiatedTerm);
|
terms.add(instantiatedTerm);
|
||||||
|
|
|
@ -398,18 +398,33 @@ public class InstantiatedIndexReader extends IndexReader {
|
||||||
if (i < 0) {
|
if (i < 0) {
|
||||||
i = -i - 1;
|
i = -i - 1;
|
||||||
}
|
}
|
||||||
if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
|
if (i >= orderedTerms.length || orderedTerms[i].field() != field) {
|
||||||
// field does not exist
|
// field does not exist
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final int startLoc = i;
|
final int startLoc = i;
|
||||||
|
|
||||||
|
// TODO: heavy to do this here; would be better to
|
||||||
|
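// do it up front & cache
// NOTE(review): the loop below advances upto but tests and sums orderedTerms[i]; it likely should index orderedTerms[upto] -- confirm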
|
||||||
|
long sum = 0;
|
||||||
|
int upto = i;
|
||||||
|
while(upto < orderedTerms.length && orderedTerms[i].field() == field) {
|
||||||
|
sum += orderedTerms[i].getTotalTermFreq();
|
||||||
|
upto++;
|
||||||
|
}
|
||||||
|
final long sumTotalTermFreq = sum;
|
||||||
|
|
||||||
return new Terms() {
|
return new Terms() {
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() {
|
public TermsEnum iterator() {
|
||||||
return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
|
return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return sumTotalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
|
|
|
@ -315,6 +315,7 @@ public class InstantiatedIndexWriter implements Closeable {
|
||||||
}
|
}
|
||||||
associatedDocuments[associatedDocuments.length - 1] = info;
|
associatedDocuments[associatedDocuments.length - 1] = info;
|
||||||
term.setAssociatedDocuments(associatedDocuments);
|
term.setAssociatedDocuments(associatedDocuments);
|
||||||
|
term.addPositionsCount(positions.length);
|
||||||
|
|
||||||
// todo optimize, only if term vector?
|
// todo optimize, only if term vector?
|
||||||
informationByTermOfCurrentDocument.put(term, info);
|
informationByTermOfCurrentDocument.put(term, info);
|
||||||
|
|
|
@ -45,6 +45,8 @@ public class InstantiatedTerm
|
||||||
|
|
||||||
private Term term;
|
private Term term;
|
||||||
|
|
||||||
|
private long totalTermFreq;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* index of term in InstantiatedIndex
|
* index of term in InstantiatedIndex
|
||||||
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
|
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
|
||||||
|
@ -92,6 +94,14 @@ public class InstantiatedTerm
|
||||||
this.associatedDocuments = associatedDocuments;
|
this.associatedDocuments = associatedDocuments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Adds <code>count</code> to this term's running total term frequency. */
void addPositionsCount(long count) {
|
||||||
|
totalTermFreq += count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the total term frequency accumulated so far for this term
 * via {@link #addPositionsCount}. */
public long getTotalTermFreq() {
|
||||||
|
return totalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds index to the first beyond the current whose document number is
|
* Finds index to the first beyond the current whose document number is
|
||||||
* greater than or equal to <i>target</i>, -1 if there is no such element.
|
* greater than or equal to <i>target</i>, -1 if there is no such element.
|
||||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.index.DocsEnum;
|
import org.apache.lucene.index.DocsEnum;
|
||||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
@ -110,6 +109,12 @@ public class InstantiatedTermsEnum extends TermsEnum {
|
||||||
return terms[upto].getAssociatedDocuments().length;
|
return terms[upto].getAssociatedDocuments().length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the total term frequency recorded for the term at <code>upto</code>,
 * reporting a recorded value of 0 as -1. */
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
final long v = terms[upto].getTotalTermFreq();
|
||||||
|
// NOTE(review): presumably 0 means the count was never recorded for this term -- confirm
return v == 0 ? -1 : v;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||||
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {
|
if (reuse == null || !(reuse instanceof InstantiatedDocsEnum)) {
|
||||||
|
|
|
@ -66,6 +66,7 @@ public class TestIndicesEquals extends LuceneTestCase {
|
||||||
// create dir data
|
// create dir data
|
||||||
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig(
|
||||||
TEST_VERSION_CURRENT, new MockAnalyzer()));
|
TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||||
|
|
||||||
for (int i = 0; i < 20; i++) {
|
for (int i = 0; i < 20; i++) {
|
||||||
Document document = new Document();
|
Document document = new Document();
|
||||||
assembleDocument(document, i);
|
assembleDocument(document, i);
|
||||||
|
@ -395,6 +396,10 @@ public class TestIndicesEquals extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
|
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
|
||||||
|
final long totalTermFreq = aprioriTermEnum.totalTermFreq();
|
||||||
|
if (totalTermFreq != -1) {
|
||||||
|
assertEquals(totalTermFreq, testTermEnum.totalTermFreq());
|
||||||
|
}
|
||||||
|
|
||||||
// compare termDocs seeking
|
// compare termDocs seeking
|
||||||
|
|
||||||
|
|
|
@ -610,6 +610,8 @@ public class MemoryIndex implements Serializable {
|
||||||
/** Term for this field's fieldName, lazily computed on demand */
|
/** Term for this field's fieldName, lazily computed on demand */
|
||||||
public transient Term template;
|
public transient Term template;
|
||||||
|
|
||||||
|
private final long sumTotalTermFreq;
|
||||||
|
|
||||||
private static final long serialVersionUID = 2882195016849084649L;
|
private static final long serialVersionUID = 2882195016849084649L;
|
||||||
|
|
||||||
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
|
public Info(HashMap<BytesRef,ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost) {
|
||||||
|
@ -617,6 +619,15 @@ public class MemoryIndex implements Serializable {
|
||||||
this.numTokens = numTokens;
|
this.numTokens = numTokens;
|
||||||
this.numOverlapTokens = numOverlapTokens;
|
this.numOverlapTokens = numOverlapTokens;
|
||||||
this.boost = boost;
|
this.boost = boost;
|
||||||
|
long sum = 0;
|
||||||
|
for(Map.Entry<BytesRef,ArrayIntList> ent : terms.entrySet()) {
|
||||||
|
sum += ent.getValue().size();
|
||||||
|
}
|
||||||
|
sumTotalTermFreq = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the value of the <code>sumTotalTermFreq</code> field. */
public long getSumTotalTermFreq() {
|
||||||
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -826,6 +837,11 @@ public class MemoryIndex implements Serializable {
|
||||||
public long getUniqueTermCount() {
|
public long getUniqueTermCount() {
|
||||||
return info.sortedTerms.length;
|
return info.sortedTerms.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Delegates to <code>info.getSumTotalTermFreq()</code>. */
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return info.getSumTotalTermFreq();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -895,6 +911,11 @@ public class MemoryIndex implements Serializable {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the size of the list stored for the term at <code>termUpto</code>
 * in <code>info.sortedTerms</code>. */
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
// NOTE(review): assumes the list holds exactly one entry per occurrence -- confirm against how it is populated
return info.sortedTerms[termUpto].getValue().size();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||||
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {
|
if (reuse == null || !(reuse instanceof MemoryDocsEnum)) {
|
||||||
|
|
|
@ -176,15 +176,34 @@ public class HighFreqTerms {
|
||||||
return ts;
|
return ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
|
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termText) throws Exception {
|
||||||
BytesRef br = termtext;
|
|
||||||
long totalTF = 0;
|
long totalTF = 0;
|
||||||
Bits skipDocs = MultiFields.getDeletedDocs(reader);
|
|
||||||
DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
|
Terms terms = MultiFields.getTerms(reader, field);
|
||||||
// if term is not in index return totalTF of 0
|
if (terms == null) {
|
||||||
if (de == null) {
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TermsEnum termsEnum = terms.iterator();
|
||||||
|
if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Bits skipDocs = MultiFields.getDeletedDocs(reader);
|
||||||
|
if (skipDocs == null) {
|
||||||
|
// TODO: we could do this up front, during the scan
|
||||||
|
// (next()), instead of after-the-fact here w/ seek,
|
||||||
|
// if the codec supports it and there are no del
|
||||||
|
// docs...
|
||||||
|
final long totTF = termsEnum.totalTermFreq();
|
||||||
|
if (totTF != -1) {
|
||||||
|
return totTF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DocsEnum de = termsEnum.docs(skipDocs, null);
|
||||||
|
|
||||||
// use DocsEnum.read() and BulkResult api
|
// use DocsEnum.read() and BulkResult api
|
||||||
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
|
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
|
||||||
int count;
|
int count;
|
||||||
|
|
|
@ -41,4 +41,9 @@ public final class TermStats {
|
||||||
String getTermText() {
|
String getTermText() {
|
||||||
return termtext.utf8ToString();
|
return termtext.utf8ToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns a string listing the term text (decoded as UTF-8), docFreq and totalTermFreq. */
@Override
|
||||||
|
public String toString() {
|
||||||
|
return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -17,15 +17,16 @@ package org.apache.lucene.misc;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
import org.apache.lucene.store.Directory;
|
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
@ -41,8 +42,10 @@ public class TestHighFreqTerms extends LuceneTestCase {
|
||||||
writer = new IndexWriter(dir, newIndexWriterConfig(random,
|
writer = new IndexWriter(dir, newIndexWriterConfig(random,
|
||||||
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
|
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))
|
||||||
.setMaxBufferedDocs(2));
|
.setMaxBufferedDocs(2));
|
||||||
|
writer.setInfoStream(VERBOSE ? System.out : null);
|
||||||
indexDocs(writer);
|
indexDocs(writer);
|
||||||
reader = IndexReader.open(dir, true);
|
reader = IndexReader.open(dir, true);
|
||||||
|
_TestUtil.checkIndex(dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterClass
|
@AfterClass
|
||||||
|
@ -75,8 +78,8 @@ public class TestHighFreqTerms extends LuceneTestCase {
|
||||||
String field="FIELD_1";
|
String field="FIELD_1";
|
||||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||||
for (int i = 0; i < terms.length; i++) {
|
for (int i = 0; i < terms.length; i++) {
|
||||||
if (i >0){
|
if (i > 0) {
|
||||||
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
|
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -134,11 +137,12 @@ public class TestHighFreqTerms extends LuceneTestCase {
|
||||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||||
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||||
|
|
||||||
for (int i = 0; i < termsWithTF.length; i++) {
|
for (int i = 0; i < termsWithTF.length; i++) {
|
||||||
// check that they are sorted by descending termfreq order
|
// check that they are sorted by descending termfreq
|
||||||
if (i >0){
|
// order
|
||||||
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
|
if (i > 0) {
|
||||||
}
|
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -124,6 +124,10 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
|
||||||
return fcsi.getTermsEnum();
|
return fcsi.getTermsEnum();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Always returns -1. */
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
assert termsEnum != null;
|
assert termsEnum != null;
|
||||||
|
|
|
@ -610,6 +610,8 @@ public class CheckIndex {
|
||||||
|
|
||||||
Comparator<BytesRef> termComp = terms.getComparator();
|
Comparator<BytesRef> termComp = terms.getComparator();
|
||||||
|
|
||||||
|
long sumTotalTermFreq = 0;
|
||||||
|
|
||||||
while(true) {
|
while(true) {
|
||||||
|
|
||||||
final BytesRef term = terms.next();
|
final BytesRef term = terms.next();
|
||||||
|
@ -660,6 +662,8 @@ public class CheckIndex {
|
||||||
}
|
}
|
||||||
|
|
||||||
int lastDoc = -1;
|
int lastDoc = -1;
|
||||||
|
int docCount = 0;
|
||||||
|
long totalTermFreq = 0;
|
||||||
while(true) {
|
while(true) {
|
||||||
final int doc = docs2.nextDoc();
|
final int doc = docs2.nextDoc();
|
||||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
@ -667,6 +671,8 @@ public class CheckIndex {
|
||||||
}
|
}
|
||||||
final int freq = docs2.freq();
|
final int freq = docs2.freq();
|
||||||
status.totPos += freq;
|
status.totPos += freq;
|
||||||
|
totalTermFreq += freq;
|
||||||
|
docCount++;
|
||||||
|
|
||||||
if (doc <= lastDoc) {
|
if (doc <= lastDoc) {
|
||||||
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
|
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
|
||||||
|
@ -698,21 +704,38 @@ public class CheckIndex {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now count how many deleted docs occurred in
|
final long totalTermFreq2 = terms.totalTermFreq();
|
||||||
// this term:
|
final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
|
||||||
|
|
||||||
|
// Re-count if there are deleted docs:
|
||||||
if (reader.hasDeletions()) {
|
if (reader.hasDeletions()) {
|
||||||
final DocsEnum docsNoDel = terms.docs(null, docs);
|
final DocsEnum docsNoDel = terms.docs(null, docs);
|
||||||
int count = 0;
|
docCount = 0;
|
||||||
|
totalTermFreq = 0;
|
||||||
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
count++;
|
docCount++;
|
||||||
|
totalTermFreq += docsNoDel.freq();
|
||||||
}
|
}
|
||||||
if (count != docFreq) {
|
}
|
||||||
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
|
|
||||||
|
if (docCount != docFreq) {
|
||||||
|
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
|
||||||
|
}
|
||||||
|
if (hasTotalTermFreq) {
|
||||||
|
sumTotalTermFreq += totalTermFreq;
|
||||||
|
if (totalTermFreq != totalTermFreq2) {
|
||||||
|
throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sumTotalTermFreq != 0) {
|
||||||
|
final long v = fields.terms(field).getSumTotalTermFreq();
|
||||||
|
if (v != -1 && sumTotalTermFreq != v) {
|
||||||
|
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Test seek to last term:
|
// Test seek to last term:
|
||||||
if (lastTerm != null) {
|
if (lastTerm != null) {
|
||||||
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
|
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
|
||||||
|
|
|
@ -99,6 +99,11 @@ public class FilterIndexReader extends IndexReader {
|
||||||
public long getUniqueTermCount() throws IOException {
|
public long getUniqueTermCount() throws IOException {
|
||||||
return in.getUniqueTermCount();
|
return in.getUniqueTermCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Delegates to the wrapped instance <code>in</code>. */
@Override
|
||||||
|
public long getSumTotalTermFreq() throws IOException {
|
||||||
|
return in.getSumTotalTermFreq();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Base class for filtering {@link TermsEnum} implementations. */
|
/** Base class for filtering {@link TermsEnum} implementations. */
|
||||||
|
@ -155,6 +160,11 @@ public class FilterIndexReader extends IndexReader {
|
||||||
return in.docFreq();
|
return in.docFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Delegates to the wrapped instance <code>in</code>. */
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return in.totalTermFreq();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
return in.docs(skipDocs, reuse);
|
return in.docs(skipDocs, reuse);
|
||||||
|
|
|
@ -20,13 +20,14 @@ package org.apache.lucene.index;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
import org.apache.lucene.index.codecs.PostingsConsumer;
|
|
||||||
import org.apache.lucene.index.codecs.FieldsConsumer;
|
import org.apache.lucene.index.codecs.FieldsConsumer;
|
||||||
|
import org.apache.lucene.index.codecs.PostingsConsumer;
|
||||||
import org.apache.lucene.index.codecs.TermsConsumer;
|
import org.apache.lucene.index.codecs.TermsConsumer;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CollectionUtil;
|
import org.apache.lucene.util.CollectionUtil;
|
||||||
|
|
||||||
|
@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
|
||||||
// multiple threads and interacting with the
|
// multiple threads and interacting with the
|
||||||
// TermsConsumer, only calling out to us (passing us the
|
// TermsConsumer, only calling out to us (passing us the
|
||||||
// DocsConsumer) to handle delivery of docs/positions
|
// DocsConsumer) to handle delivery of docs/positions
|
||||||
|
long sumTotalTermFreq = 0;
|
||||||
while(numFields > 0) {
|
while(numFields > 0) {
|
||||||
|
|
||||||
// Get the next term to merge
|
// Get the next term to merge
|
||||||
|
@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
|
||||||
// which all share the same term. Now we must
|
// which all share the same term. Now we must
|
||||||
// interleave the docID streams.
|
// interleave the docID streams.
|
||||||
int numDocs = 0;
|
int numDocs = 0;
|
||||||
|
long totTF = 0;
|
||||||
while(numToMerge > 0) {
|
while(numToMerge > 0) {
|
||||||
|
|
||||||
FreqProxFieldMergeState minState = termStates[0];
|
FreqProxFieldMergeState minState = termStates[0];
|
||||||
|
@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
|
||||||
// omitTermFreqAndPositions == false so we do write positions &
|
// omitTermFreqAndPositions == false so we do write positions &
|
||||||
// payload
|
// payload
|
||||||
int position = 0;
|
int position = 0;
|
||||||
|
totTF += termDocFreq;
|
||||||
for(int j=0;j<termDocFreq;j++) {
|
for(int j=0;j<termDocFreq;j++) {
|
||||||
final int code = prox.readVInt();
|
final int code = prox.readVInt();
|
||||||
position += code >> 1;
|
position += code >> 1;
|
||||||
|
@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
assert numDocs > 0;
|
assert numDocs > 0;
|
||||||
termsConsumer.finishTerm(text, numDocs);
|
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
|
||||||
|
sumTotalTermFreq += totTF;
|
||||||
}
|
}
|
||||||
|
|
||||||
termsConsumer.finish();
|
termsConsumer.finish(sumTotalTermFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -997,6 +997,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
||||||
return terms.docFreq(term);
|
return terms.docFreq(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the total number of occurrences of <code>term</code>
|
||||||
|
* in <code>field</code>, summed across all documents. Returns 0 if
|
||||||
|
* the field or term does not exist, and may return -1 (see
|
||||||
|
* {@link TermsEnum#totalTermFreq}). This method does not take into
|
||||||
|
* account deleted documents that have not yet been merged away. */
|
||||||
|
public long totalTermFreq(String field, BytesRef term) throws IOException {
|
||||||
|
final Fields fields = fields();
|
||||||
|
if (fields == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
final Terms terms = fields.terms(field);
|
||||||
|
if (terms == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return terms.totalTermFreq(term);
|
||||||
|
}
|
||||||
|
|
||||||
/** This may return null if the field does not exist.*/
|
/** This may return null if the field does not exist.*/
|
||||||
public Terms terms(String field) throws IOException {
|
public Terms terms(String field) throws IOException {
|
||||||
final Fields fields = fields();
|
final Fields fields = fields();
|
||||||
|
|
|
@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the sum of <code>getSumTotalTermFreq()</code> over all
 * <code>subs</code>, or -1 as soon as any sub returns -1. */
@Override
|
||||||
|
public long getSumTotalTermFreq() throws IOException {
|
||||||
|
long sum = 0;
|
||||||
|
for(Terms terms : subs) {
|
||||||
|
final long v = terms.getSumTotalTermFreq();
|
||||||
|
// a single sub reporting -1 makes the whole sum -1
if (v == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
sum += v;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
return termComp;
|
return termComp;
|
||||||
|
|
|
@ -265,6 +265,19 @@ public final class MultiTermsEnum extends TermsEnum {
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the sum of <code>totalTermFreq()</code> over the first
 * <code>numTop</code> entries of <code>top</code>, or -1 as soon as
 * any of them returns -1. */
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
long sum = 0;
|
||||||
|
for(int i=0;i<numTop;i++) {
|
||||||
|
final long v = top[i].terms.totalTermFreq();
|
||||||
|
// v is -1 here, so the whole result is -1
if (v == -1) {
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
sum += v;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
final MultiDocsEnum docsEnum;
|
final MultiDocsEnum docsEnum;
|
||||||
|
|
|
@ -57,6 +57,18 @@ public abstract class Terms {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the total number of occurrences of the
|
||||||
|
* specified term text across all documents, as reported by
 * {@link TermsEnum#totalTermFreq}. Returns 0 if the term does not
|
||||||
|
* exist. */
|
||||||
|
public long totalTermFreq(BytesRef text) throws IOException {
|
||||||
|
final TermsEnum termsEnum = getThreadTermsEnum();
|
||||||
|
if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
|
||||||
|
return termsEnum.totalTermFreq();
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Get {@link DocsEnum} for the specified term. This
|
/** Get {@link DocsEnum} for the specified term. This
|
||||||
* method may return null if the term does not exist. */
|
* method may return null if the term does not exist. */
|
||||||
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
|
||||||
|
@ -115,6 +127,14 @@ public abstract class Terms {
|
||||||
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
|
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the sum of {@link TermsEnum#totalTermFreq} for
|
||||||
|
* all terms in this field, or -1 if this measure isn't
|
||||||
|
* stored by the codec (or if this field omits term freq
|
||||||
|
* and positions). Note that, just like other term
|
||||||
|
* measures, this measure does not take deleted documents
|
||||||
|
* into account. */
|
||||||
|
public abstract long getSumTotalTermFreq() throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a thread-private {@link TermsEnum} instance. Obtaining
|
* Returns a thread-private {@link TermsEnum} instance. Obtaining
|
||||||
* {@link TermsEnum} from this method might be more efficient than using
|
* {@link TermsEnum} from this method might be more efficient than using
|
||||||
|
|
|
@ -126,6 +126,14 @@ public abstract class TermsEnum {
|
||||||
* {@link SeekStatus#END}.*/
|
* {@link SeekStatus#END}.*/
|
||||||
public abstract int docFreq();
|
public abstract int docFreq();
|
||||||
|
|
||||||
|
/** Returns the total number of occurrences of this term
|
||||||
|
* across all documents (the sum of the freq() for each
|
||||||
|
* doc that has this term). This will be -1 if the
|
||||||
|
* codec doesn't support this measure. Note that, like
|
||||||
|
* other term measures, this measure does not take
|
||||||
|
* deleted documents into account. */
|
||||||
|
public abstract long totalTermFreq();
|
||||||
|
|
||||||
/** Get {@link DocsEnum} for the current term. Do not
|
/** Get {@link DocsEnum} for the current term. Do not
|
||||||
* call this before calling {@link #next} or {@link
|
* call this before calling {@link #next} or {@link
|
||||||
* #seek} for the first time. This method will not
|
* #seek} for the first time. This method will not
|
||||||
|
@ -198,6 +206,11 @@ public abstract class TermsEnum {
|
||||||
throw new IllegalStateException("this method should never be called");
|
throw new IllegalStateException("this method should never be called");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Always throws {@link IllegalStateException}. */
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
throw new IllegalStateException("this method should never be called");
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long ord() {
|
public long ord() {
|
||||||
throw new IllegalStateException("this method should never be called");
|
throw new IllegalStateException("this method should never be called");
|
||||||
|
|
|
@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
|
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
|
||||||
// First term is first indexed term:
|
// First term is first indexed term:
|
||||||
if (0 == (numTerms++ % termIndexInterval)) {
|
if (0 == (numTerms++ % termIndexInterval)) {
|
||||||
|
|
||||||
|
|
|
@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
|
||||||
|
|
||||||
/** Default merge impl: append documents, mapping around
|
/** Default merge impl: append documents, mapping around
|
||||||
* deletes */
|
* deletes */
|
||||||
public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
|
public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
|
||||||
|
|
||||||
int df = 0;
|
int df = 0;
|
||||||
|
long totTF = 0;
|
||||||
|
|
||||||
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
||||||
while(true) {
|
while(true) {
|
||||||
|
@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
|
||||||
this.startDoc(doc, postings.freq());
|
this.startDoc(doc, postings.freq());
|
||||||
this.finishDoc();
|
this.finishDoc();
|
||||||
df++;
|
df++;
|
||||||
|
totTF++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
|
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
|
||||||
|
@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
|
||||||
}
|
}
|
||||||
final int freq = postingsEnum.freq();
|
final int freq = postingsEnum.freq();
|
||||||
this.startDoc(doc, freq);
|
this.startDoc(doc, freq);
|
||||||
|
totTF += freq;
|
||||||
for(int i=0;i<freq;i++) {
|
for(int i=0;i<freq;i++) {
|
||||||
final int position = postingsEnum.nextPosition();
|
final int position = postingsEnum.nextPosition();
|
||||||
final BytesRef payload;
|
final BytesRef payload;
|
||||||
|
@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
|
||||||
df++;
|
df++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return df;
|
return new TermStats(df, totTF);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,7 +34,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
|
||||||
public abstract void startTerm() throws IOException;
|
public abstract void startTerm() throws IOException;
|
||||||
|
|
||||||
/** Finishes the current term */
|
/** Finishes the current term */
|
||||||
public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
|
public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
|
||||||
|
|
||||||
public abstract void setField(FieldInfo fieldInfo);
|
public abstract void setField(FieldInfo fieldInfo);
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.index.TermState;
|
||||||
public class PrefixCodedTermState extends OrdTermState {
|
public class PrefixCodedTermState extends OrdTermState {
|
||||||
public int docFreq; // how many docs have this term
|
public int docFreq; // how many docs have this term
|
||||||
public long filePointer; // fp into the terms dict primary file (_X.tis)
|
public long filePointer; // fp into the terms dict primary file (_X.tis)
|
||||||
|
public long totalTermFreq; // total number of occurrences of this term
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void copyFrom(TermState _other) {
|
public void copyFrom(TermState _other) {
|
||||||
|
@ -35,11 +36,12 @@ public class PrefixCodedTermState extends OrdTermState {
|
||||||
super.copyFrom(_other);
|
super.copyFrom(_other);
|
||||||
filePointer = other.filePointer;
|
filePointer = other.filePointer;
|
||||||
docFreq = other.docFreq;
|
docFreq = other.docFreq;
|
||||||
|
totalTermFreq = other.totalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
|
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -129,18 +129,17 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
// Read per-field details
|
// Read per-field details
|
||||||
seekDir(in, dirOffset);
|
seekDir(in, dirOffset);
|
||||||
|
|
||||||
final int numFields = in.readInt();
|
final int numFields = in.readVInt();
|
||||||
|
|
||||||
for(int i=0;i<numFields;i++) {
|
for(int i=0;i<numFields;i++) {
|
||||||
final int field = in.readInt();
|
final int field = in.readVInt();
|
||||||
final long numTerms = in.readLong();
|
final long numTerms = in.readVLong();
|
||||||
assert numTerms >= 0;
|
assert numTerms >= 0;
|
||||||
final long termsStartPointer = in.readLong();
|
final long termsStartPointer = in.readVLong();
|
||||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||||
if (numTerms > 0) {
|
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
|
||||||
assert !fields.containsKey(fieldInfo.name);
|
assert !fields.containsKey(fieldInfo.name);
|
||||||
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
|
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -245,12 +244,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
final long numTerms;
|
final long numTerms;
|
||||||
final FieldInfo fieldInfo;
|
final FieldInfo fieldInfo;
|
||||||
final long termsStartPointer;
|
final long termsStartPointer;
|
||||||
|
final long sumTotalTermFreq;
|
||||||
|
|
||||||
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
|
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
|
||||||
assert numTerms > 0;
|
assert numTerms > 0;
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
this.numTerms = numTerms;
|
this.numTerms = numTerms;
|
||||||
this.termsStartPointer = termsStartPointer;
|
this.termsStartPointer = termsStartPointer;
|
||||||
|
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -273,6 +274,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
return numTerms;
|
return numTerms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return sumTotalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
// Iterates through terms in this field, not supporting ord()
|
// Iterates through terms in this field, not supporting ord()
|
||||||
private final class SegmentTermsEnum extends TermsEnum {
|
private final class SegmentTermsEnum extends TermsEnum {
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
|
@ -295,6 +301,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
bytesReader = new DeltaBytesReader(in);
|
bytesReader = new DeltaBytesReader(in);
|
||||||
fieldTerm.field = fieldInfo.name;
|
fieldTerm.field = fieldInfo.name;
|
||||||
state = postingsReader.newTermState();
|
state = postingsReader.newTermState();
|
||||||
|
state.totalTermFreq = -1;
|
||||||
state.ord = -1;
|
state.ord = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -494,6 +501,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
|
state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||||
|
state.totalTermFreq = state.docFreq + in.readVLong();
|
||||||
|
}
|
||||||
|
|
||||||
postingsReader.readTerm(in,
|
postingsReader.readTerm(in,
|
||||||
fieldInfo, state,
|
fieldInfo, state,
|
||||||
isIndexTerm);
|
isIndexTerm);
|
||||||
|
@ -511,6 +522,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
||||||
return state.docFreq;
|
return state.docFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return state.totalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
|
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
final FieldInfos fieldInfos;
|
final FieldInfos fieldInfos;
|
||||||
FieldInfo currentField;
|
FieldInfo currentField;
|
||||||
private final TermsIndexWriterBase termsIndexWriter;
|
private final TermsIndexWriterBase termsIndexWriter;
|
||||||
private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
|
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
|
||||||
private final Comparator<BytesRef> termComp;
|
private final Comparator<BytesRef> termComp;
|
||||||
|
|
||||||
public PrefixCodedTermsWriter(
|
public PrefixCodedTermsWriter(
|
||||||
|
@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
assert currentField == null || currentField.name.compareTo(field.name) < 0;
|
assert currentField == null || currentField.name.compareTo(field.name) < 0;
|
||||||
currentField = field;
|
currentField = field;
|
||||||
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
|
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
|
||||||
TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
|
final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
|
||||||
fields.add(terms);
|
fields.add(terms);
|
||||||
return terms;
|
return terms;
|
||||||
}
|
}
|
||||||
|
@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
final int fieldCount = fields.size();
|
|
||||||
|
int nonZeroCount = 0;
|
||||||
|
for(TermsWriter field : fields) {
|
||||||
|
if (field.numTerms > 0) {
|
||||||
|
nonZeroCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
final long dirStart = out.getFilePointer();
|
final long dirStart = out.getFilePointer();
|
||||||
|
|
||||||
out.writeInt(fieldCount);
|
out.writeVInt(nonZeroCount);
|
||||||
for(int i=0;i<fieldCount;i++) {
|
for(TermsWriter field : fields) {
|
||||||
TermsWriter field = (TermsWriter) fields.get(i);
|
if (field.numTerms > 0) {
|
||||||
out.writeInt(field.fieldInfo.number);
|
out.writeVInt(field.fieldInfo.number);
|
||||||
out.writeLong(field.numTerms);
|
out.writeVLong(field.numTerms);
|
||||||
out.writeLong(field.termsStartPointer);
|
out.writeVLong(field.termsStartPointer);
|
||||||
|
if (!field.fieldInfo.omitTermFreqAndPositions) {
|
||||||
|
out.writeVLong(field.sumTotalTermFreq);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
writeTrailer(dirStart);
|
writeTrailer(dirStart);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
private final long termsStartPointer;
|
private final long termsStartPointer;
|
||||||
private long numTerms;
|
private long numTerms;
|
||||||
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
|
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
|
||||||
|
long sumTotalTermFreq;
|
||||||
|
|
||||||
TermsWriter(
|
TermsWriter(
|
||||||
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
|
TermsIndexWriterBase.FieldWriter fieldIndexWriter,
|
||||||
|
@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(BytesRef text, int numDocs) throws IOException {
|
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
|
||||||
|
|
||||||
assert numDocs > 0;
|
assert stats.docFreq > 0;
|
||||||
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
|
//System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
|
||||||
|
|
||||||
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
|
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
|
||||||
|
|
||||||
termWriter.write(text);
|
termWriter.write(text);
|
||||||
final int highBit = isIndexTerm ? 0x80 : 0;
|
final int highBit = isIndexTerm ? 0x80 : 0;
|
||||||
|
@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter extends FieldsConsumer {
|
||||||
|
|
||||||
// This is a vInt, except, we steal top bit to record
|
// This is a vInt, except, we steal top bit to record
|
||||||
// whether this was an indexed term:
|
// whether this was an indexed term:
|
||||||
if ((numDocs & ~0x3F) == 0) {
|
if ((stats.docFreq & ~0x3F) == 0) {
|
||||||
// Fast case -- docFreq fits in 6 bits
|
// Fast case -- docFreq fits in 6 bits
|
||||||
out.writeByte((byte) (highBit | numDocs));
|
out.writeByte((byte) (highBit | stats.docFreq));
|
||||||
} else {
|
} else {
|
||||||
// Write bottom 6 bits of docFreq, then write the
|
// Write bottom 6 bits of docFreq, then write the
|
||||||
// remainder as vInt:
|
// remainder as vInt:
|
||||||
out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
|
out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
|
||||||
out.writeVInt(numDocs >>> 6);
|
out.writeVInt(stats.docFreq >>> 6);
|
||||||
}
|
}
|
||||||
postingsWriter.finishTerm(numDocs, isIndexTerm);
|
if (!fieldInfo.omitTermFreqAndPositions) {
|
||||||
|
assert stats.totalTermFreq >= stats.docFreq;
|
||||||
|
out.writeVLong(stats.totalTermFreq - stats.docFreq);
|
||||||
|
}
|
||||||
|
postingsWriter.finishTerm(stats, isIndexTerm);
|
||||||
numTerms++;
|
numTerms++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finishes all terms in this field
|
// Finishes all terms in this field
|
||||||
@Override
|
@Override
|
||||||
public void finish() throws IOException {
|
public void finish(long sumTotalTermFreq) throws IOException {
|
||||||
// EOF marker:
|
// EOF marker:
|
||||||
|
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
out.writeVInt(DeltaBytesWriter.TERM_EOF);
|
out.writeVInt(DeltaBytesWriter.TERM_EOF);
|
||||||
fieldIndexWriter.finish();
|
fieldIndexWriter.finish();
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.lucene.index.codecs;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TermStats {
|
||||||
|
public final int docFreq;
|
||||||
|
public final long totalTermFreq;
|
||||||
|
|
||||||
|
public TermStats(int docFreq, long totalTermFreq) {
|
||||||
|
this.docFreq = docFreq;
|
||||||
|
this.totalTermFreq = totalTermFreq;
|
||||||
|
}
|
||||||
|
}
|
|
@ -38,10 +38,10 @@ public abstract class TermsConsumer {
|
||||||
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
|
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
|
||||||
|
|
||||||
/** Finishes the current term; numDocs must be > 0. */
|
/** Finishes the current term; numDocs must be > 0. */
|
||||||
public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
|
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
|
||||||
|
|
||||||
/** Called when we are done adding terms to this field */
|
/** Called when we are done adding terms to this field */
|
||||||
public abstract void finish() throws IOException;
|
public abstract void finish(long sumTotalTermFreq) throws IOException;
|
||||||
|
|
||||||
/** Return the BytesRef Comparator used to sort terms
|
/** Return the BytesRef Comparator used to sort terms
|
||||||
* before feeding to this API. */
|
* before feeding to this API. */
|
||||||
|
@ -55,6 +55,7 @@ public abstract class TermsConsumer {
|
||||||
|
|
||||||
BytesRef term;
|
BytesRef term;
|
||||||
assert termsEnum != null;
|
assert termsEnum != null;
|
||||||
|
long sumTotalTermFreq = 0;
|
||||||
|
|
||||||
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
||||||
if (docsEnum == null) {
|
if (docsEnum == null) {
|
||||||
|
@ -69,9 +70,9 @@ public abstract class TermsConsumer {
|
||||||
if (docsEnumIn != null) {
|
if (docsEnumIn != null) {
|
||||||
docsEnum.reset(docsEnumIn);
|
docsEnum.reset(docsEnumIn);
|
||||||
final PostingsConsumer postingsConsumer = startTerm(term);
|
final PostingsConsumer postingsConsumer = startTerm(term);
|
||||||
final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
|
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
|
||||||
if (numDocs > 0) {
|
if (stats.docFreq > 0) {
|
||||||
finishTerm(term, numDocs);
|
finishTerm(term, stats);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -94,14 +95,15 @@ public abstract class TermsConsumer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
final PostingsConsumer postingsConsumer = startTerm(term);
|
final PostingsConsumer postingsConsumer = startTerm(term);
|
||||||
final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
|
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
|
||||||
if (numDocs > 0) {
|
if (stats.docFreq > 0) {
|
||||||
finishTerm(term, numDocs);
|
finishTerm(term, stats);
|
||||||
|
sumTotalTermFreq += stats.totalTermFreq;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
finish();
|
finish(sumTotalTermFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBase {
|
||||||
public abstract void setTermsOutput(IndexOutput out);
|
public abstract void setTermsOutput(IndexOutput out);
|
||||||
|
|
||||||
public abstract class FieldWriter {
|
public abstract class FieldWriter {
|
||||||
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
|
public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
|
||||||
public abstract void finish() throws IOException;
|
public abstract void finish() throws IOException;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
public static abstract class IndexTermSelector {
|
public static abstract class IndexTermSelector {
|
||||||
// Called sequentially on every term being written,
|
// Called sequentially on every term being written,
|
||||||
// returning true if this term should be indexed
|
// returning true if this term should be indexed
|
||||||
public abstract boolean isIndexTerm(BytesRef term, int docFreq);
|
public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Same policy as {@link FixedGapTermsIndexWriter} */
|
/** Same policy as {@link FixedGapTermsIndexWriter} */
|
||||||
|
@ -74,7 +74,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isIndexTerm(BytesRef term, int docFreq) {
|
public boolean isIndexTerm(BytesRef term, TermStats stats) {
|
||||||
if (count >= interval) {
|
if (count >= interval) {
|
||||||
count = 0;
|
count = 0;
|
||||||
return true;
|
return true;
|
||||||
|
@ -99,8 +99,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isIndexTerm(BytesRef term, int docFreq) {
|
public boolean isIndexTerm(BytesRef term, TermStats stats) {
|
||||||
if (docFreq >= docFreqThresh || count >= interval) {
|
if (stats.docFreq >= docFreqThresh || count >= interval) {
|
||||||
count = 0;
|
count = 0;
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
|
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
|
||||||
if (policy.isIndexTerm(text, docFreq) || first) {
|
if (policy.isIndexTerm(text, stats) || first) {
|
||||||
first = false;
|
first = false;
|
||||||
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
|
//System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
|
||||||
final int lengthSave = text.length;
|
final int lengthSave = text.length;
|
||||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.index.FieldsEnum;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
import org.apache.lucene.index.SegmentInfo;
|
import org.apache.lucene.index.SegmentInfo;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermState;
|
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.index.CompoundFileReader;
|
import org.apache.lucene.index.CompoundFileReader;
|
||||||
|
@ -263,6 +262,11 @@ public class PreFlexFields extends FieldsProducer {
|
||||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class PreTermsEnum extends TermsEnum {
|
private class PreTermsEnum extends TermsEnum {
|
||||||
|
@ -938,6 +942,11 @@ public class PreFlexFields extends FieldsProducer {
|
||||||
return termEnum.docFreq();
|
return termEnum.docFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
PreDocsEnum docsEnum;
|
PreDocsEnum docsEnum;
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.store.RAMOutputStream;
|
import org.apache.lucene.store.RAMOutputStream;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -177,7 +178,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||||
|
|
||||||
/** Called when we are done adding docs to this term */
|
/** Called when we are done adding docs to this term */
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
|
||||||
//System.out.println("PW finishTerm docCount=" + docCount);
|
//System.out.println("PW finishTerm docCount=" + docCount);
|
||||||
|
|
||||||
assert pendingCount > 0 || pendingCount == -1;
|
assert pendingCount > 0 || pendingCount == -1;
|
||||||
|
@ -186,7 +187,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
|
||||||
|
|
||||||
if (pendingCount == -1) {
|
if (pendingCount == -1) {
|
||||||
termsOut.writeByte((byte) 0);
|
termsOut.writeByte((byte) 0);
|
||||||
wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
|
wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
|
||||||
pendingIsIndexTerm = false;
|
pendingIsIndexTerm = false;
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CodecUtil;
|
import org.apache.lucene.util.CodecUtil;
|
||||||
|
@ -239,11 +240,11 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
|
||||||
|
|
||||||
/** Called when we are done adding docs to this term */
|
/** Called when we are done adding docs to this term */
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
|
||||||
|
|
||||||
// TODO: -- wasteful we are counting this in two places?
|
// TODO: -- wasteful we are counting this in two places?
|
||||||
assert docCount > 0;
|
assert stats.docFreq > 0;
|
||||||
assert docCount == df;
|
assert stats.docFreq == df;
|
||||||
|
|
||||||
docIndex.write(termsOut, isIndexTerm);
|
docIndex.write(termsOut, isIndexTerm);
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.index.codecs.FieldsProducer;
|
import org.apache.lucene.index.codecs.FieldsProducer;
|
||||||
import org.apache.lucene.index.SegmentReadState;
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.FieldsEnum;
|
import org.apache.lucene.index.FieldsEnum;
|
||||||
import org.apache.lucene.index.TermState;
|
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.DocsEnum;
|
import org.apache.lucene.index.DocsEnum;
|
||||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||||
|
@ -119,28 +118,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
private final boolean omitTF;
|
private final boolean omitTF;
|
||||||
private int docFreq;
|
private int docFreq;
|
||||||
|
private long totalTermFreq;
|
||||||
private long docsStart;
|
private long docsStart;
|
||||||
private boolean ended;
|
private boolean ended;
|
||||||
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
|
private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
|
||||||
|
|
||||||
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
|
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
|
||||||
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
this.omitTF = omitTF;
|
this.omitTF = omitTF;
|
||||||
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
|
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
|
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
|
||||||
|
|
||||||
//System.out.println("seek to text=" + text.utf8ToString());
|
//System.out.println("seek to text=" + text.utf8ToString());
|
||||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
|
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
|
||||||
if (result == null) {
|
if (result == null) {
|
||||||
//System.out.println(" end");
|
//System.out.println(" end");
|
||||||
return SeekStatus.END;
|
return SeekStatus.END;
|
||||||
} else {
|
} else {
|
||||||
//System.out.println(" got text=" + term.utf8ToString());
|
//System.out.println(" got text=" + term.utf8ToString());
|
||||||
PairOutputs.Pair<Long,Long> pair = result.output;
|
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
|
||||||
docsStart = pair.output1;
|
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
|
||||||
docFreq = pair.output2.intValue();
|
docsStart = pair1.output1;
|
||||||
|
docFreq = pair2.output1.intValue();
|
||||||
|
totalTermFreq = pair2.output2;
|
||||||
|
|
||||||
if (result.input.equals(text)) {
|
if (result.input.equals(text)) {
|
||||||
//System.out.println(" match docsStart=" + docsStart);
|
//System.out.println(" match docsStart=" + docsStart);
|
||||||
|
@ -155,11 +157,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
@Override
|
@Override
|
||||||
public BytesRef next() throws IOException {
|
public BytesRef next() throws IOException {
|
||||||
assert !ended;
|
assert !ended;
|
||||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
|
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
|
||||||
if (result != null) {
|
if (result != null) {
|
||||||
final PairOutputs.Pair<Long,Long> pair = result.output;
|
PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
|
||||||
docsStart = pair.output1;
|
PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
|
||||||
docFreq = pair.output2.intValue();
|
docsStart = pair1.output1;
|
||||||
|
docFreq = pair2.output1.intValue();
|
||||||
|
totalTermFreq = pair2.output2;
|
||||||
return result.input;
|
return result.input;
|
||||||
} else {
|
} else {
|
||||||
return null;
|
return null;
|
||||||
|
@ -186,6 +190,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
return docFreq;
|
return docFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return totalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
SimpleTextDocsEnum docsEnum;
|
SimpleTextDocsEnum docsEnum;
|
||||||
|
@ -438,8 +447,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private class SimpleTextTerms extends Terms {
|
private class SimpleTextTerms extends Terms {
|
||||||
private final long termsStart;
|
private final long termsStart;
|
||||||
private final boolean omitTF;
|
private final boolean omitTF;
|
||||||
private FST<PairOutputs.Pair<Long,Long>> fst;
|
private long sumTotalTermFreq;
|
||||||
|
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
|
||||||
|
private int termCount;
|
||||||
private final BytesRef scratch = new BytesRef(10);
|
private final BytesRef scratch = new BytesRef(10);
|
||||||
|
|
||||||
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
||||||
|
@ -450,24 +460,38 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
private void loadTerms() throws IOException {
|
private void loadTerms() throws IOException {
|
||||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
|
||||||
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
|
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
||||||
|
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
true,
|
||||||
|
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
|
||||||
|
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
|
||||||
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
in.seek(termsStart);
|
in.seek(termsStart);
|
||||||
final BytesRef lastTerm = new BytesRef(10);
|
final BytesRef lastTerm = new BytesRef(10);
|
||||||
long lastDocsStart = -1;
|
long lastDocsStart = -1;
|
||||||
int docFreq = 0;
|
int docFreq = 0;
|
||||||
|
long totalTermFreq = 0;
|
||||||
while(true) {
|
while(true) {
|
||||||
readLine(in, scratch);
|
readLine(in, scratch);
|
||||||
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
|
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||||
|
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||||
|
posIntOutputs.get(totalTermFreq))));
|
||||||
|
sumTotalTermFreq += totalTermFreq;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
} else if (scratch.startsWith(DOC)) {
|
} else if (scratch.startsWith(DOC)) {
|
||||||
docFreq++;
|
docFreq++;
|
||||||
|
} else if (scratch.startsWith(POS)) {
|
||||||
|
totalTermFreq++;
|
||||||
} else if (scratch.startsWith(TERM)) {
|
} else if (scratch.startsWith(TERM)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
|
b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||||
|
new PairOutputs.Pair<Long,Long>((long) docFreq,
|
||||||
|
posIntOutputs.get(totalTermFreq))));
|
||||||
}
|
}
|
||||||
lastDocsStart = in.getFilePointer();
|
lastDocsStart = in.getFilePointer();
|
||||||
final int len = scratch.length - TERM.length;
|
final int len = scratch.length - TERM.length;
|
||||||
|
@ -477,6 +501,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
|
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
|
||||||
lastTerm.length = len;
|
lastTerm.length = len;
|
||||||
docFreq = 0;
|
docFreq = 0;
|
||||||
|
sumTotalTermFreq += totalTermFreq;
|
||||||
|
totalTermFreq = 0;
|
||||||
|
termCount++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fst = b.finish();
|
fst = b.finish();
|
||||||
|
@ -502,6 +529,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getUniqueTermCount() {
|
||||||
|
return (long) termCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return sumTotalTermFreq;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
|
||||||
import org.apache.lucene.index.codecs.FieldsConsumer;
|
import org.apache.lucene.index.codecs.FieldsConsumer;
|
||||||
import org.apache.lucene.index.codecs.TermsConsumer;
|
import org.apache.lucene.index.codecs.TermsConsumer;
|
||||||
import org.apache.lucene.index.codecs.PostingsConsumer;
|
import org.apache.lucene.index.codecs.PostingsConsumer;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(BytesRef term, int numDocs) throws IOException {
|
public void finishTerm(BytesRef term, TermStats stats) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish() throws IOException {
|
public void finish(long sumTotalTermFreq) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
import org.apache.lucene.index.CorruptIndexException;
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
import org.apache.lucene.index.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CodecUtil;
|
import org.apache.lucene.util.CodecUtil;
|
||||||
|
|
||||||
|
@ -184,12 +185,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
|
||||||
|
|
||||||
/** Called when we are done adding docs to this term */
|
/** Called when we are done adding docs to this term */
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
|
public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
|
||||||
assert docCount > 0;
|
assert stats.docFreq > 0;
|
||||||
|
|
||||||
// TODO: wasteful we are counting this (counting # docs
|
// TODO: wasteful we are counting this (counting # docs
|
||||||
// for this term) in two places?
|
// for this term) in two places?
|
||||||
assert docCount == df;
|
assert stats.docFreq == df;
|
||||||
|
|
||||||
if (isIndexTerm) {
|
if (isIndexTerm) {
|
||||||
// Write absolute at seek points
|
// Write absolute at seek points
|
||||||
|
|
|
@ -126,6 +126,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
||||||
return tenum.docFreq();
|
return tenum.docFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return tenum.totalTermFreq();
|
||||||
|
}
|
||||||
|
|
||||||
/** This enum does not support seeking!
|
/** This enum does not support seeking!
|
||||||
* @throws UnsupportedOperationException
|
* @throws UnsupportedOperationException
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -245,6 +245,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||||
return actualEnum.docFreq();
|
return actualEnum.docFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return actualEnum.totalTermFreq();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
return actualEnum.docs(skipDocs, reuse);
|
return actualEnum.docs(skipDocs, reuse);
|
||||||
|
|
|
@ -28,7 +28,6 @@ import org.apache.lucene.index.OrdTermState;
|
||||||
import org.apache.lucene.index.TermState;
|
import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.search.FieldCache.DocTermsIndex;
|
import org.apache.lucene.search.FieldCache.DocTermsIndex;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
@ -321,6 +320,11 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
|
|
@ -102,6 +102,8 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
static class RAMField extends Terms {
|
static class RAMField extends Terms {
|
||||||
final String field;
|
final String field;
|
||||||
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
|
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
|
||||||
|
long sumTotalTermFreq;
|
||||||
|
|
||||||
RAMField(String field) {
|
RAMField(String field) {
|
||||||
this.field = field;
|
this.field = field;
|
||||||
}
|
}
|
||||||
|
@ -111,6 +113,11 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
return termToDocs.size();
|
return termToDocs.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return sumTotalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() {
|
public TermsEnum iterator() {
|
||||||
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
|
return new RAMTermsEnum(RAMOnlyCodec.RAMField.this);
|
||||||
|
@ -124,6 +131,7 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
|
|
||||||
static class RAMTerm {
|
static class RAMTerm {
|
||||||
final String term;
|
final String term;
|
||||||
|
long totalTermFreq;
|
||||||
final List<RAMDoc> docs = new ArrayList<RAMDoc>();
|
final List<RAMDoc> docs = new ArrayList<RAMDoc>();
|
||||||
public RAMTerm(String term) {
|
public RAMTerm(String term) {
|
||||||
this.term = term;
|
this.term = term;
|
||||||
|
@ -189,14 +197,16 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(BytesRef text, int numDocs) {
|
public void finishTerm(BytesRef text, TermStats stats) {
|
||||||
assert numDocs > 0;
|
assert stats.docFreq > 0;
|
||||||
assert numDocs == current.docs.size();
|
assert stats.docFreq == current.docs.size();
|
||||||
|
current.totalTermFreq = stats.totalTermFreq;
|
||||||
field.termToDocs.put(current.term, current);
|
field.termToDocs.put(current.term, current);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish() {
|
public void finish(long sumTotalTermFreq) {
|
||||||
|
field.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -331,6 +341,10 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return ramField.termToDocs.get(current).totalTermFreq;
|
||||||
|
}
|
||||||
|
|
||||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||||
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
|
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
|
||||||
import org.apache.lucene.index.codecs.FieldsProducer;
|
import org.apache.lucene.index.codecs.FieldsProducer;
|
||||||
import org.apache.lucene.index.codecs.PostingsConsumer;
|
import org.apache.lucene.index.codecs.PostingsConsumer;
|
||||||
import org.apache.lucene.index.codecs.TermsConsumer;
|
import org.apache.lucene.index.codecs.TermsConsumer;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
|
import org.apache.lucene.index.codecs.mocksep.MockSepCodec;
|
||||||
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
|
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
@ -97,9 +98,11 @@ public class TestCodecs extends LuceneTestCase {
|
||||||
public void write(final FieldsConsumer consumer) throws Throwable {
|
public void write(final FieldsConsumer consumer) throws Throwable {
|
||||||
Arrays.sort(terms);
|
Arrays.sort(terms);
|
||||||
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
|
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
|
||||||
for (final TermData term : terms)
|
long sumTotalTermCount = 0;
|
||||||
term.write(termsConsumer);
|
for (final TermData term : terms) {
|
||||||
termsConsumer.finish();
|
sumTotalTermCount += term.write(termsConsumer);
|
||||||
|
}
|
||||||
|
termsConsumer.finish(sumTotalTermCount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -131,8 +134,9 @@ public class TestCodecs extends LuceneTestCase {
|
||||||
return text.compareTo(((TermData) o).text);
|
return text.compareTo(((TermData) o).text);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(final TermsConsumer termsConsumer) throws Throwable {
|
public long write(final TermsConsumer termsConsumer) throws Throwable {
|
||||||
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
|
final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
|
||||||
|
long totTF = 0;
|
||||||
for(int i=0;i<docs.length;i++) {
|
for(int i=0;i<docs.length;i++) {
|
||||||
final int termDocFreq;
|
final int termDocFreq;
|
||||||
if (field.omitTF) {
|
if (field.omitTF) {
|
||||||
|
@ -142,6 +146,7 @@ public class TestCodecs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
postingsConsumer.startDoc(docs[i], termDocFreq);
|
postingsConsumer.startDoc(docs[i], termDocFreq);
|
||||||
if (!field.omitTF) {
|
if (!field.omitTF) {
|
||||||
|
totTF += positions[i].length;
|
||||||
for(int j=0;j<positions[i].length;j++) {
|
for(int j=0;j<positions[i].length;j++) {
|
||||||
final PositionData pos = positions[i][j];
|
final PositionData pos = positions[i][j];
|
||||||
postingsConsumer.addPosition(pos.pos, pos.payload);
|
postingsConsumer.addPosition(pos.pos, pos.payload);
|
||||||
|
@ -149,7 +154,8 @@ public class TestCodecs extends LuceneTestCase {
|
||||||
postingsConsumer.finishDoc();
|
postingsConsumer.finishDoc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
termsConsumer.finishTerm(text, docs.length);
|
termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
|
||||||
|
return totTF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1865,4 +1865,22 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
assertTrue(IndexReader.indexExists(dir));
|
assertTrue(IndexReader.indexExists(dir));
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Make sure totalTermFreq works correctly in the terms
|
||||||
|
// dict cache
|
||||||
|
public void testTotalTermFreqCached() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||||
|
Document d = new Document();
|
||||||
|
d.add(newField("f", "a a b", Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(d);
|
||||||
|
IndexReader r = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
Terms terms = MultiFields.getTerms(r, "f");
|
||||||
|
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
|
||||||
|
assertEquals(2, terms.totalTermFreq(new BytesRef("a")));
|
||||||
|
assertEquals(1, terms.totalTermFreq(new BytesRef("b")));
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
|
||||||
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
|
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
|
||||||
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
|
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
|
||||||
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
|
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
|
import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec;
|
||||||
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
|
import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec;
|
||||||
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
|
import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory;
|
||||||
|
@ -66,7 +67,7 @@ public class MockRandomCodec extends Codec {
|
||||||
|
|
||||||
public MockRandomCodec(Random random) {
|
public MockRandomCodec(Random random) {
|
||||||
name = "MockRandom";
|
name = "MockRandom";
|
||||||
this.seedRandom = random;
|
this.seedRandom = new Random(random.nextLong());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -148,7 +149,7 @@ public class MockRandomCodec extends Codec {
|
||||||
final Random rand = new Random(seed2);
|
final Random rand = new Random(seed2);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isIndexTerm(BytesRef term, int docFreq) {
|
public boolean isIndexTerm(BytesRef term, TermStats stats) {
|
||||||
return random.nextInt(gap) == 17;
|
return random.nextInt(gap) == 17;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.index.codecs.FieldsConsumer;
|
import org.apache.lucene.index.codecs.FieldsConsumer;
|
||||||
import org.apache.lucene.index.codecs.TermsConsumer;
|
import org.apache.lucene.index.codecs.TermsConsumer;
|
||||||
import org.apache.lucene.index.codecs.PostingsConsumer;
|
import org.apache.lucene.index.codecs.PostingsConsumer;
|
||||||
|
import org.apache.lucene.index.codecs.TermStats;
|
||||||
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
|
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
|
||||||
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
|
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
|
||||||
import org.apache.lucene.index.CorruptIndexException;
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
@ -184,10 +185,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finishTerm(BytesRef text, int numDocs) throws IOException {
|
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
|
||||||
if (numDocs > 0) {
|
if (stats.docFreq > 0) {
|
||||||
long skipPointer = skipListWriter.writeSkip(freqOut);
|
long skipPointer = skipListWriter.writeSkip(freqOut);
|
||||||
termInfo.docFreq = numDocs;
|
termInfo.docFreq = stats.docFreq;
|
||||||
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
|
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
|
||||||
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
|
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
|
||||||
termsOut.add(fieldInfo.number,
|
termsOut.add(fieldInfo.number,
|
||||||
|
@ -197,7 +198,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish() throws IOException {
|
public void finish(long sumTotalTermCount) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -1000,6 +1000,10 @@ class NumberedTermsEnum extends TermsEnum {
|
||||||
return tenum.docFreq();
|
return tenum.docFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return tenum.totalTermFreq();
|
||||||
|
}
|
||||||
|
|
||||||
public BytesRef skipTo(BytesRef target) throws IOException {
|
public BytesRef skipTo(BytesRef target) throws IOException {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue