mirror of https://github.com/apache/lucene.git
LUCENE-3290: add FieldInvertState.numUniqueTerms, Terms.sumDocFreq
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144513 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
87850a3a9c
commit
1ae1d6b4fa
|
@ -421,6 +421,8 @@ New features
|
||||||
* LUCENE-2862: Added TermsEnum.totalTermFreq() and
|
* LUCENE-2862: Added TermsEnum.totalTermFreq() and
|
||||||
Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
|
Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-3290: Added Terms.getSumDocFreq() (Mike McCandless, Robert Muir)
|
||||||
|
|
||||||
* LUCENE-3003: Added new expert class oal.index.DocTermsOrd,
|
* LUCENE-3003: Added new expert class oal.index.DocTermsOrd,
|
||||||
refactored from Solr's UnInvertedField, for accessing term ords for
|
refactored from Solr's UnInvertedField, for accessing term ords for
|
||||||
multi-valued fields, per document. This is similar to FieldCache in
|
multi-valued fields, per document. This is similar to FieldCache in
|
||||||
|
@ -512,6 +514,11 @@ Bug fixes
|
||||||
causing the file to sometimes be larger than it needed to be. (Mike
|
causing the file to sometimes be larger than it needed to be. (Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
|
||||||
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-3290: Added FieldInvertState.getUniqueTermCount()
|
||||||
|
(Mike McCandless, Robert Muir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated
|
* LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated
|
||||||
|
|
|
@ -426,6 +426,12 @@ public class InstantiatedIndexReader extends IndexReader {
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return sumTotalTermFreq;
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: support this? For now return -1, meaning sumDocFreq is not available.
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
|
|
@ -842,6 +842,12 @@ public class MemoryIndex {
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return info.getSumTotalTermFreq();
|
return info.getSumTotalTermFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
// each term has df=1, so the sum of docFreq is just the number of terms
|
||||||
|
return info.sortedTerms.length;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -128,6 +128,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
assert termsEnum != null;
|
assert termsEnum != null;
|
||||||
|
|
|
@ -691,7 +691,7 @@ public class CheckIndex {
|
||||||
Comparator<BytesRef> termComp = terms.getComparator();
|
Comparator<BytesRef> termComp = terms.getComparator();
|
||||||
|
|
||||||
long sumTotalTermFreq = 0;
|
long sumTotalTermFreq = 0;
|
||||||
|
long sumDocFreq = 0;
|
||||||
while(true) {
|
while(true) {
|
||||||
|
|
||||||
final BytesRef term = terms.next();
|
final BytesRef term = terms.next();
|
||||||
|
@ -712,6 +712,7 @@ public class CheckIndex {
|
||||||
|
|
||||||
final int docFreq = terms.docFreq();
|
final int docFreq = terms.docFreq();
|
||||||
status.totFreq += docFreq;
|
status.totFreq += docFreq;
|
||||||
|
sumDocFreq += docFreq;
|
||||||
|
|
||||||
docs = terms.docs(liveDocs, docs);
|
docs = terms.docs(liveDocs, docs);
|
||||||
postings = terms.docsAndPositions(liveDocs, postings);
|
postings = terms.docsAndPositions(liveDocs, postings);
|
||||||
|
@ -879,6 +880,13 @@ public class CheckIndex {
|
||||||
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
|
throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sumDocFreq != 0) {
|
||||||
|
final long v = fields.terms(field).getSumDocFreq();
|
||||||
|
if (v != -1 && sumDocFreq != v) {
|
||||||
|
throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Test seek to last term:
|
// Test seek to last term:
|
||||||
if (lastTerm != null) {
|
if (lastTerm != null) {
|
||||||
|
|
|
@ -31,6 +31,7 @@ public final class FieldInvertState {
|
||||||
int numOverlap;
|
int numOverlap;
|
||||||
int offset;
|
int offset;
|
||||||
int maxTermFrequency;
|
int maxTermFrequency;
|
||||||
|
int uniqueTermCount;
|
||||||
float boost;
|
float boost;
|
||||||
AttributeSource attributeSource;
|
AttributeSource attributeSource;
|
||||||
|
|
||||||
|
@ -55,6 +56,7 @@ public final class FieldInvertState {
|
||||||
numOverlap = 0;
|
numOverlap = 0;
|
||||||
offset = 0;
|
offset = 0;
|
||||||
maxTermFrequency = 0;
|
maxTermFrequency = 0;
|
||||||
|
uniqueTermCount = 0;
|
||||||
boost = docBoost;
|
boost = docBoost;
|
||||||
attributeSource = null;
|
attributeSource = null;
|
||||||
}
|
}
|
||||||
|
@ -122,6 +124,13 @@ public final class FieldInvertState {
|
||||||
return maxTermFrequency;
|
return maxTermFrequency;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the number of unique terms encountered in this field.
|
||||||
|
*/
|
||||||
|
public int getUniqueTermCount() {
|
||||||
|
return uniqueTermCount;
|
||||||
|
}
|
||||||
|
|
||||||
public AttributeSource getAttributeSource() {
|
public AttributeSource getAttributeSource() {
|
||||||
return attributeSource;
|
return attributeSource;
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,6 +105,11 @@ public class FilterIndexReader extends IndexReader {
|
||||||
public long getSumTotalTermFreq() throws IOException {
|
public long getSumTotalTermFreq() throws IOException {
|
||||||
return in.getSumTotalTermFreq();
|
return in.getSumTotalTermFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return in.getSumDocFreq();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Base class for filtering {@link TermsEnum} implementations. */
|
/** Base class for filtering {@link TermsEnum} implementations. */
|
||||||
|
|
|
@ -134,6 +134,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||||
writeProx(termID, fieldState.position);
|
writeProx(termID, fieldState.position);
|
||||||
}
|
}
|
||||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||||
|
fieldState.uniqueTermCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -151,6 +152,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
|
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
|
||||||
postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
|
postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docState.docID;
|
||||||
|
fieldState.uniqueTermCount++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (docState.docID != postings.lastDocIDs[termID]) {
|
if (docState.docID != postings.lastDocIDs[termID]) {
|
||||||
|
@ -171,6 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docState.docID;
|
||||||
writeProx(termID, fieldState.position);
|
writeProx(termID, fieldState.position);
|
||||||
|
fieldState.uniqueTermCount++;
|
||||||
} else {
|
} else {
|
||||||
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
|
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
|
||||||
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
||||||
|
@ -251,6 +254,8 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||||
final ByteSliceReader prox = new ByteSliceReader();
|
final ByteSliceReader prox = new ByteSliceReader();
|
||||||
|
|
||||||
long sumTotalTermFreq = 0;
|
long sumTotalTermFreq = 0;
|
||||||
|
long sumDocFreq = 0;
|
||||||
|
|
||||||
for (int i = 0; i < numTerms; i++) {
|
for (int i = 0; i < numTerms; i++) {
|
||||||
final int termID = termIDs[i];
|
final int termID = termIDs[i];
|
||||||
// Get BytesRef
|
// Get BytesRef
|
||||||
|
@ -389,9 +394,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
||||||
}
|
}
|
||||||
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
|
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
|
||||||
sumTotalTermFreq += totTF;
|
sumTotalTermFreq += totTF;
|
||||||
|
sumDocFreq += numDocs;
|
||||||
}
|
}
|
||||||
|
|
||||||
termsConsumer.finish(sumTotalTermFreq);
|
termsConsumer.finish(sumTotalTermFreq, sumDocFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,6 +88,19 @@ public final class MultiTerms extends Terms {
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
long sum = 0;
|
||||||
|
for(Terms terms : subs) {
|
||||||
|
final long v = terms.getSumDocFreq();
|
||||||
|
if (v == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
sum += v;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
|
|
@ -132,6 +132,13 @@ public abstract class Terms {
|
||||||
* into account. */
|
* into account. */
|
||||||
public abstract long getSumTotalTermFreq() throws IOException;
|
public abstract long getSumTotalTermFreq() throws IOException;
|
||||||
|
|
||||||
|
/** Returns the sum of {@link #docFreq(BytesRef)} for
|
||||||
|
* all terms in this field, or -1 if this measure isn't
|
||||||
|
* stored by the codec. Note that, just like other term
|
||||||
|
* measures, this measure does not take deleted documents
|
||||||
|
* into account. */
|
||||||
|
public abstract long getSumDocFreq() throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a thread-private {@link TermsEnum} instance. Obtaining
|
* Returns a thread-private {@link TermsEnum} instance. Obtaining
|
||||||
* {@link TermsEnum} from this method might be more efficient than using
|
* {@link TermsEnum} from this method might be more efficient than using
|
||||||
|
|
|
@ -137,8 +137,9 @@ public class BlockTermsReader extends FieldsProducer {
|
||||||
final long termsStartPointer = in.readVLong();
|
final long termsStartPointer = in.readVLong();
|
||||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||||
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
|
final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
|
||||||
|
final long sumDocFreq = in.readVLong();
|
||||||
assert !fields.containsKey(fieldInfo.name);
|
assert !fields.containsKey(fieldInfo.name);
|
||||||
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
|
fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq));
|
||||||
}
|
}
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -245,13 +246,15 @@ public class BlockTermsReader extends FieldsProducer {
|
||||||
final FieldInfo fieldInfo;
|
final FieldInfo fieldInfo;
|
||||||
final long termsStartPointer;
|
final long termsStartPointer;
|
||||||
final long sumTotalTermFreq;
|
final long sumTotalTermFreq;
|
||||||
|
final long sumDocFreq;
|
||||||
|
|
||||||
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
|
FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) {
|
||||||
assert numTerms > 0;
|
assert numTerms > 0;
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
this.numTerms = numTerms;
|
this.numTerms = numTerms;
|
||||||
this.termsStartPointer = termsStartPointer;
|
this.termsStartPointer = termsStartPointer;
|
||||||
this.sumTotalTermFreq = sumTotalTermFreq;
|
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
|
this.sumDocFreq = sumDocFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -279,6 +282,11 @@ public class BlockTermsReader extends FieldsProducer {
|
||||||
return sumTotalTermFreq;
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return sumDocFreq;
|
||||||
|
}
|
||||||
|
|
||||||
// Iterates through terms in this field
|
// Iterates through terms in this field
|
||||||
private final class SegmentTermsEnum extends TermsEnum {
|
private final class SegmentTermsEnum extends TermsEnum {
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
|
|
|
@ -132,6 +132,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
||||||
if (!field.fieldInfo.omitTermFreqAndPositions) {
|
if (!field.fieldInfo.omitTermFreqAndPositions) {
|
||||||
out.writeVLong(field.sumTotalTermFreq);
|
out.writeVLong(field.sumTotalTermFreq);
|
||||||
}
|
}
|
||||||
|
out.writeVLong(field.sumDocFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
writeTrailer(dirStart);
|
writeTrailer(dirStart);
|
||||||
|
@ -157,6 +158,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
||||||
private long numTerms;
|
private long numTerms;
|
||||||
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
|
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
|
||||||
long sumTotalTermFreq;
|
long sumTotalTermFreq;
|
||||||
|
long sumDocFreq;
|
||||||
|
|
||||||
private TermEntry[] pendingTerms;
|
private TermEntry[] pendingTerms;
|
||||||
|
|
||||||
|
@ -231,7 +233,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
||||||
|
|
||||||
// Finishes all terms in this field
|
// Finishes all terms in this field
|
||||||
@Override
|
@Override
|
||||||
public void finish(long sumTotalTermFreq) throws IOException {
|
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
|
||||||
if (pendingCount > 0) {
|
if (pendingCount > 0) {
|
||||||
flushBlock();
|
flushBlock();
|
||||||
}
|
}
|
||||||
|
@ -239,6 +241,7 @@ public class BlockTermsWriter extends FieldsConsumer {
|
||||||
out.writeVInt(0);
|
out.writeVInt(0);
|
||||||
|
|
||||||
this.sumTotalTermFreq = sumTotalTermFreq;
|
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
|
this.sumDocFreq = sumDocFreq;
|
||||||
fieldIndexWriter.finish(out.getFilePointer());
|
fieldIndexWriter.finish(out.getFilePointer());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -41,7 +41,7 @@ public abstract class TermsConsumer {
|
||||||
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
|
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
|
||||||
|
|
||||||
/** Called when we are done adding terms to this field */
|
/** Called when we are done adding terms to this field */
|
||||||
public abstract void finish(long sumTotalTermFreq) throws IOException;
|
public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException;
|
||||||
|
|
||||||
/** Return the BytesRef Comparator used to sort terms
|
/** Return the BytesRef Comparator used to sort terms
|
||||||
* before feeding to this API. */
|
* before feeding to this API. */
|
||||||
|
@ -56,7 +56,8 @@ public abstract class TermsConsumer {
|
||||||
BytesRef term;
|
BytesRef term;
|
||||||
assert termsEnum != null;
|
assert termsEnum != null;
|
||||||
long sumTotalTermFreq = 0;
|
long sumTotalTermFreq = 0;
|
||||||
long sumDF = 0;
|
long sumDocFreq = 0;
|
||||||
|
long sumDFsinceLastAbortCheck = 0;
|
||||||
|
|
||||||
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
|
||||||
if (docsEnum == null) {
|
if (docsEnum == null) {
|
||||||
|
@ -74,10 +75,11 @@ public abstract class TermsConsumer {
|
||||||
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
|
final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
|
||||||
if (stats.docFreq > 0) {
|
if (stats.docFreq > 0) {
|
||||||
finishTerm(term, stats);
|
finishTerm(term, stats);
|
||||||
sumDF += stats.docFreq;
|
sumDFsinceLastAbortCheck += stats.docFreq;
|
||||||
if (sumDF > 60000) {
|
sumDocFreq += stats.docFreq;
|
||||||
mergeState.checkAbort.work(sumDF/5.0);
|
if (sumDFsinceLastAbortCheck > 60000) {
|
||||||
sumDF = 0;
|
mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
|
||||||
|
sumDFsinceLastAbortCheck = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -105,16 +107,17 @@ public abstract class TermsConsumer {
|
||||||
if (stats.docFreq > 0) {
|
if (stats.docFreq > 0) {
|
||||||
finishTerm(term, stats);
|
finishTerm(term, stats);
|
||||||
sumTotalTermFreq += stats.totalTermFreq;
|
sumTotalTermFreq += stats.totalTermFreq;
|
||||||
sumDF += stats.docFreq;
|
sumDFsinceLastAbortCheck += stats.docFreq;
|
||||||
if (sumDF > 60000) {
|
sumDocFreq += stats.docFreq;
|
||||||
mergeState.checkAbort.work(sumDF/5.0);
|
if (sumDFsinceLastAbortCheck > 60000) {
|
||||||
sumDF = 0;
|
mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
|
||||||
|
sumDFsinceLastAbortCheck = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
finish(sumTotalTermFreq);
|
finish(sumTotalTermFreq, sumDocFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -219,13 +219,14 @@ public class MemoryCodec extends Codec {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish(long sumTotalTermFreq) throws IOException {
|
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
|
||||||
if (termCount > 0) {
|
if (termCount > 0) {
|
||||||
out.writeVInt(termCount);
|
out.writeVInt(termCount);
|
||||||
out.writeVInt(field.number);
|
out.writeVInt(field.number);
|
||||||
if (!field.omitTermFreqAndPositions) {
|
if (!field.omitTermFreqAndPositions) {
|
||||||
out.writeVLong(sumTotalTermFreq);
|
out.writeVLong(sumTotalTermFreq);
|
||||||
}
|
}
|
||||||
|
out.writeVLong(sumDocFreq);
|
||||||
builder.finish().save(out);
|
builder.finish().save(out);
|
||||||
if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
|
if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
|
||||||
}
|
}
|
||||||
|
@ -683,6 +684,7 @@ public class MemoryCodec extends Codec {
|
||||||
private final static class TermsReader extends Terms {
|
private final static class TermsReader extends Terms {
|
||||||
|
|
||||||
private final long sumTotalTermFreq;
|
private final long sumTotalTermFreq;
|
||||||
|
private final long sumDocFreq;
|
||||||
private FST<BytesRef> fst;
|
private FST<BytesRef> fst;
|
||||||
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
private final FieldInfo field;
|
private final FieldInfo field;
|
||||||
|
@ -695,6 +697,7 @@ public class MemoryCodec extends Codec {
|
||||||
} else {
|
} else {
|
||||||
sumTotalTermFreq = 0;
|
sumTotalTermFreq = 0;
|
||||||
}
|
}
|
||||||
|
sumDocFreq = in.readVLong();
|
||||||
|
|
||||||
fst = new FST<BytesRef>(in, outputs);
|
fst = new FST<BytesRef>(in, outputs);
|
||||||
}
|
}
|
||||||
|
@ -704,6 +707,11 @@ public class MemoryCodec extends Codec {
|
||||||
return sumTotalTermFreq;
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return sumDocFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() {
|
public TermsEnum iterator() {
|
||||||
return new FSTTermsEnum(field, fst);
|
return new FSTTermsEnum(field, fst);
|
||||||
|
|
|
@ -266,6 +266,11 @@ public class PreFlexFields extends FieldsProducer {
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class PreTermsEnum extends TermsEnum {
|
private class PreTermsEnum extends TermsEnum {
|
||||||
|
|
|
@ -463,6 +463,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private final long termsStart;
|
private final long termsStart;
|
||||||
private final boolean omitTF;
|
private final boolean omitTF;
|
||||||
private long sumTotalTermFreq;
|
private long sumTotalTermFreq;
|
||||||
|
private long sumDocFreq;
|
||||||
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
|
private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
|
||||||
private int termCount;
|
private int termCount;
|
||||||
private final BytesRef scratch = new BytesRef(10);
|
private final BytesRef scratch = new BytesRef(10);
|
||||||
|
@ -500,6 +501,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
break;
|
break;
|
||||||
} else if (scratch.startsWith(DOC)) {
|
} else if (scratch.startsWith(DOC)) {
|
||||||
docFreq++;
|
docFreq++;
|
||||||
|
sumDocFreq++;
|
||||||
} else if (scratch.startsWith(POS)) {
|
} else if (scratch.startsWith(POS)) {
|
||||||
totalTermFreq++;
|
totalTermFreq++;
|
||||||
} else if (scratch.startsWith(TERM)) {
|
} else if (scratch.startsWith(TERM)) {
|
||||||
|
@ -554,6 +556,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return sumTotalTermFreq;
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return sumDocFreq;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -89,7 +89,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish(long sumTotalTermFreq) throws IOException {
|
public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -195,7 +195,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish(long sumTotalTermCount) throws IOException {
|
public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -102,6 +102,7 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
final String field;
|
final String field;
|
||||||
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
|
final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
|
||||||
long sumTotalTermFreq;
|
long sumTotalTermFreq;
|
||||||
|
long sumDocFreq;
|
||||||
|
|
||||||
RAMField(String field) {
|
RAMField(String field) {
|
||||||
this.field = field;
|
this.field = field;
|
||||||
|
@ -116,6 +117,11 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
return sumTotalTermFreq;
|
return sumTotalTermFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() throws IOException {
|
||||||
|
return sumDocFreq;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() {
|
public TermsEnum iterator() {
|
||||||
|
@ -204,8 +210,9 @@ public class TestExternalCodecs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish(long sumTotalTermFreq) {
|
public void finish(long sumTotalTermFreq, long sumDocFreq) {
|
||||||
field.sumTotalTermFreq = sumTotalTermFreq;
|
field.sumTotalTermFreq = sumTotalTermFreq;
|
||||||
|
field.sumDocFreq = sumDocFreq;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -101,10 +101,12 @@ public class TestCodecs extends LuceneTestCase {
|
||||||
Arrays.sort(terms);
|
Arrays.sort(terms);
|
||||||
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
|
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
|
||||||
long sumTotalTermCount = 0;
|
long sumTotalTermCount = 0;
|
||||||
|
long sumDF = 0;
|
||||||
for (final TermData term : terms) {
|
for (final TermData term : terms) {
|
||||||
|
sumDF += term.docs.length;
|
||||||
sumTotalTermCount += term.write(termsConsumer);
|
sumTotalTermCount += term.write(termsConsumer);
|
||||||
}
|
}
|
||||||
termsConsumer.finish(sumTotalTermCount);
|
termsConsumer.finish(sumTotalTermCount, sumDF);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,101 @@
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link Terms#getSumDocFreq()}
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class TestSumDocFreq extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testSumDocFreq() throws Exception {
|
||||||
|
final int numDocs = atLeast(500);
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random, dir);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
Field field1 = newField("foo", "", Field.Index.ANALYZED);
|
||||||
|
Field field2 = newField("bar", "", Field.Index.ANALYZED);
|
||||||
|
doc.add(field1);
|
||||||
|
doc.add(field2);
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
char ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
|
||||||
|
char ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
|
||||||
|
field1.setValue("" + ch1 + " " + ch2);
|
||||||
|
ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
|
||||||
|
ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
|
||||||
|
field2.setValue("" + ch1 + " " + ch2);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertSumDocFreq(ir);
|
||||||
|
ir.close();
|
||||||
|
|
||||||
|
ir = IndexReader.open(dir, false);
|
||||||
|
int numDeletions = atLeast(20);
|
||||||
|
for (int i = 0; i < numDeletions; i++) {
|
||||||
|
ir.deleteDocument(random.nextInt(ir.maxDoc()));
|
||||||
|
}
|
||||||
|
ir.close();
|
||||||
|
|
||||||
|
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
|
||||||
|
w.optimize();
|
||||||
|
w.close();
|
||||||
|
|
||||||
|
ir = IndexReader.open(dir, true);
|
||||||
|
assertSumDocFreq(ir);
|
||||||
|
ir.close();
|
||||||
|
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertSumDocFreq(IndexReader ir) throws Exception {
|
||||||
|
// for each field, compare the reported sumDocFreq against one recomputed from its terms
|
||||||
|
Fields fields = MultiFields.getFields(ir);
|
||||||
|
FieldsEnum fieldEnum = fields.iterator();
|
||||||
|
String f = null;
|
||||||
|
while ((f = fieldEnum.next()) != null) {
|
||||||
|
Terms terms = fields.terms(f);
|
||||||
|
long sumDocFreq = terms.getSumDocFreq();
|
||||||
|
if (sumDocFreq == -1) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("skipping field: " + f + ", codec does not support sumDocFreq");
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
long computedSumDocFreq = 0;
|
||||||
|
TermsEnum termsEnum = terms.iterator();
|
||||||
|
while (termsEnum.next() != null) {
|
||||||
|
computedSumDocFreq += termsEnum.docFreq();
|
||||||
|
}
|
||||||
|
assertEquals(computedSumDocFreq, sumDocFreq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,108 @@
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the uniqueTermCount statistic in FieldInvertState
|
||||||
|
*/
|
||||||
|
public class TestUniqueTermCount extends LuceneTestCase {
|
||||||
|
Directory dir;
|
||||||
|
IndexReader reader;
|
||||||
|
/* expected uniqueTermCount values for our documents */
|
||||||
|
ArrayList<Integer> expected = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
dir = newDirectory();
|
||||||
|
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||||
|
new MockAnalyzer(random, MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy());
|
||||||
|
config.setSimilarityProvider(new DefaultSimilarityProvider() {
|
||||||
|
@Override
|
||||||
|
public Similarity get(String field) {
|
||||||
|
return new TestSimilarity();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
|
||||||
|
Document doc = new Document();
|
||||||
|
Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
|
||||||
|
doc.add(foo);
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
foo.setValue(addValue());
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
reader = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws Exception {
|
||||||
|
byte fooNorms[] = MultiNorms.norms(reader, "foo");
|
||||||
|
for (int i = 0; i < reader.maxDoc(); i++)
|
||||||
|
assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Makes a bunch of single-char tokens (so there are at most 26 unique terms).
|
||||||
|
* Puts the number of unique terms into expected, to be checked against the norm.
|
||||||
|
*/
|
||||||
|
private String addValue() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
HashSet<String> terms = new HashSet<String>();
|
||||||
|
int num = _TestUtil.nextInt(random, 0, 255);
|
||||||
|
for (int i = 0; i < num; i++) {
|
||||||
|
sb.append(' ');
|
||||||
|
char term = (char) _TestUtil.nextInt(random, 'a', 'z');
|
||||||
|
sb.append(term);
|
||||||
|
terms.add("" + term);
|
||||||
|
}
|
||||||
|
expected.add(terms.size());
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple similarity that encodes uniqueTermCount directly as a byte
|
||||||
|
*/
|
||||||
|
class TestSimilarity extends DefaultSimilarity {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte computeNorm(FieldInvertState state) {
|
||||||
|
return (byte) state.getUniqueTermCount();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue