LUCENE-3290: add FieldInvertState.numUniqueTerms, Terms.sumDocFreq

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144513 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-07-08 21:03:43 +00:00
parent 87850a3a9c
commit 1ae1d6b4fa
22 changed files with 345 additions and 21 deletions


@@ -421,6 +421,8 @@ New features
 * LUCENE-2862: Added TermsEnum.totalTermFreq() and
   Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir)
 
+* LUCENE-3290: Added Terms.getSumDocFreq() (Mike McCandless, Robert Muir)
+
 * LUCENE-3003: Added new expert class oal.index.DocTermsOrd,
   refactored from Solr's UnInvertedField, for accessing term ords for
   multi-valued fields, per document. This is similar to FieldCache in
@@ -512,6 +514,11 @@ Bug fixes
   causing the file to sometimes be larger than it needed to be. (Mike
   McCandless)
 
+New Features
+
+* LUCENE-3290: Added FieldInvertState.numUniqueTerms
+  (Mike McCandless, Robert Muir)
+
 Optimizations
 
 * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated


@@ -426,6 +426,12 @@ public class InstantiatedIndexReader extends IndexReader {
       public long getSumTotalTermFreq() {
         return sumTotalTermFreq;
       }
+
+      // TODO: support this?
+      @Override
+      public long getSumDocFreq() {
+        return -1;
+      }
 
       @Override
       public Comparator<BytesRef> getComparator() {


@@ -842,6 +842,12 @@ public class MemoryIndex {
         public long getSumTotalTermFreq() {
           return info.getSumTotalTermFreq();
         }
+
+        @Override
+        public long getSumDocFreq() throws IOException {
+          // each term has df=1
+          return info.sortedTerms.length;
+        }
       };
     }
   }


@@ -128,6 +128,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
       public long getSumTotalTermFreq() {
         return -1;
       }
+
+      @Override
+      public long getSumDocFreq() throws IOException {
+        return -1;
+      }
     });
 
     assert termsEnum != null;


@@ -691,7 +691,7 @@ public class CheckIndex {
       Comparator<BytesRef> termComp = terms.getComparator();
 
       long sumTotalTermFreq = 0;
+      long sumDocFreq = 0;
       while(true) {
 
         final BytesRef term = terms.next();
@@ -712,6 +712,7 @@ public class CheckIndex {
 
         final int docFreq = terms.docFreq();
         status.totFreq += docFreq;
+        sumDocFreq += docFreq;
 
         docs = terms.docs(liveDocs, docs);
         postings = terms.docsAndPositions(liveDocs, postings);
@@ -879,6 +880,13 @@ public class CheckIndex {
           throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
         }
       }
+
+      if (sumDocFreq != 0) {
+        final long v = fields.terms(field).getSumDocFreq();
+        if (v != -1 && sumDocFreq != v) {
+          throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
+        }
+      }
 
       // Test seek to last term:
       if (lastTerm != null) {


@@ -31,6 +31,7 @@ public final class FieldInvertState {
   int numOverlap;
   int offset;
   int maxTermFrequency;
+  int uniqueTermCount;
   float boost;
   AttributeSource attributeSource;
@@ -55,6 +56,7 @@ public final class FieldInvertState {
     numOverlap = 0;
     offset = 0;
     maxTermFrequency = 0;
+    uniqueTermCount = 0;
     boost = docBoost;
     attributeSource = null;
   }
@@ -122,6 +124,13 @@ public final class FieldInvertState {
     return maxTermFrequency;
   }
 
+  /**
+   * Return the number of unique terms encountered in this field.
+   */
+  public int getUniqueTermCount() {
+    return uniqueTermCount;
+  }
+
   public AttributeSource getAttributeSource() {
     return attributeSource;
   }
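A usage sketch, not part of the diff above: anything handed a FieldInvertState can read the new statistic; for example, a custom Similarity can fold it into the norm byte, which is exactly how the new TestUniqueTermCount at the end of this commit exercises it (the class name here is hypothetical):

  // Sketch: encode the per-field unique term count directly into the
  // norm byte (counts above 255 would overflow the single byte).
  class UniqueTermCountSimilarity extends DefaultSimilarity {
    @Override
    public byte computeNorm(FieldInvertState state) {
      return (byte) state.getUniqueTermCount();
    }
  }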


@@ -105,6 +105,11 @@ public class FilterIndexReader extends IndexReader {
     public long getSumTotalTermFreq() throws IOException {
       return in.getSumTotalTermFreq();
     }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return in.getSumDocFreq();
+    }
   }
 
   /** Base class for filtering {@link TermsEnum} implementations. */


@@ -134,6 +134,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       writeProx(termID, fieldState.position);
     }
     fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
+    fieldState.uniqueTermCount++;
   }
 
   @Override
@@ -151,6 +152,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
         termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
         postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
         postings.lastDocIDs[termID] = docState.docID;
+        fieldState.uniqueTermCount++;
       }
     } else {
       if (docState.docID != postings.lastDocIDs[termID]) {
@@ -171,6 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
         postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
         postings.lastDocIDs[termID] = docState.docID;
         writeProx(termID, fieldState.position);
+        fieldState.uniqueTermCount++;
       } else {
         fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
         writeProx(termID, fieldState.position-postings.lastPositions[termID]);
@@ -251,6 +254,8 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     final ByteSliceReader prox = new ByteSliceReader();
 
     long sumTotalTermFreq = 0;
+    long sumDocFreq = 0;
+
     for (int i = 0; i < numTerms; i++) {
       final int termID = termIDs[i];
       // Get BytesRef
@@ -389,9 +394,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       }
       termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
       sumTotalTermFreq += totTF;
+      sumDocFreq += numDocs;
     }
 
-    termsConsumer.finish(sumTotalTermFreq);
+    termsConsumer.finish(sumTotalTermFreq, sumDocFreq);
   }
 }


@@ -88,6 +88,19 @@ public final class MultiTerms extends Terms {
     }
     return sum;
   }
+
+  @Override
+  public long getSumDocFreq() throws IOException {
+    long sum = 0;
+    for(Terms terms : subs) {
+      final long v = terms.getSumDocFreq();
+      if (v == -1) {
+        return -1;
+      }
+      sum += v;
+    }
+    return sum;
+  }
 
   @Override
   public Comparator<BytesRef> getComparator() {


@@ -132,6 +132,13 @@ public abstract class Terms {
    *  into account. */
   public abstract long getSumTotalTermFreq() throws IOException;
 
+  /** Returns the sum of {@link #docFreq(BytesRef)} for
+   *  all terms in this field, or -1 if this measure isn't
+   *  stored by the codec. Note that, just like other term
+   *  measures, this measure does not take deleted documents
+   *  into account. */
+  public abstract long getSumDocFreq() throws IOException;
+
   /**
    * Returns a thread-private {@link TermsEnum} instance. Obtaining
    * {@link TermsEnum} from this method might be more efficient than using
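A consumption sketch, assuming a reader in scope and a hypothetical field named "body": callers must treat -1 as "statistic not stored by this codec", and can recompute the value by summing docFreq() over a TermsEnum, as the new TestSumDocFreq later in this commit does:

  Terms terms = MultiFields.getFields(reader).terms("body");
  long sumDocFreq = terms.getSumDocFreq();
  if (sumDocFreq != -1) { // -1: codec does not store this statistic
    long computed = 0;
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {
      computed += termsEnum.docFreq();
    }
    // should always match: neither value accounts for deleted documents
    assert computed == sumDocFreq;
  }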


@@ -137,8 +137,9 @@ public class BlockTermsReader extends FieldsProducer {
         final long termsStartPointer = in.readVLong();
         final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
         final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+        final long sumDocFreq = in.readVLong();
         assert !fields.containsKey(fieldInfo.name);
-        fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
+        fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq));
       }
       success = true;
     } finally {
@@ -245,13 +246,15 @@ public class BlockTermsReader extends FieldsProducer {
     final FieldInfo fieldInfo;
     final long termsStartPointer;
     final long sumTotalTermFreq;
+    final long sumDocFreq;
 
-    FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
+    FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) {
       assert numTerms > 0;
       this.fieldInfo = fieldInfo;
       this.numTerms = numTerms;
       this.termsStartPointer = termsStartPointer;
      this.sumTotalTermFreq = sumTotalTermFreq;
+      this.sumDocFreq = sumDocFreq;
     }
 
     @Override
@@ -279,6 +282,11 @@ public class BlockTermsReader extends FieldsProducer {
       return sumTotalTermFreq;
     }
 
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return sumDocFreq;
+    }
+
     // Iterates through terms in this field
     private final class SegmentTermsEnum extends TermsEnum {
       private final IndexInput in;


@@ -132,6 +132,7 @@ public class BlockTermsWriter extends FieldsConsumer {
           if (!field.fieldInfo.omitTermFreqAndPositions) {
             out.writeVLong(field.sumTotalTermFreq);
           }
+          out.writeVLong(field.sumDocFreq);
         }
       }
       writeTrailer(dirStart);
@@ -157,6 +158,7 @@ public class BlockTermsWriter extends FieldsConsumer {
     private long numTerms;
     private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
     long sumTotalTermFreq;
+    long sumDocFreq;
 
     private TermEntry[] pendingTerms;
@@ -231,7 +233,7 @@ public class BlockTermsWriter extends FieldsConsumer {
     // Finishes all terms in this field
     @Override
-    public void finish(long sumTotalTermFreq) throws IOException {
+    public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
       if (pendingCount > 0) {
         flushBlock();
       }
@@ -239,6 +241,7 @@ public class BlockTermsWriter extends FieldsConsumer {
       out.writeVInt(0);
       this.sumTotalTermFreq = sumTotalTermFreq;
+      this.sumDocFreq = sumDocFreq;
       fieldIndexWriter.finish(out.getFilePointer());
     }


@@ -41,7 +41,7 @@ public abstract class TermsConsumer {
   public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
 
   /** Called when we are done adding terms to this field */
-  public abstract void finish(long sumTotalTermFreq) throws IOException;
+  public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException;
 
   /** Return the BytesRef Comparator used to sort terms
    *  before feeding to this API. */
@@ -56,7 +56,8 @@ public abstract class TermsConsumer {
     BytesRef term;
     assert termsEnum != null;
     long sumTotalTermFreq = 0;
-    long sumDF = 0;
+    long sumDocFreq = 0;
+    long sumDFsinceLastAbortCheck = 0;
 
     if (mergeState.fieldInfo.omitTermFreqAndPositions) {
       if (docsEnum == null) {
@@ -74,10 +75,11 @@ public abstract class TermsConsumer {
         final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
         if (stats.docFreq > 0) {
           finishTerm(term, stats);
-          sumDF += stats.docFreq;
-          if (sumDF > 60000) {
-            mergeState.checkAbort.work(sumDF/5.0);
-            sumDF = 0;
+          sumDFsinceLastAbortCheck += stats.docFreq;
+          sumDocFreq += stats.docFreq;
+          if (sumDFsinceLastAbortCheck > 60000) {
+            mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
+            sumDFsinceLastAbortCheck = 0;
           }
         }
       }
@@ -105,16 +107,17 @@ public abstract class TermsConsumer {
         if (stats.docFreq > 0) {
           finishTerm(term, stats);
           sumTotalTermFreq += stats.totalTermFreq;
-          sumDF += stats.docFreq;
-          if (sumDF > 60000) {
-            mergeState.checkAbort.work(sumDF/5.0);
-            sumDF = 0;
+          sumDFsinceLastAbortCheck += stats.docFreq;
+          sumDocFreq += stats.docFreq;
+          if (sumDFsinceLastAbortCheck > 60000) {
+            mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
+            sumDFsinceLastAbortCheck = 0;
          }
        }
      }
    }
 
-    finish(sumTotalTermFreq);
+    finish(sumTotalTermFreq, sumDocFreq);
   }
 }


@@ -219,13 +219,14 @@ public class MemoryCodec extends Codec {
     }
 
     @Override
-    public void finish(long sumTotalTermFreq) throws IOException {
+    public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
       if (termCount > 0) {
         out.writeVInt(termCount);
         out.writeVInt(field.number);
         if (!field.omitTermFreqAndPositions) {
           out.writeVLong(sumTotalTermFreq);
         }
+        out.writeVLong(sumDocFreq);
         builder.finish().save(out);
         if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
       }
@@ -683,6 +684,7 @@ public class MemoryCodec extends Codec {
   private final static class TermsReader extends Terms {
     private final long sumTotalTermFreq;
+    private final long sumDocFreq;
     private FST<BytesRef> fst;
     private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
     private final FieldInfo field;
@@ -695,6 +697,7 @@ public class MemoryCodec extends Codec {
       } else {
         sumTotalTermFreq = 0;
       }
+      sumDocFreq = in.readVLong();
       fst = new FST<BytesRef>(in, outputs);
     }
@@ -704,6 +707,11 @@ public class MemoryCodec extends Codec {
       return sumTotalTermFreq;
     }
 
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return sumDocFreq;
+    }
+
     @Override
     public TermsEnum iterator() {
       return new FSTTermsEnum(field, fst);


@@ -266,6 +266,11 @@ public class PreFlexFields extends FieldsProducer {
     public long getSumTotalTermFreq() {
       return -1;
     }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return -1;
+    }
   }
 
   private class PreTermsEnum extends TermsEnum {


@@ -463,6 +463,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
     private final long termsStart;
     private final boolean omitTF;
     private long sumTotalTermFreq;
+    private long sumDocFreq;
     private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
     private int termCount;
     private final BytesRef scratch = new BytesRef(10);
@@ -500,6 +501,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
           break;
         } else if (scratch.startsWith(DOC)) {
           docFreq++;
+          sumDocFreq++;
         } else if (scratch.startsWith(POS)) {
          totalTermFreq++;
        } else if (scratch.startsWith(TERM)) {
@@ -554,6 +556,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
     public long getSumTotalTermFreq() {
       return sumTotalTermFreq;
     }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return sumDocFreq;
+    }
   }
 
   @Override


@@ -89,7 +89,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
     }
 
     @Override
-    public void finish(long sumTotalTermFreq) throws IOException {
+    public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException {
     }
 
     @Override


@@ -195,7 +195,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
     }
 
     @Override
-    public void finish(long sumTotalTermCount) throws IOException {
+    public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException {
     }
 
     @Override


@@ -102,6 +102,7 @@ public class TestExternalCodecs extends LuceneTestCase {
     final String field;
     final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
     long sumTotalTermFreq;
+    long sumDocFreq;
 
     RAMField(String field) {
       this.field = field;
@@ -116,6 +117,11 @@ public class TestExternalCodecs extends LuceneTestCase {
     public long getSumTotalTermFreq() {
       return sumTotalTermFreq;
     }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return sumDocFreq;
+    }
 
     @Override
     public TermsEnum iterator() {
@@ -204,8 +210,9 @@ public class TestExternalCodecs extends LuceneTestCase {
     }
 
     @Override
-    public void finish(long sumTotalTermFreq) {
+    public void finish(long sumTotalTermFreq, long sumDocFreq) {
       field.sumTotalTermFreq = sumTotalTermFreq;
+      field.sumDocFreq = sumDocFreq;
     }
   }


@@ -101,10 +101,12 @@ public class TestCodecs extends LuceneTestCase {
       Arrays.sort(terms);
       final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
       long sumTotalTermCount = 0;
+      long sumDF = 0;
       for (final TermData term : terms) {
+        sumDF += term.docs.length;
         sumTotalTermCount += term.write(termsConsumer);
       }
-      termsConsumer.finish(sumTotalTermCount);
+      termsConsumer.finish(sumTotalTermCount, sumDF);
     }
   }


@@ -0,0 +1,101 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Tests {@link Terms#getSumDocFreq()}
 * @lucene.experimental
 */
public class TestSumDocFreq extends LuceneTestCase {

  public void testSumDocFreq() throws Exception {
    final int numDocs = atLeast(500);

    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, dir);

    Document doc = new Document();
    Field field1 = newField("foo", "", Field.Index.ANALYZED);
    Field field2 = newField("bar", "", Field.Index.ANALYZED);
    doc.add(field1);
    doc.add(field2);

    for (int i = 0; i < numDocs; i++) {
      char ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
      char ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
      field1.setValue("" + ch1 + " " + ch2);
      ch1 = (char) _TestUtil.nextInt(random, 'a', 'z');
      ch2 = (char) _TestUtil.nextInt(random, 'a', 'z');
      field2.setValue("" + ch1 + " " + ch2);
      writer.addDocument(doc);
    }

    IndexReader ir = writer.getReader();
    writer.close();

    assertSumDocFreq(ir);
    ir.close();

    ir = IndexReader.open(dir, false);
    int numDeletions = atLeast(20);
    for (int i = 0; i < numDeletions; i++) {
      ir.deleteDocument(random.nextInt(ir.maxDoc()));
    }
    ir.close();

    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    w.optimize();
    w.close();

    ir = IndexReader.open(dir, true);
    assertSumDocFreq(ir);
    ir.close();

    dir.close();
  }

  private void assertSumDocFreq(IndexReader ir) throws Exception {
    // compute sumDocFreq across all fields
    Fields fields = MultiFields.getFields(ir);
    FieldsEnum fieldEnum = fields.iterator();
    String f = null;
    while ((f = fieldEnum.next()) != null) {
      Terms terms = fields.terms(f);
      long sumDocFreq = terms.getSumDocFreq();
      if (sumDocFreq == -1) {
        if (VERBOSE) {
          System.out.println("skipping field: " + f + ", codec does not support sumDocFreq");
        }
        continue;
      }

      long computedSumDocFreq = 0;
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        computedSumDocFreq += termsEnum.docFreq();
      }
      assertEquals(computedSumDocFreq, sumDocFreq);
    }
  }
}


@@ -0,0 +1,108 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.HashSet;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Tests the uniqueTermCount statistic in FieldInvertState
 */
public class TestUniqueTermCount extends LuceneTestCase {
  Directory dir;
  IndexReader reader;
  /* expected uniqueTermCount values for our documents */
  ArrayList<Integer> expected = new ArrayList<Integer>();

  @Override
  public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
        new MockAnalyzer(random, MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy());
    config.setSimilarityProvider(new DefaultSimilarityProvider() {
      @Override
      public Similarity get(String field) {
        return new TestSimilarity();
      }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
    Document doc = new Document();
    Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
    doc.add(foo);
    for (int i = 0; i < 100; i++) {
      foo.setValue(addValue());
      writer.addDocument(doc);
    }
    reader = writer.getReader();
    writer.close();
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    dir.close();
    super.tearDown();
  }

  public void test() throws Exception {
    byte fooNorms[] = MultiNorms.norms(reader, "foo");
    for (int i = 0; i < reader.maxDoc(); i++)
      assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
  }

  /**
   * Makes a bunch of single-char tokens (so there are at most 26 unique terms).
   * Puts the # of unique terms into expected, to be checked against the norm.
   */
  private String addValue() {
    StringBuilder sb = new StringBuilder();
    HashSet<String> terms = new HashSet<String>();
    int num = _TestUtil.nextInt(random, 0, 255);
    for (int i = 0; i < num; i++) {
      sb.append(' ');
      char term = (char) _TestUtil.nextInt(random, 'a', 'z');
      sb.append(term);
      terms.add("" + term);
    }
    expected.add(terms.size());
    return sb.toString();
  }

  /**
   * Simple similarity that encodes uniqueTermCount directly as a byte
   */
  class TestSimilarity extends DefaultSimilarity {
    @Override
    public byte computeNorm(FieldInvertState state) {
      return (byte) state.getUniqueTermCount();
    }
  }
}