LUCENE-3473: CheckIndex should verify numUniqueTerms == recomputedNumUniqueTerms

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188455 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-10-25 00:15:43 +00:00
parent 2c6afec00e
commit 51d010010c
18 changed files with 127 additions and 41 deletions


@@ -184,6 +184,11 @@ Changes in backwards compatibility policy
   with the old tokenStream() method removed. Consequently it is now mandatory
   for all Analyzers to support reusability. (Chris Male)
+* LUCENE-3473: IndexReader.getUniqueTermCount() no longer throws UOE when
+  it cannot be easily determined (e.g. Multi*Readers). Instead, it returns
+  -1 to be consistent with this behavior across other index statistics.
+  (Robert Muir)
 Changes in Runtime Behavior
 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you

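Because of the behavior change described in the CHANGES entry above, callers that previously caught UnsupportedOperationException from IndexReader.getUniqueTermCount() now check for a -1 sentinel instead. The following is only an illustrative sketch, not code from this commit; the Directory argument is assumed to point at an existing index.

    // Hypothetical helper, for illustration only: handles the new -1 sentinel
    // rather than catching UnsupportedOperationException.
    static void reportUniqueTerms(org.apache.lucene.store.Directory dir) throws java.io.IOException {
      org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(dir);
      try {
        long uniqueTerms = reader.getUniqueTermCount();
        if (uniqueTerms == -1) {
          // count cannot be easily determined (e.g. a Multi*Reader over several segments)
          System.out.println("unique term count not available at the top level");
        } else {
          System.out.println("unique terms: " + uniqueTerms);
        }
      } finally {
        reader.close();
      }
    }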

@@ -407,6 +407,11 @@ public class InstantiatedIndexReader extends IndexReader {
       return -1;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
     @Override
     public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();


@@ -945,20 +945,20 @@ public class CheckIndex {
         is.search(new TermQuery(new Term(field, lastTerm)), 1);
       }
-      // Test seeking by ord
-      if (hasOrd && status.termCount-termCountStart > 0) {
-        long termCount;
-        try {
-          termCount = fields.terms(field).getUniqueTermCount();
-        } catch (UnsupportedOperationException uoe) {
-          termCount = -1;
-        }
+      // check unique term count
+      long termCount = -1;
+      if (status.termCount-termCountStart > 0) {
+        termCount = fields.terms(field).getUniqueTermCount();
         if (termCount != -1 && termCount != status.termCount - termCountStart) {
           throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
         }
+      }
+      // Test seeking by ord
+      if (hasOrd && status.termCount-termCountStart > 0) {
         int seekCount = (int) Math.min(10000L, termCount);
         if (seekCount > 0) {
           BytesRef[] seekTerms = new BytesRef[seekCount];
@@ -1001,6 +1001,21 @@ public class CheckIndex {
       }
     }
+    // for most implementations, this is boring (just the sum across all fields)
+    // but codecs that don't work per-field like preflex actually implement this,
+    // but don't implement it on Terms, so the check isn't redundant.
+    long uniqueTermCountAllFields = reader.getUniqueTermCount();
+    // this means something is seriously screwed, e.g. we are somehow getting enclosed in PFCW!!!!!!
+    if (uniqueTermCountAllFields == -1) {
+      throw new RuntimeException("invalid termCount: -1");
+    }
+    if (status.termCount != uniqueTermCountAllFields) {
+      throw new RuntimeException("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.termCount));
+    }
     msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
     if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) {

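The per-field and cross-field unique term count checks above run as part of CheckIndex's term index test. A minimal sketch of exercising them, assuming a Directory named dir that holds the index to verify (illustrative only, not part of this commit):

    CheckIndex checker = new CheckIndex(dir);        // dir: assumed Directory
    checker.setInfoStream(System.out);               // print per-segment diagnostics
    CheckIndex.Status status = checker.checkIndex();
    if (!status.clean) {
      System.out.println("index has problems; see diagnostics above");
    }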

@@ -818,7 +818,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
   @Override
   public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("");
+    return -1;
   }
   @Override


@@ -32,5 +32,31 @@ public abstract class Fields {
    * null if the field does not exist. */
   public abstract Terms terms(String field) throws IOException;
+  /** Returns the number of terms for all fields, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  // TODO: deprecate?
+  public long getUniqueTermCount() throws IOException {
+    long numTerms = 0;
+    FieldsEnum it = iterator();
+    while(true) {
+      String field = it.next();
+      if (field == null) {
+        break;
+      }
+      Terms terms = terms(field);
+      if (terms != null) {
+        final long termCount = terms.getUniqueTermCount();
+        if (termCount == -1) {
+          return -1;
+        }
+        numTerms += termCount;
+      }
+    }
+    return numTerms;
+  }
   public final static Fields[] EMPTY_ARRAY = new Fields[0];
 }

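A short usage sketch for the new Fields.getUniqueTermCount() helper (illustrative only; reader is an assumed, already-open IndexReader). It returns -1 as soon as any field's codec does not store a term count:

    // Illustrative only: MultiFields.getFields(reader) may return null if the
    // reader has no postings; -1 means at least one field's count is unavailable.
    Fields fields = MultiFields.getFields(reader);
    long totalTerms = (fields == null) ? 0 : fields.getUniqueTermCount();
    if (totalTerms != -1) {
      System.out.println("terms across all fields: " + totalTerms);
    }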

@@ -1589,26 +1589,17 @@ public abstract class IndexReader implements Cloneable,Closeable {
   /** Returns the number of unique terms (across all fields)
    * in this reader.
    *
-   * @throws UnsupportedOperationException if this count
+   * @return number of unique terms or -1 if this count
    * cannot be easily determined (eg Multi*Readers).
    * Instead, you should call {@link
    * #getSequentialSubReaders} and ask each sub reader for
    * its unique term count. */
   public long getUniqueTermCount() throws IOException {
-    long numTerms = 0;
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    FieldsEnum it = fields.iterator();
-    while(true) {
-      String field = it.next();
-      if (field == null) {
-        break;
-      }
-      numTerms += fields.terms(field).getUniqueTermCount();
-    }
-    return numTerms;
+    return fields.getUniqueTermCount();
   }
   /** For IndexReader implementations that use

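When the top-level reader reports -1, the javadoc above suggests asking each sequential sub reader instead. A hedged sketch, assuming reader is an already-open composite IndexReader (not code from this commit):

    // Illustrative only: per-sub-reader counts stay available even when the
    // composite reader cannot report a single figure.
    long count = reader.getUniqueTermCount();
    if (count == -1) {
      for (IndexReader sub : reader.getSequentialSubReaders()) {
        System.out.println(sub + ": " + sub.getUniqueTermCount() + " unique terms");
      }
    } else {
      System.out.println(count + " unique terms");
    }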

@@ -95,6 +95,11 @@ public final class MultiTerms extends Terms {
     }
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
   @Override
   public long getSumTotalTermFreq() throws IOException {
     long sum = 0;


@@ -569,7 +569,9 @@ public class ParallelReader extends IndexReader {
   void addField(String field, IndexReader r) throws IOException {
     PerDocValues perDocs = MultiPerDocValues.getPerDocs(r);
-    fields.put(field, perDocs.docValues(field));
+    if (perDocs != null) {
+      fields.put(field, perDocs.docValues(field));
+    }
   }
   @Override


@@ -25,6 +25,7 @@ import java.util.Set;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -65,7 +66,7 @@ final class SegmentCodecs implements Cloneable {
    */
   final Codec[] codecs;
   final CodecProvider provider;
-  private final Codec codec = new PerFieldCodecWrapper(this);
+  private final Codec codec;
   SegmentCodecs(CodecProvider provider, IndexInput input) throws IOException {
     this(provider, read(input, provider));
@@ -74,6 +75,11 @@ final class SegmentCodecs implements Cloneable {
   SegmentCodecs(CodecProvider provider, Codec... codecs) {
     this.provider = provider;
     this.codecs = codecs;
+    if (codecs.length == 1 && codecs[0] instanceof PreFlexCodec) {
+      this.codec = codecs[0]; // hack for backwards break... don't wrap the codec in preflex
+    } else {
+      this.codec = new PerFieldCodecWrapper(this);
+    }
   }
   Codec codec() {


@@ -586,6 +586,11 @@ final class SegmentMerger {
   private void mergePerDoc() throws IOException {
     final PerDocConsumer docsConsumer = codec
         .docsConsumer(new PerDocWriteState(segmentWriteState));
+    // TODO: remove this check when 3.x indexes are no longer supported
+    // (3.x indexes don't have docvalues)
+    if (docsConsumer == null) {
+      return;
+    }
     boolean success = false;
     try {
       docsConsumer.merge(mergeState);


@@ -155,10 +155,12 @@ public abstract class Terms {
     return termsEnum.docsAndPositions(liveDocs, reuse);
   }
-  public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
-  }
+  /** Returns the number of terms for this field, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  public abstract long getUniqueTermCount() throws IOException;
   /** Returns the sum of {@link TermsEnum#totalTermFreq} for
    *  all terms in this field, or -1 if this measure isn't
    *  stored by the codec (or if this fields omits term freq


@@ -684,11 +684,13 @@ public class MemoryCodec extends Codec {
     private final long sumTotalTermFreq;
     private final long sumDocFreq;
     private final int docCount;
+    private final int termCount;
     private FST<BytesRef> fst;
     private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
     private final FieldInfo field;
-    public TermsReader(FieldInfos fieldInfos, IndexInput in) throws IOException {
+    public TermsReader(FieldInfos fieldInfos, IndexInput in, int termCount) throws IOException {
+      this.termCount = termCount;
       final int fieldNumber = in.readVInt();
       field = fieldInfos.fieldInfo(fieldNumber);
       if (field.indexOptions != IndexOptions.DOCS_ONLY) {
@@ -717,6 +719,11 @@ public class MemoryCodec extends Codec {
       return docCount;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return termCount;
+    }
     @Override
     public TermsEnum iterator() {
       return new FSTTermsEnum(field, fst);
@@ -741,7 +748,7 @@ public class MemoryCodec extends Codec {
         if (termCount == 0) {
           break;
         }
-        final TermsReader termsReader = new TermsReader(state.fieldInfos, in);
+        final TermsReader termsReader = new TermsReader(state.fieldInfos, in, termCount);
         fields.put(termsReader.field.name, termsReader);
       }
     } finally {


@@ -84,11 +84,11 @@ public class PreFlexCodec extends Codec {
   @Override
   public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocConsumer is not supported by Preflex codec");
+    return null;
   }
   @Override
   public PerDocValues docsProducer(SegmentReadState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocValues is not supported by Preflex codec");
+    return null;
   }
 }


@@ -162,6 +162,11 @@ public class PreFlexFields extends FieldsProducer {
     return preTerms.get(field);
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return getTermsDict().size();
+  }
   synchronized private TermInfosReader getTermsDict() {
     if (tis != null) {
       return tis;
@@ -240,6 +245,11 @@ public class PreFlexFields extends FieldsProducer {
     }
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
   @Override
   public long getSumTotalTermFreq() {
     return -1;


@@ -1201,7 +1201,7 @@ public class TestIndexReader extends LuceneTestCase
   // LUCENE-1586: getUniqueTermCount
   public void testUniqueTermCount() throws Exception {
     Directory dir = newDirectory();
-    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
     Document doc = new Document();
     doc.add(newField("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", TextField.TYPE_UNSTORED));
     doc.add(newField("number", "0 1 2 3 4 5 6 7 8 9", TextField.TYPE_UNSTORED));
@@ -1217,12 +1217,8 @@ public class TestIndexReader extends LuceneTestCase
     IndexReader r2 = IndexReader.openIfChanged(r);
     assertNotNull(r2);
     r.close();
-    try {
-      r2.getUniqueTermCount();
-      fail("expected exception");
-    } catch (UnsupportedOperationException uoe) {
-      // expected
-    }
+    assertEquals(-1, r2.getUniqueTermCount());
     IndexReader[] subs = r2.getSequentialSubReaders();
     for(int i=0;i<subs.length;i++) {
       assertEquals(36, subs[i].getUniqueTermCount());


@@ -37,7 +37,7 @@ public class TestRollingUpdates extends LuceneTestCase {
     CodecProvider provider = CodecProvider.getDefault();
     //provider.register(new MemoryCodec());
-    if (random.nextBoolean()) {
+    if ( (!"PreFlex".equals(provider.getDefaultFieldCodec())) && random.nextBoolean()) {
       provider.setFieldCodec("docid", "Memory");
     }


@@ -139,6 +139,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
       public int getDocCount() throws IOException {
         return -1;
       }
+      @Override
+      public long getUniqueTermCount() throws IOException {
+        return -1;
+      }
     });
     assert termsEnum != null;


@@ -203,11 +203,17 @@ public class SpoofIndexSearcher extends IndexSearcher {
     // ------------------------ Not implemented methods ------------------------
     @Override
     public TermsEnum iterator() throws IOException {
       return null;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
     @Override
     public Comparator<BytesRef> getComparator() throws IOException {
       return null;