LUCENE-3473: CheckIndex should verify numUniqueTerms == recomputedNumUniqueTerms
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188455 13f79535-47bb-0310-9956-ffa450edef68
commit 51d010010c
parent 2c6afec00e
@@ -184,6 +184,11 @@ Changes in backwards compatibility policy
   with the old tokenStream() method removed. Consequently it is now mandatory
   for all Analyzers to support reusability. (Chris Male)
 
+* LUCENE-3473: IndexReader.getUniqueTermCount() no longer throws UOE when
+  it cannot be easily determined (e.g. Multi*Readers). Instead, it returns
+  -1 to be consistent with this behavior across other index statistics.
+  (Robert Muir)
+
 Changes in Runtime Behavior
 
 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
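Aside (not part of the commit): a minimal sketch of how calling code adapts to this change. Under the old contract callers caught UnsupportedOperationException; under the new one they test for the -1 sentinel and, as the IndexReader javadoc later in this patch suggests, fall back to asking each sequential sub reader. The helper class and method names are hypothetical; the reader methods are the 4.0-era APIs shown in this diff.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;

// Hypothetical helper, not from the patch.
final class UniqueTermCounts {
  /** Returns the unique term count, summing over the sequential sub readers
   *  when a composite reader (e.g. Multi*Readers) reports the -1 sentinel. */
  static long uniqueTermCount(IndexReader reader) throws IOException {
    long count = reader.getUniqueTermCount();
    if (count != -1) {
      return count; // determined directly (e.g. a single segment)
    }
    long sum = 0;
    for (IndexReader sub : reader.getSequentialSubReaders()) {
      long subCount = sub.getUniqueTermCount();
      if (subCount == -1) {
        return -1; // still cannot be easily determined
      }
      sum += subCount;
    }
    return sum;
  }
}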
@@ -407,6 +407,11 @@ public class InstantiatedIndexReader extends IndexReader {
     return -1;
   }
 
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
+
   @Override
   public Comparator<BytesRef> getComparator() {
     return BytesRef.getUTF8SortedAsUnicodeComparator();
@@ -945,20 +945,20 @@ public class CheckIndex {
 
         is.search(new TermQuery(new Term(field, lastTerm)), 1);
       }
 
-      // Test seeking by ord
-      if (hasOrd && status.termCount-termCountStart > 0) {
-        long termCount;
-        try {
-          termCount = fields.terms(field).getUniqueTermCount();
-        } catch (UnsupportedOperationException uoe) {
-          termCount = -1;
-        }
+      // check unique term count
+      long termCount = -1;
+
+      if (status.termCount-termCountStart > 0) {
+        termCount = fields.terms(field).getUniqueTermCount();
+
+        if (termCount != -1 && termCount != status.termCount - termCountStart) {
+          throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
+        }
+      }
 
+      // Test seeking by ord
+      if (hasOrd && status.termCount-termCountStart > 0) {
         int seekCount = (int) Math.min(10000L, termCount);
         if (seekCount > 0) {
           BytesRef[] seekTerms = new BytesRef[seekCount];
@@ -1001,6 +1001,21 @@ public class CheckIndex {
       }
     }
 
+    // for most implementations, this is boring (just the sum across all fields)
+    // but codecs that don't work per-field like preflex actually implement this,
+    // but don't implement it on Terms, so the check isn't redundant.
+    long uniqueTermCountAllFields = reader.getUniqueTermCount();
+
+    // this means something is seriously screwed, e.g. we are somehow getting enclosed in PFCW!!!!!!
+
+    if (uniqueTermCountAllFields == -1) {
+      throw new RuntimeException("invalid termCount: -1");
+    }
+
+    if (status.termCount != uniqueTermCountAllFields) {
+      throw new RuntimeException("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.termCount));
+    }
+
     msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
 
     if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) {
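Aside (not part of the commit): the comment in the hunk above carries the reasoning of LUCENE-3473. A per-field codec gets Fields.getUniqueTermCount() for free as the sum over each field's Terms, but a codec like preflex implements it directly on Fields (via the terms dictionary size, see the PreFlexFields hunk below) while its per-field Terms return -1, so comparing the reader-wide count against the recomputed per-field total is a real cross-check, not a tautology. A minimal standalone sketch of that invariant, with a hypothetical class name and using the Fields/Terms APIs added in this patch:

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.Terms;

// Hypothetical cross-check, not from the patch: recompute the total unique
// term count field by field and compare it with the codec-reported total.
final class TermCountCrossCheck {
  static void check(Fields fields) throws IOException {
    long recomputed = 0;
    boolean known = true;
    FieldsEnum it = fields.iterator();
    String field;
    while ((field = it.next()) != null) {
      Terms terms = fields.terms(field);
      if (terms == null) {
        continue;
      }
      long perField = terms.getUniqueTermCount();
      if (perField == -1) {
        known = false; // this codec doesn't store the measure per field
        break;
      }
      recomputed += perField;
    }
    long reported = fields.getUniqueTermCount();
    if (known && reported != -1 && reported != recomputed) {
      throw new RuntimeException("termCount mismatch " + reported + " vs " + recomputed);
    }
  }
}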
@@ -818,7 +818,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
 
   @Override
   public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("");
+    return -1;
   }
 
   @Override
@@ -32,5 +32,31 @@ public abstract class Fields {
    *  null if the field does not exist. */
   public abstract Terms terms(String field) throws IOException;
 
+  /** Returns the number of terms for all fields, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  // TODO: deprecate?
+  public long getUniqueTermCount() throws IOException {
+    long numTerms = 0;
+    FieldsEnum it = iterator();
+    while(true) {
+      String field = it.next();
+      if (field == null) {
+        break;
+      }
+      Terms terms = terms(field);
+      if (terms != null) {
+        final long termCount = terms.getUniqueTermCount();
+        if (termCount == -1) {
+          return -1;
+        }
+
+        numTerms += termCount;
+      }
+    }
+    return numTerms;
+  }
+
   public final static Fields[] EMPTY_ARRAY = new Fields[0];
 }
@@ -1589,26 +1589,17 @@ public abstract class IndexReader implements Cloneable,Closeable {
   /** Returns the number of unique terms (across all fields)
    *  in this reader.
    *
-   *  @throws UnsupportedOperationException if this count
+   *  @return number of unique terms or -1 if this count
    *  cannot be easily determined (eg Multi*Readers).
    *  Instead, you should call {@link
    *  #getSequentialSubReaders} and ask each sub reader for
    *  its unique term count. */
   public long getUniqueTermCount() throws IOException {
-    long numTerms = 0;
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    FieldsEnum it = fields.iterator();
-    while(true) {
-      String field = it.next();
-      if (field == null) {
-        break;
-      }
-      numTerms += fields.terms(field).getUniqueTermCount();
-    }
-    return numTerms;
+    return fields.getUniqueTermCount();
   }
 
   /** For IndexReader implementations that use
@@ -95,6 +95,11 @@ public final class MultiTerms extends Terms {
     }
   }
 
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
+
   @Override
   public long getSumTotalTermFreq() throws IOException {
     long sum = 0;
@@ -569,7 +569,9 @@ public class ParallelReader extends IndexReader {
 
   void addField(String field, IndexReader r) throws IOException {
     PerDocValues perDocs = MultiPerDocValues.getPerDocs(r);
-    fields.put(field, perDocs.docValues(field));
+    if (perDocs != null) {
+      fields.put(field, perDocs.docValues(field));
+    }
   }
 
   @Override
@@ -25,6 +25,7 @@ import java.util.Set;
 
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -65,7 +66,7 @@ final class SegmentCodecs implements Cloneable {
    */
   final Codec[] codecs;
   final CodecProvider provider;
-  private final Codec codec = new PerFieldCodecWrapper(this);
+  private final Codec codec;
 
   SegmentCodecs(CodecProvider provider, IndexInput input) throws IOException {
     this(provider, read(input, provider));
@@ -74,6 +75,11 @@ final class SegmentCodecs implements Cloneable {
   SegmentCodecs(CodecProvider provider, Codec... codecs) {
     this.provider = provider;
     this.codecs = codecs;
+    if (codecs.length == 1 && codecs[0] instanceof PreFlexCodec) {
+      this.codec = codecs[0]; // hack for backwards break... don't wrap the codec in preflex
+    } else {
+      this.codec = new PerFieldCodecWrapper(this);
+    }
   }
 
   Codec codec() {
@@ -586,6 +586,11 @@ final class SegmentMerger {
   private void mergePerDoc() throws IOException {
     final PerDocConsumer docsConsumer = codec
         .docsConsumer(new PerDocWriteState(segmentWriteState));
+    // TODO: remove this check when 3.x indexes are no longer supported
+    // (3.x indexes don't have docvalues)
+    if (docsConsumer == null) {
+      return;
+    }
     boolean success = false;
     try {
       docsConsumer.merge(mergeState);
@@ -155,10 +155,12 @@ public abstract class Terms {
     return termsEnum.docsAndPositions(liveDocs, reuse);
   }
 
-  public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
-  }
+  /** Returns the number of terms for this field, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  public abstract long getUniqueTermCount() throws IOException;
 
   /** Returns the sum of {@link TermsEnum#totalTermFreq} for
    *  all terms in this field, or -1 if this measure isn't
    *  stored by the codec (or if this fields omits term freq
@@ -684,11 +684,13 @@ public class MemoryCodec extends Codec {
     private final long sumTotalTermFreq;
     private final long sumDocFreq;
     private final int docCount;
+    private final int termCount;
     private FST<BytesRef> fst;
     private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
     private final FieldInfo field;
 
-    public TermsReader(FieldInfos fieldInfos, IndexInput in) throws IOException {
+    public TermsReader(FieldInfos fieldInfos, IndexInput in, int termCount) throws IOException {
+      this.termCount = termCount;
       final int fieldNumber = in.readVInt();
       field = fieldInfos.fieldInfo(fieldNumber);
       if (field.indexOptions != IndexOptions.DOCS_ONLY) {
@@ -717,6 +719,11 @@ public class MemoryCodec extends Codec {
       return docCount;
     }
 
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return termCount;
+    }
+
     @Override
     public TermsEnum iterator() {
       return new FSTTermsEnum(field, fst);
@@ -741,7 +748,7 @@ public class MemoryCodec extends Codec {
         if (termCount == 0) {
           break;
         }
-        final TermsReader termsReader = new TermsReader(state.fieldInfos, in);
+        final TermsReader termsReader = new TermsReader(state.fieldInfos, in, termCount);
         fields.put(termsReader.field.name, termsReader);
       }
     } finally {
@@ -84,11 +84,11 @@ public class PreFlexCodec extends Codec {
 
   @Override
   public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocConsumer is not supported by Preflex codec");
+    return null;
   }
 
   @Override
   public PerDocValues docsProducer(SegmentReadState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocValues is not supported by Preflex codec");
+    return null;
   }
 }
@@ -162,6 +162,11 @@ public class PreFlexFields extends FieldsProducer {
     return preTerms.get(field);
   }
 
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return getTermsDict().size();
+  }
+
   synchronized private TermInfosReader getTermsDict() {
     if (tis != null) {
       return tis;
@@ -240,6 +245,11 @@ public class PreFlexFields extends FieldsProducer {
       }
     }
 
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
+
     @Override
     public long getSumTotalTermFreq() {
       return -1;
@@ -1201,7 +1201,7 @@ public class TestIndexReader extends LuceneTestCase
   // LUCENE-1586: getUniqueTermCount
   public void testUniqueTermCount() throws Exception {
     Directory dir = newDirectory();
-    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
     Document doc = new Document();
     doc.add(newField("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", TextField.TYPE_UNSTORED));
     doc.add(newField("number", "0 1 2 3 4 5 6 7 8 9", TextField.TYPE_UNSTORED));
@@ -1217,12 +1217,8 @@ public class TestIndexReader extends LuceneTestCase
     IndexReader r2 = IndexReader.openIfChanged(r);
     assertNotNull(r2);
     r.close();
-    try {
-      r2.getUniqueTermCount();
-      fail("expected exception");
-    } catch (UnsupportedOperationException uoe) {
-      // expected
-    }
+    assertEquals(-1, r2.getUniqueTermCount());
+
     IndexReader[] subs = r2.getSequentialSubReaders();
     for(int i=0;i<subs.length;i++) {
       assertEquals(36, subs[i].getUniqueTermCount());
@@ -37,7 +37,7 @@ public class TestRollingUpdates extends LuceneTestCase {
 
     CodecProvider provider = CodecProvider.getDefault();
     //provider.register(new MemoryCodec());
-    if (random.nextBoolean()) {
+    if ( (!"PreFlex".equals(provider.getDefaultFieldCodec())) && random.nextBoolean()) {
      provider.setFieldCodec("docid", "Memory");
    }
 
@@ -139,6 +139,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
       public int getDocCount() throws IOException {
         return -1;
       }
+
+      @Override
+      public long getUniqueTermCount() throws IOException {
+        return -1;
+      }
     });
 
     assert termsEnum != null;
@@ -203,11 +203,17 @@ public class SpoofIndexSearcher extends IndexSearcher {
 
     // ------------------------ Not implemented methods ------------------------
 
     @Override
     public TermsEnum iterator() throws IOException {
       return null;
     }
 
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
+
     @Override
     public Comparator<BytesRef> getComparator() throws IOException {
       return null;