LUCENE-3473: CheckIndex should verify numUniqueTerms == recomputedNumUniqueTerms

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188455 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-10-25 00:15:43 +00:00
parent 2c6afec00e
commit 51d010010c
18 changed files with 127 additions and 41 deletions


@@ -184,6 +184,11 @@ Changes in backwards compatibility policy
   with the old tokenStream() method removed. Consequently it is now mandatory
   for all Analyzers to support reusability. (Chris Male)
+* LUCENE-3473: IndexReader.getUniqueTermCount() no longer throws UOE when
+  it cannot be easily determined (e.g. Multi*Readers). Instead, it returns
+  -1 to be consistent with this behavior across other index statistics.
+  (Robert Muir)
 Changes in Runtime Behavior
 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you

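Because of the behavior change described in the CHANGES entry above, callers that previously caught UnsupportedOperationException from IndexReader.getUniqueTermCount() now check for a -1 sentinel instead. The following is only an illustrative sketch, not code from this commit; the Directory argument is assumed to point at an existing index.

    // Hypothetical helper, for illustration only: handles the new -1 sentinel
    // rather than catching UnsupportedOperationException.
    static void reportUniqueTerms(org.apache.lucene.store.Directory dir) throws java.io.IOException {
      org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(dir);
      try {
        long uniqueTerms = reader.getUniqueTermCount();
        if (uniqueTerms == -1) {
          // count cannot be easily determined (e.g. a Multi*Reader over several segments)
          System.out.println("unique term count not available at the top level");
        } else {
          System.out.println("unique terms: " + uniqueTerms);
        }
      } finally {
        reader.close();
      }
    }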

@@ -407,6 +407,11 @@ public class InstantiatedIndexReader extends IndexReader {
       return -1;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
     @Override
     public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();


@@ -945,20 +945,20 @@ public class CheckIndex {
         is.search(new TermQuery(new Term(field, lastTerm)), 1);
       }
-      // Test seeking by ord
-      if (hasOrd && status.termCount-termCountStart > 0) {
-        long termCount;
-        try {
-          termCount = fields.terms(field).getUniqueTermCount();
-        } catch (UnsupportedOperationException uoe) {
-          termCount = -1;
-        }
+      // check unique term count
+      long termCount = -1;
+      if (status.termCount-termCountStart > 0) {
+        termCount = fields.terms(field).getUniqueTermCount();
         if (termCount != -1 && termCount != status.termCount - termCountStart) {
           throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
         }
+      }
+      // Test seeking by ord
+      if (hasOrd && status.termCount-termCountStart > 0) {
         int seekCount = (int) Math.min(10000L, termCount);
         if (seekCount > 0) {
           BytesRef[] seekTerms = new BytesRef[seekCount];
@@ -1001,6 +1001,21 @@ public class CheckIndex {
       }
     }
+    // for most implementations, this is boring (just the sum across all fields)
+    // but codecs that don't work per-field like preflex actually implement this,
+    // but don't implement it on Terms, so the check isn't redundant.
+    long uniqueTermCountAllFields = reader.getUniqueTermCount();
+    // this means something is seriously screwed, e.g. we are somehow getting enclosed in PFCW!!!!!!
+    if (uniqueTermCountAllFields == -1) {
+      throw new RuntimeException("invalid termCount: -1");
+    }
+    if (status.termCount != uniqueTermCountAllFields) {
+      throw new RuntimeException("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.termCount));
+    }
     msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
     if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) {

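The per-field and cross-field unique term count checks above run as part of CheckIndex's term index test. A minimal sketch of exercising them, assuming a Directory named dir that holds the index to verify (illustrative only, not part of this commit):

    CheckIndex checker = new CheckIndex(dir);        // dir: assumed Directory
    checker.setInfoStream(System.out);               // print per-segment diagnostics
    CheckIndex.Status status = checker.checkIndex();
    if (!status.clean) {
      System.out.println("index has problems; see diagnostics above");
    }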

@@ -818,7 +818,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
   @Override
   public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("");
+    return -1;
   }
   @Override


@@ -32,5 +32,31 @@ public abstract class Fields {
    * null if the field does not exist. */
   public abstract Terms terms(String field) throws IOException;
+  /** Returns the number of terms for all fields, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  // TODO: deprecate?
+  public long getUniqueTermCount() throws IOException {
+    long numTerms = 0;
+    FieldsEnum it = iterator();
+    while(true) {
+      String field = it.next();
+      if (field == null) {
+        break;
+      }
+      Terms terms = terms(field);
+      if (terms != null) {
+        final long termCount = terms.getUniqueTermCount();
+        if (termCount == -1) {
+          return -1;
+        }
+        numTerms += termCount;
+      }
+    }
+    return numTerms;
+  }
   public final static Fields[] EMPTY_ARRAY = new Fields[0];
 }

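A short usage sketch for the new Fields.getUniqueTermCount() helper (illustrative only; reader is an assumed, already-open IndexReader). It returns -1 as soon as any field's codec does not store a term count:

    // Illustrative only: MultiFields.getFields(reader) may return null if the
    // reader has no postings; -1 means at least one field's count is unavailable.
    Fields fields = MultiFields.getFields(reader);
    long totalTerms = (fields == null) ? 0 : fields.getUniqueTermCount();
    if (totalTerms != -1) {
      System.out.println("terms across all fields: " + totalTerms);
    }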

@@ -1589,26 +1589,17 @@ public abstract class IndexReader implements Cloneable,Closeable {
   /** Returns the number of unique terms (across all fields)
    * in this reader.
    *
-   * @throws UnsupportedOperationException if this count
+   * @return number of unique terms or -1 if this count
    * cannot be easily determined (eg Multi*Readers).
    * Instead, you should call {@link
    * #getSequentialSubReaders} and ask each sub reader for
    * its unique term count. */
   public long getUniqueTermCount() throws IOException {
-    long numTerms = 0;
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    FieldsEnum it = fields.iterator();
-    while(true) {
-      String field = it.next();
-      if (field == null) {
-        break;
-      }
-      numTerms += fields.terms(field).getUniqueTermCount();
-    }
-    return numTerms;
+    return fields.getUniqueTermCount();
   }
   /** For IndexReader implementations that use

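When the top-level reader reports -1, the javadoc above suggests asking each sequential sub reader instead. A hedged sketch, assuming reader is an already-open composite IndexReader (not code from this commit):

    // Illustrative only: per-sub-reader counts stay available even when the
    // composite reader cannot report a single figure.
    long count = reader.getUniqueTermCount();
    if (count == -1) {
      for (IndexReader sub : reader.getSequentialSubReaders()) {
        System.out.println(sub + ": " + sub.getUniqueTermCount() + " unique terms");
      }
    } else {
      System.out.println(count + " unique terms");
    }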

@@ -95,6 +95,11 @@ public final class MultiTerms extends Terms {
     }
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
   @Override
   public long getSumTotalTermFreq() throws IOException {
     long sum = 0;


@@ -569,7 +569,9 @@ public class ParallelReader extends IndexReader {
   void addField(String field, IndexReader r) throws IOException {
     PerDocValues perDocs = MultiPerDocValues.getPerDocs(r);
-    fields.put(field, perDocs.docValues(field));
+    if (perDocs != null) {
+      fields.put(field, perDocs.docValues(field));
+    }
   }
   @Override


@@ -25,6 +25,7 @@ import java.util.Set;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -65,7 +66,7 @@ final class SegmentCodecs implements Cloneable {
    */
   final Codec[] codecs;
   final CodecProvider provider;
-  private final Codec codec = new PerFieldCodecWrapper(this);
+  private final Codec codec;
   SegmentCodecs(CodecProvider provider, IndexInput input) throws IOException {
     this(provider, read(input, provider));
@@ -74,6 +75,11 @@ final class SegmentCodecs implements Cloneable {
   SegmentCodecs(CodecProvider provider, Codec... codecs) {
     this.provider = provider;
     this.codecs = codecs;
+    if (codecs.length == 1 && codecs[0] instanceof PreFlexCodec) {
+      this.codec = codecs[0]; // hack for backwards break... don't wrap the codec in preflex
+    } else {
+      this.codec = new PerFieldCodecWrapper(this);
+    }
   }
   Codec codec() {


@@ -586,6 +586,11 @@ final class SegmentMerger {
   private void mergePerDoc() throws IOException {
     final PerDocConsumer docsConsumer = codec
         .docsConsumer(new PerDocWriteState(segmentWriteState));
+    // TODO: remove this check when 3.x indexes are no longer supported
+    // (3.x indexes don't have docvalues)
+    if (docsConsumer == null) {
+      return;
+    }
     boolean success = false;
     try {
       docsConsumer.merge(mergeState);


@@ -155,10 +155,12 @@ public abstract class Terms {
     return termsEnum.docsAndPositions(liveDocs, reuse);
   }
-  public long getUniqueTermCount() throws IOException {
-    throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
-  }
+  /** Returns the number of terms for this field, or -1 if this
+   *  measure isn't stored by the codec. Note that, just like
+   *  other term measures, this measure does not take deleted
+   *  documents into account. */
+  public abstract long getUniqueTermCount() throws IOException;
   /** Returns the sum of {@link TermsEnum#totalTermFreq} for
    *  all terms in this field, or -1 if this measure isn't
    *  stored by the codec (or if this fields omits term freq


@@ -684,11 +684,13 @@ public class MemoryCodec extends Codec {
     private final long sumTotalTermFreq;
     private final long sumDocFreq;
     private final int docCount;
+    private final int termCount;
     private FST<BytesRef> fst;
     private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
     private final FieldInfo field;
-    public TermsReader(FieldInfos fieldInfos, IndexInput in) throws IOException {
+    public TermsReader(FieldInfos fieldInfos, IndexInput in, int termCount) throws IOException {
+      this.termCount = termCount;
       final int fieldNumber = in.readVInt();
       field = fieldInfos.fieldInfo(fieldNumber);
       if (field.indexOptions != IndexOptions.DOCS_ONLY) {
@@ -717,6 +719,11 @@ public class MemoryCodec extends Codec {
       return docCount;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return termCount;
+    }
     @Override
     public TermsEnum iterator() {
       return new FSTTermsEnum(field, fst);
@@ -741,7 +748,7 @@ public class MemoryCodec extends Codec {
         if (termCount == 0) {
           break;
         }
-        final TermsReader termsReader = new TermsReader(state.fieldInfos, in);
+        final TermsReader termsReader = new TermsReader(state.fieldInfos, in, termCount);
         fields.put(termsReader.field.name, termsReader);
       }
     } finally {


@@ -84,11 +84,11 @@ public class PreFlexCodec extends Codec {
   @Override
   public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocConsumer is not supported by Preflex codec");
+    return null;
   }
   @Override
   public PerDocValues docsProducer(SegmentReadState state) throws IOException {
-    throw new UnsupportedOperationException("PerDocValues is not supported by Preflex codec");
+    return null;
   }
 }


@@ -162,6 +162,11 @@ public class PreFlexFields extends FieldsProducer {
     return preTerms.get(field);
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return getTermsDict().size();
+  }
   synchronized private TermInfosReader getTermsDict() {
     if (tis != null) {
       return tis;
@@ -240,6 +245,11 @@ public class PreFlexFields extends FieldsProducer {
     }
   }
+  @Override
+  public long getUniqueTermCount() throws IOException {
+    return -1;
+  }
   @Override
   public long getSumTotalTermFreq() {
     return -1;


@@ -1201,7 +1201,7 @@ public class TestIndexReader extends LuceneTestCase
   // LUCENE-1586: getUniqueTermCount
   public void testUniqueTermCount() throws Exception {
     Directory dir = newDirectory();
-    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
     Document doc = new Document();
     doc.add(newField("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", TextField.TYPE_UNSTORED));
     doc.add(newField("number", "0 1 2 3 4 5 6 7 8 9", TextField.TYPE_UNSTORED));
@@ -1217,12 +1217,8 @@ public class TestIndexReader extends LuceneTestCase
     IndexReader r2 = IndexReader.openIfChanged(r);
     assertNotNull(r2);
     r.close();
-    try {
-      r2.getUniqueTermCount();
-      fail("expected exception");
-    } catch (UnsupportedOperationException uoe) {
-      // expected
-    }
+    assertEquals(-1, r2.getUniqueTermCount());
     IndexReader[] subs = r2.getSequentialSubReaders();
     for(int i=0;i<subs.length;i++) {
       assertEquals(36, subs[i].getUniqueTermCount());


@@ -37,7 +37,7 @@ public class TestRollingUpdates extends LuceneTestCase {
     CodecProvider provider = CodecProvider.getDefault();
     //provider.register(new MemoryCodec());
-    if (random.nextBoolean()) {
+    if ( (!"PreFlex".equals(provider.getDefaultFieldCodec())) && random.nextBoolean()) {
       provider.setFieldCodec("docid", "Memory");
     }


@@ -139,6 +139,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
       public int getDocCount() throws IOException {
         return -1;
       }
+      @Override
+      public long getUniqueTermCount() throws IOException {
+        return -1;
+      }
     });
     assert termsEnum != null;


@@ -203,11 +203,17 @@ public class SpoofIndexSearcher extends IndexSearcher {
     // ------------------------ Not implemented methods ------------------------
     @Override
     public TermsEnum iterator() throws IOException {
       return null;
     }
+    @Override
+    public long getUniqueTermCount() throws IOException {
+      return -1;
+    }
     @Override
     public Comparator<BytesRef> getComparator() throws IOException {
       return null;