LUCENE-1625: return more status details in CheckIndex, broken out by component into separate status classes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@788800 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-06-26 18:10:02 +00:00
parent 2735779bf6
commit 890d53acfb
3 changed files with 304 additions and 92 deletions

CHANGES.txt

@@ -276,6 +276,11 @@ Changes in runtime behavior
26. LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike
McCandless)
27. LUCENE-1625: CheckIndex's programmatic API now returns separate
classes detailing the status of each component in the index, and
includes more detailed status than previously. (Tim Smith via
Mike McCandless)
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
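
As a reference for the new programmatic API described in entry 27 above, here is a minimal, hedged sketch of a caller inspecting the per-component status. The FSDirectory.getDirectory factory and the segmentInfos / SegmentInfoStatus / name members are assumptions based on the CheckIndex API of this era; the per-component fields themselves match those exercised by TestCheckIndex below.

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.lucene.index.CheckIndex;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class CheckIndexExample {
      public static void main(String[] args) throws IOException {
        // FSDirectory.getDirectory was the directory factory of this era (an assumption here)
        Directory dir = FSDirectory.getDirectory(args[0]);
        CheckIndex checker = new CheckIndex(dir);
        CheckIndex.Status indexStatus = checker.checkIndex();
        System.out.println("clean=" + indexStatus.clean);
        // Each segment now carries separate per-component status objects
        Iterator it = indexStatus.segmentInfos.iterator();
        while (it.hasNext()) {
          CheckIndex.Status.SegmentInfoStatus seg =
              (CheckIndex.Status.SegmentInfoStatus) it.next();
          if (seg.termIndexStatus != null && seg.termIndexStatus.error == null) {
            System.out.println(seg.name + ": " + seg.termIndexStatus.termCount + " terms OK");
          }
        }
        dir.close();
      }
    }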

CheckIndex.java

@@ -179,6 +179,76 @@ public class CheckIndex {
* debugging details that IndexWriter records into
* each segment it creates */
public Map diagnostics;
/** Status for testing of field norms (null if field norms could not be tested). */
public FieldNormStatus fieldNormStatus;
/** Status for testing of indexed terms (null if indexed terms could not be tested). */
public TermIndexStatus termIndexStatus;
/** Status for testing of stored fields (null if stored fields could not be tested). */
public StoredFieldStatus storedFieldStatus;
/** Status for testing of term vectors (null if term vectors could not be tested). */
public TermVectorStatus termVectorStatus;
}
/**
* Status from testing field norms.
*/
public static final class FieldNormStatus {
/** Number of fields successfully tested */
public long totFields = 0L;
/** Exception thrown during field norm test (null on success) */
public Throwable error = null;
}
/**
* Status from testing term index.
*/
public static final class TermIndexStatus {
/** Total term count */
public long termCount = 0L;
/** Total frequency across all terms. */
public long totFreq = 0L;
/** Total number of positions. */
public long totPos = 0L;
/** Exception thrown during term index test (null on success) */
public Throwable error = null;
}
/**
* Status from testing stored fields.
*/
public static final class StoredFieldStatus {
/** Number of documents tested. */
public int docCount = 0;
/** Total number of stored fields tested. */
public long totFields = 0;
/** Exception thrown during stored fields test (null on success) */
public Throwable error = null;
}
/**
* Status from testing term vectors.
*/
public static final class TermVectorStatus {
/** Number of documents tested. */
public int docCount = 0;
/** Total number of term vectors tested. */
public long totVectors = 0;
/** Exception thrown during term vector test (null on success) */
public Throwable error = null;
}
}
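
All four status classes share the same shape: progress counters plus an error Throwable that is null on success. That lets a caller gate on every component of a segment at once. A hedged sketch follows; segmentClean is a hypothetical helper, not part of this patch:

    // Hypothetical convenience helper (not in the patch): true only if every
    // component of the segment was tested and passed.
    static boolean segmentClean(CheckIndex.Status.SegmentInfoStatus seg) {
      return seg.fieldNormStatus   != null && seg.fieldNormStatus.error   == null
          && seg.termIndexStatus   != null && seg.termIndexStatus.error   == null
          && seg.storedFieldStatus != null && seg.storedFieldStatus.error == null
          && seg.termVectorStatus  != null && seg.termVectorStatus.error  == null;
    }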
@@ -443,107 +513,38 @@ public class CheckIndex {
if (reader.maxDoc() != info.docCount)
throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);
if (infoStream != null)
infoStream.print(" test: fields, norms.......");
// Test getFieldNames()
if (infoStream != null) {
infoStream.print(" test: fields..............");
}
Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
Iterator it = fieldNames.iterator();
final byte[] b = new byte[reader.maxDoc()];
while(it.hasNext()) {
final String fieldName = (String) it.next();
reader.norms(fieldName, b, 0);
}
msg("OK [" + fieldNames.size() + " fields]");
segInfoStat.numFields = fieldNames.size();
if (infoStream != null)
infoStream.print(" test: terms, freq, prox...");
final TermEnum termEnum = reader.terms();
final TermPositions termPositions = reader.termPositions();
// Test Field Norms
segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);
// Used only to count up # deleted docs for this
// term
final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);
// Test the Term Index
segInfoStat.termIndexStatus = testTermIndex(info, reader);
long termCount = 0;
long totFreq = 0;
long totPos = 0;
final int maxDoc = reader.maxDoc();
// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
while(termEnum.next()) {
termCount++;
final Term term = termEnum.term();
final int docFreq = termEnum.docFreq();
termPositions.seek(term);
int lastDoc = -1;
int freq0 = 0;
totFreq += docFreq;
while(termPositions.next()) {
freq0++;
final int doc = termPositions.doc();
final int freq = termPositions.freq();
if (doc <= lastDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
if (doc >= maxDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
// Test Term Vectors
segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);
lastDoc = doc;
if (freq <= 0)
throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
int lastPos = -1;
totPos += freq;
for(int j=0;j<freq;j++) {
final int pos = termPositions.nextPosition();
if (pos < -1)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
if (pos < lastPos)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
}
}
// Now count how many deleted docs occurred in
// this term:
final int delCount;
if (reader.hasDeletions()) {
myTermDocs.seek(term);
while(myTermDocs.next()) {
}
delCount = myTermDocs.delCount;
} else
delCount = 0;
if (freq0 + delCount != docFreq)
throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
// Rethrow the first exception we encountered
// This will cause stats for failed segments to be incremented properly
if (segInfoStat.fieldNormStatus.error != null) {
throw new RuntimeException("Field Norm test failed");
} else if (segInfoStat.termIndexStatus.error != null) {
throw new RuntimeException("Term Index test failed");
} else if (segInfoStat.storedFieldStatus.error != null) {
throw new RuntimeException("Stored Field test failed");
} else if (segInfoStat.termVectorStatus.error != null) {
throw new RuntimeException("Term Vector test failed");
}
msg("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");
if (infoStream != null)
infoStream.print(" test: stored fields.......");
int docCount = 0;
long totFields = 0;
for(int j=0;j<info.docCount;j++)
if (!reader.isDeleted(j)) {
docCount++;
Document doc = reader.document(j);
totFields += doc.getFields().size();
}
if (docCount != reader.numDocs())
throw new RuntimeException("docCount=" + docCount + " but saw " + docCount + " undeleted docs");
msg("OK [" + totFields + " total field count; avg " + nf.format((((float) totFields)/docCount)) + " fields per doc]");
if (infoStream != null)
infoStream.print(" test: term vectors........");
int totVectors = 0;
for(int j=0;j<info.docCount;j++)
if (!reader.isDeleted(j)) {
TermFreqVector[] tfv = reader.getTermFreqVectors(j);
if (tfv != null)
totVectors += tfv.length;
}
msg("OK [" + totVectors + " total vector count; avg " + nf.format((((float) totVectors)/docCount)) + " term/freq vector fields per doc]");
msg("");
} catch (Throwable t) {
@@ -574,7 +575,191 @@ public class CheckIndex {
return result;
}
/**
* Test field norms.
*/
private Status.FieldNormStatus testFieldNorms(Collection fieldNames, SegmentReader reader) {
final Status.FieldNormStatus status = new Status.FieldNormStatus();
try {
// Test Field Norms
if (infoStream != null) {
infoStream.print(" test: field norms.........");
}
Iterator it = fieldNames.iterator();
final byte[] b = new byte[reader.maxDoc()];
while (it.hasNext()) {
final String fieldName = (String) it.next();
reader.norms(fieldName, b, 0);
++status.totFields;
}
msg("OK [" + status.totFields + " fields]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test the term index.
*/
private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
final Status.TermIndexStatus status = new Status.TermIndexStatus();
try {
if (infoStream != null) {
infoStream.print(" test: terms, freq, prox...");
}
final TermEnum termEnum = reader.terms();
final TermPositions termPositions = reader.termPositions();
// Used only to count up # deleted docs for this term
final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);
final int maxDoc = reader.maxDoc();
while (termEnum.next()) {
status.termCount++;
final Term term = termEnum.term();
final int docFreq = termEnum.docFreq();
termPositions.seek(term);
int lastDoc = -1;
int freq0 = 0;
status.totFreq += docFreq;
while (termPositions.next()) {
freq0++;
final int doc = termPositions.doc();
final int freq = termPositions.freq();
if (doc <= lastDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
if (doc >= maxDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
lastDoc = doc;
if (freq <= 0)
throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
int lastPos = -1;
status.totPos += freq;
for(int j=0;j<freq;j++) {
final int pos = termPositions.nextPosition();
if (pos < -1)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
if (pos < lastPos)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
}
}
// Now count how many deleted docs occurred in
// this term:
final int delCount;
if (reader.hasDeletions()) {
myTermDocs.seek(term);
while(myTermDocs.next()) { }
delCount = myTermDocs.delCount;
} else {
delCount = 0;
}
if (freq0 + delCount != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" +
docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
}
}
msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test stored fields for a segment.
*/
private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
try {
if (infoStream != null) {
infoStream.print(" test: stored fields.......");
}
// Scan stored fields for all documents
for (int j = 0; j < info.docCount; ++j) {
if (!reader.isDeleted(j)) {
status.docCount++;
Document doc = reader.document(j);
status.totFields += doc.getFields().size();
}
}
// Validate docCount
if (status.docCount != reader.numDocs()) {
throw new RuntimeException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
}
msg("OK [" + status.totFields + " total field count; avg " +
format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test term vectors for a segment.
*/
private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
final Status.TermVectorStatus status = new Status.TermVectorStatus();
try {
if (infoStream != null) {
infoStream.print(" test: term vectors........");
}
for (int j = 0; j < info.docCount; ++j) {
if (!reader.isDeleted(j)) {
status.docCount++;
TermFreqVector[] tfv = reader.getTermFreqVectors(j);
if (tfv != null) {
status.totVectors += tfv.length;
}
}
}
msg("OK [" + status.totVectors + " total vector count; avg " +
format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/** Repairs the index using previously returned result
* from {@link #checkIndex}. Note that this does not
* remove any of the unreferenced files after it's done;
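
The javadoc above is the start of the repair entry point (fixIndex(Status) in this era of CheckIndex; the method name is an inference here, since the hunk is cut off before the signature). A hedged sketch of the check-then-repair flow; note that repairing writes a new segments file that drops unrecoverable segments, losing the documents they contained:

    CheckIndex checker = new CheckIndex(dir);          // dir: an open Directory
    CheckIndex.Status status = checker.checkIndex();
    if (!status.clean) {
      // Removes references to the broken segments; their documents are lost.
      checker.fixIndex(status);
    }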

TestCheckIndex.java

@@ -50,6 +50,7 @@ public class TestCheckIndex extends LuceneTestCase {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(new PrintStream(bos));
//checker.setInfoStream(System.out);
CheckIndex.Status indexStatus = checker.checkIndex();
if (indexStatus.clean == false) {
System.out.println("CheckIndex failed");
@@ -61,6 +62,27 @@ public class TestCheckIndex extends LuceneTestCase {
assertTrue(seg.openReaderPassed);
assertNotNull(seg.diagnostics);
assertNotNull(seg.fieldNormStatus);
assertNull(seg.fieldNormStatus.error);
assertEquals(1, seg.fieldNormStatus.totFields);
assertNotNull(seg.termIndexStatus);
assertNull(seg.termIndexStatus.error);
assertEquals(1, seg.termIndexStatus.termCount);
assertEquals(19, seg.termIndexStatus.totFreq);
assertEquals(18, seg.termIndexStatus.totPos);
assertNotNull(seg.storedFieldStatus);
assertNull(seg.storedFieldStatus.error);
assertEquals(18, seg.storedFieldStatus.docCount);
assertEquals(18, seg.storedFieldStatus.totFields);
assertNotNull(seg.termVectorStatus);
assertNull(seg.termVectorStatus.error);
assertEquals(18, seg.termVectorStatus.docCount);
assertEquals(18, seg.termVectorStatus.totVectors);
assertTrue(seg.diagnostics.size() > 0);
final List onlySegments = new ArrayList();
onlySegments.add("_0");