Clear some nocommits and clean up sorted-bytes merge a bit

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1441005 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-31 15:07:51 +00:00
parent 8a2a727873
commit 5b38004a55
5 changed files with 24 additions and 51 deletions

View File

@ -309,8 +309,8 @@ public abstract class DocValuesConsumer implements Closeable {
}
}
// nocommit we can unload the bits to disk to reduce
// transient ram spike...
// TODO: we can unload the bits/packed ints to disk to reduce
// transient ram spike... most of these just require iterators
}
// Second pass: merge only the live terms
@ -337,7 +337,11 @@ public abstract class DocValuesConsumer implements Closeable {
lastOrds[readerId] = sourceOrd;
top.ordDeltas.add(delta);
lastTerm = BytesRef.deepCopyOf(top.scratch);
if (lastTerm == null) {
lastTerm = BytesRef.deepCopyOf(top.scratch);
} else {
lastTerm.copyBytes(top.scratch);
}
ord++;
}
@ -360,28 +364,6 @@ public abstract class DocValuesConsumer implements Closeable {
state.liveTerms = null;
}
}
/*
public void finish(SortedDocValuesConsumer consumer) throws IOException {
// Third pass: write merged result
for(BytesRef term : mergedTerms) {
consumer.addValue(term);
}
for(SegmentState segState : segStates) {
Bits liveDocs = segState.reader.getLiveDocs();
int maxDoc = segState.reader.maxDoc();
for(int docID=0;docID<maxDoc;docID++) {
if (liveDocs == null || liveDocs.get(docID)) {
int segOrd = segState.values.getOrd(docID);
int mergedOrd = segState.segOrdToMergedOrd[segOrd];
consumer.addDoc(mergedOrd);
}
}
}
}
*/
}
/**
@ -472,7 +454,7 @@ public abstract class DocValuesConsumer implements Closeable {
}
assert nextIsSet;
nextIsSet = false;
// nocommit make a mutable number
// TODO make a mutable number
return nextValue;
}

View File

@ -763,8 +763,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
private FieldNumbers getFieldNumberMap() throws IOException {
final FieldNumbers map = new FieldNumbers();
// nocommit for a 4.0 index that has inconsistent DV
// types ... this will throw exc on init of IW?
for(SegmentInfoPerCommit info : segmentInfos) {
for(FieldInfo fi : getFieldInfos(info.info)) {
map.addOrGet(fi.name, fi.number, fi.getDocValuesType());

View File

@ -30,7 +30,8 @@ import org.apache.lucene.store.IndexInput;
*
* @lucene.internal
**/
// nocommit: make this simply a big ass array and nothing more.
// TODO: refactor this, byteblockpool, fst.bytestore, and any
// other "shift/mask big arrays". there are too many of these classes!
public final class PagedBytes {
private final List<byte[]> blocks = new ArrayList<byte[]>();
private final List<Integer> blockEnd = new ArrayList<Integer>();
@ -106,7 +107,7 @@ public final class PagedBytes {
*
* @lucene.internal
**/
// nocommit: move this shit and any other vint bogusness to fieldcacheimpl!
// TODO: this really needs to be refactored into fieldcacheimpl
public void fill(BytesRef b, long start) {
final int index = (int) (start >> blockBits);
final int offset = (int) (start & blockMask);
@ -217,7 +218,7 @@ public final class PagedBytes {
/** Copy bytes in, writing the length as a 1 or 2 byte
* vInt prefix. */
// nocommit: move this shit and any other vint bogusness to fieldcacheimpl!
// TODO: this really needs to be refactored into fieldcacheimpl!
public long copyUsingLengthPrefix(BytesRef bytes) {
if (bytes.length >= 32768) {
throw new IllegalArgumentException("max length is 32767 (got " + bytes.length + ")");

View File

@ -65,8 +65,8 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase {
}
// just a simple trivial test
// nocommit: if we are going to pass down suffixes to segmentread/writestate,
// then they should be respected by *all* codec apis!
// TODO: we should come up with a test that somehow checks that segment suffix
// is respected by all codec apis (not just docvalues and postings)
public void testTwoFieldsTwoFormats() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());

View File

@ -17,13 +17,12 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -34,7 +33,7 @@ import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@TimeoutSuite(millis = 80 * TimeUnits.HOUR)
@Ignore("takes ?? minutes")
@Ignore("very slow")
public class Test2BSortedDocValues extends LuceneTestCase {
// indexes Integer.MAX_VALUE docs with a fixed binary field
@ -94,10 +93,7 @@ public class Test2BSortedDocValues extends LuceneTestCase {
}
// indexes Integer.MAX_VALUE docs with a fixed binary field
// nocommit: this must be some kind of worst case for BytesRefHash / its hash fn...
// or there is some other perf bug...VERY slow!
// if you cut this test to use random.nextBytes its much faster, but still quite slow...
// and its not unrealistic for users to index something thats already in sorted order?
// TODO: must use random.nextBytes (like Test2BTerms) to avoid BytesRefHash probing issues
public void test2BOrds() throws Exception {
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BOrds"));
if (dir instanceof MockDirectoryWrapper) {
@ -118,11 +114,11 @@ public class Test2BSortedDocValues extends LuceneTestCase {
SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
doc.add(dvField);
long seed = random().nextLong();
Random random = new Random(seed);
for (int i = 0; i < Integer.MAX_VALUE; i++) {
bytes[0] = (byte)(i >> 24);
bytes[1] = (byte)(i >> 16);
bytes[2] = (byte)(i >> 8);
bytes[3] = (byte) i;
random.nextBytes(bytes);
w.addDocument(doc);
if (i % 100000 == 0) {
System.out.println("indexed: " + i);
@ -137,19 +133,15 @@ public class Test2BSortedDocValues extends LuceneTestCase {
System.out.flush();
DirectoryReader r = DirectoryReader.open(dir);
int expectedValue = 0;
random.setSeed(seed);
for (AtomicReaderContext context : r.leaves()) {
AtomicReader reader = context.reader();
BytesRef scratch = new BytesRef();
BinaryDocValues dv = reader.getSortedDocValues("dv");
for (int i = 0; i < reader.maxDoc(); i++) {
bytes[0] = (byte)(expectedValue >> 24);
bytes[1] = (byte)(expectedValue >> 16);
bytes[2] = (byte)(expectedValue >> 8);
bytes[3] = (byte) expectedValue;
random.nextBytes(bytes);
dv.get(i, scratch);
assertEquals(data, scratch);
expectedValue++;
}
}