ensure values are sent to the codec in sorted order: beef up AssertingCodec and add more simple tests

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1444653 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-02-11 03:43:12 +00:00
parent 2dcf80718c
commit 7bd948623e
3 changed files with 246 additions and 10 deletions


@@ -46,6 +46,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
private int currentDoc;
private int currentValues[] = new int[8];
private int currentUpto = 0;
private int maxCount = 0;
public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
@@ -83,7 +84,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
updateBytesUsed();
}
// finalize currentDoc
// finalize currentDoc: this deduplicates the current term ids
private void finishCurrentDoc() {
Arrays.sort(currentValues, 0, currentUpto);
int lastValue = -1;
@@ -99,6 +100,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
}
// record the number of unique ords for this doc
pendingCounts.add(count);
maxCount = Math.max(maxCount, count);
currentUpto = 0;
currentDoc++;
}
@@ -127,7 +129,9 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (currentUpto == currentValues.length) {
currentValues = ArrayUtil.grow(currentValues, currentValues.length+1);
iwBytesUsed.addAndGet((currentValues.length - currentUpto) * RamUsageEstimator.NUM_BYTES_INT);
// reserve additional space for max # values per-doc
// when flushing, we need an int[] to sort the mapped-ords within the doc
iwBytesUsed.addAndGet((currentValues.length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT);
}
currentValues[currentUpto] = ord;
@@ -143,7 +147,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
@Override
public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.getDocCount();
final int maxCountPerDoc = maxCount;
assert pendingCounts.size() == maxDoc;
final int valueCount = hash.size();
@@ -176,7 +180,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
return new OrdsIterator(ordMap);
return new OrdsIterator(ordMap, maxCountPerDoc);
}
});
}
@@ -221,11 +225,17 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
// iterates over the ords for each doc we have in ram
private class OrdsIterator implements Iterator<Number> {
final AppendingLongBuffer.Iterator iter = pending.iterator();
final AppendingLongBuffer.Iterator counts = pendingCounts.iterator();
final int ordMap[];
final long numOrds;
long ordUpto;
OrdsIterator(int ordMap[]) {
final int currentDoc[];
int currentUpto;
int currentLength;
OrdsIterator(int ordMap[], int maxCount) {
this.currentDoc = new int[maxCount];
this.ordMap = ordMap;
this.numOrds = pending.size();
}
@@ -240,10 +250,20 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (!hasNext()) {
throw new NoSuchElementException();
}
int ord = (int) iter.next();
if (currentUpto == currentLength) {
// refill next doc, and sort remapped ords within the doc.
currentUpto = 0;
currentLength = (int) counts.next();
for (int i = 0; i < currentLength; i++) {
currentDoc[i] = ordMap[(int) iter.next()];
}
Arrays.sort(currentDoc, 0, currentLength);
}
int ord = currentDoc[currentUpto];
currentUpto++;
ordUpto++;
// TODO: make reusable Number
return ordMap[ord];
return ord;
}
@Override
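
The writer change above has two halves: at index time, finishCurrentDoc() sorts and deduplicates the write-time ords for the current document; at flush time, the new OrdsIterator refills one document at a time, remaps each write-time ord through ordMap into sorted-term order, and re-sorts within the document, so the codec always receives each document's ords in ascending order. A minimal sketch of that per-document flush step, with hypothetical names (illustrative only, not the Lucene class itself):

// Illustrative sketch only: the per-document remap-and-sort that OrdsIterator
// performs when it refills its buffer. writeTimeOrds are the ords assigned in
// insertion order; ordMap translates them into ords over the sorted terms.
static int[] remapAndSortDocOrds(int[] writeTimeOrds, int[] ordMap) {
  int[] mapped = new int[writeTimeOrds.length];
  for (int i = 0; i < writeTimeOrds.length; i++) {
    mapped[i] = ordMap[writeTimeOrds[i]];
  }
  java.util.Arrays.sort(mapped); // codec contract: ascending ords within a doc
  return mapped;
}

This is also why the iwBytesUsed accounting above now reserves two ints per buffered slot: the flush needs an int[] scratch buffer (sized by maxCount) to hold and sort one document's mapped ords.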


@@ -163,6 +163,39 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
public void testTwoValuesUnordered() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, analyzer);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("world")));
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(1, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
dv.lookupOrd(1, bytes);
assertEquals(new BytesRef("world"), bytes);
ireader.close();
directory.close();
}
public void testThreeValuesTwoDocs() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
@@ -213,4 +246,146 @@ public class TestDemoDocValue extends LuceneTestCase {
ireader.close();
directory.close();
}
public void testTwoDocumentsLastMissing() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
doc = new Document();
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsLastMissingMerge() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.commit();
doc = new Document();
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsFirstMissing() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
iwriter.addDocument(doc);
doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(1, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsFirstMissingMerge() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
iwriter.addDocument(doc);
iwriter.commit();
doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(1, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
}
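
The four missing-document tests above are intentionally near-identical: they differ only in which of the two documents lacks the field and whether an intermediate commit() forces the documents into separate segments before forceMerge(1). A hedged sketch of a hypothetical helper (not part of this commit) that captures the shared shape, assuming it lives inside TestDemoDocValue with the same imports:

// Hypothetical helper, illustrative only: one valued doc, one empty doc,
// optionally split across segments before the forced merge.
private void assertSingleValueWithMissingDoc(boolean missingFirst, boolean twoSegments) throws IOException {
  Analyzer analyzer = new MockAnalyzer(random());
  Directory directory = newDirectory();
  IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
  iwconfig.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
  Document valued = new Document();
  valued.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
  Document empty = new Document();
  iwriter.addDocument(missingFirst ? empty : valued);
  if (twoSegments) {
    iwriter.commit(); // force the two docs into separate segments before merging
  }
  iwriter.addDocument(missingFirst ? valued : empty);
  iwriter.forceMerge(1);
  iwriter.close();
  DirectoryReader ireader = DirectoryReader.open(directory);
  SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
  OrdIterator oi = dv.getOrds(missingFirst ? 1 : 0, null); // only the valued doc has an ord
  assertEquals(0, oi.nextOrd());
  assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
  BytesRef bytes = new BytesRef();
  dv.lookupOrd(0, bytes);
  assertEquals(new BytesRef("hello"), bytes);
  assertEquals(1, dv.getValueCount());
  ireader.close();
  directory.close();
}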


@@ -130,12 +130,53 @@ public class AssertingDocValuesFormat extends DocValuesFormat {
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
// nocommit: add checks
long valueCount = 0;
BytesRef lastValue = null;
for (BytesRef b : values) {
assert b != null;
assert b.isValid();
if (valueCount > 0) {
assert b.compareTo(lastValue) > 0;
}
lastValue = BytesRef.deepCopyOf(b);
valueCount++;
}
int docCount = 0;
long ordCount = 0;
// nocommit
FixedBitSet seenOrds = new FixedBitSet((int)valueCount);
Iterator<Number> ordIterator = ords.iterator();
for (Number v : docToOrdCount) {
assert v != null;
int count = v.intValue();
assert count >= 0;
docCount++;
ordCount += count;
long lastOrd = -1;
for (int i = 0; i < count; i++) {
Number o = ordIterator.next();
assert o != null;
long ord = o.longValue();
assert ord >= 0 && ord < valueCount;
assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;
seenOrds.set((int)ord); // nocommit
lastOrd = ord;
}
}
assert ordIterator.hasNext() == false;
assert docCount == maxDoc;
assert seenOrds.cardinality() == valueCount;
checkIterator(values.iterator(), valueCount);
checkIterator(docToOrdCount.iterator(), maxDoc);
checkIterator(ords.iterator(), ordCount);
in.addSortedSetField(field, values, docToOrdCount, ords);
}
private <T> void checkIterator(Iterator<T> iterator, int expectedSize) {
for (int i = 0; i < expectedSize; i++) {
private <T> void checkIterator(Iterator<T> iterator, long expectedSize) {
for (long i = 0; i < expectedSize; i++) {
boolean hasNext = iterator.hasNext();
assert hasNext;
T v = iterator.next();
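
For reference, the contract that the strengthened AssertingDocValuesFormat checks enforce: docToOrdCount yields one count per document, ords yields exactly that many ords for each document in strictly increasing order, every ord lies in [0, valueCount), and every ord is referenced by at least one document (seenOrds.cardinality() == valueCount). A hedged sketch, using a hypothetical helper rather than anything in this commit, of a consumer pairing the two streams back up per document:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Illustrative only: rebuild the per-document ord lists from the two parallel
// streams whose invariants addSortedSetField now asserts.
static List<List<Long>> perDocOrds(Iterable<Number> docToOrdCount, Iterable<Number> ords) {
  List<List<Long>> result = new ArrayList<>();
  Iterator<Number> ordIt = ords.iterator();
  for (Number count : docToOrdCount) {
    List<Long> docOrds = new ArrayList<>();
    for (int i = 0; i < count.intValue(); i++) {
      docOrds.add(ordIt.next().longValue()); // already ascending within the doc
    }
    result.add(docOrds);
  }
  return result;
}

// Example: docToOrdCount = [2, 0, 1], ords = [0, 3, 2]
//   => doc0 -> [0, 3], doc1 -> [], doc2 -> [2]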