add all-on-disk codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446911 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-02-16 16:21:00 +00:00
parent b69a50fb12
commit 33e60bf352
3 changed files with 114 additions and 23 deletions


@@ -304,6 +304,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
IndexInput data = this.data.clone();
data.seek(entry.offset);
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstances.put(field.number, ordIndexInstance);
}
ordIndex = ordIndexInstance;
}
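For orientation, the hunk above is only the tail of a lazy load of the ord index in DiskDocValuesProducer. A minimal sketch of the surrounding pattern, reconstructed from the visible lines (the null check and any synchronization are assumed, not shown in the hunk). Note the final false argument, which decodes the monotonic blocks into RAM; the all-on-disk producer below passes true instead, so the ord index stays on disk:

// sketch of the surrounding lazy-load-and-cache pattern (a reconstruction, not the verbatim method)
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
if (ordIndexInstance == null) {
  IndexInput data = this.data.clone();  // clone so callers never share a file pointer
  data.seek(entry.offset);
  // false: decode the monotonic blocks into RAM up front
  ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
  ordIndexInstances.put(field.number, ordIndexInstance);
}
ordIndex = ordIndexInstance;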


@@ -52,10 +52,11 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
* to this class.
*/
// nocommit: should only be Lucene40 and Lucene41
@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText", "CheapBastard" })
// nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1)
@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText" })
public class TestDemoDocValue extends LuceneTestCase {
-public void testOneValue() throws IOException {
+public void testSortedSetOneValue() throws IOException {
Directory directory = newDirectory();
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
@@ -80,7 +81,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoDocumentsMerged() throws IOException {
+public void testSortedSetTwoDocumentsMerged() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -122,7 +123,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoValues() throws IOException {
+public void testSortedSetTwoValues() throws IOException {
Directory directory = newDirectory();
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
@@ -152,7 +153,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoValuesUnordered() throws IOException {
+public void testSortedSetTwoValuesUnordered() throws IOException {
Directory directory = newDirectory();
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
@@ -182,7 +183,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testThreeValuesTwoDocs() throws IOException {
+public void testSortedSetThreeValuesTwoDocs() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -231,7 +232,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoDocumentsLastMissing() throws IOException {
+public void testSortedSetTwoDocumentsLastMissing() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -263,7 +264,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoDocumentsLastMissingMerge() throws IOException {
+public void testSortedSetTwoDocumentsLastMissingMerge() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -297,7 +298,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoDocumentsFirstMissing() throws IOException {
+public void testSortedSetTwoDocumentsFirstMissing() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -330,7 +331,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testTwoDocumentsFirstMissingMerge() throws IOException {
+public void testSortedSetTwoDocumentsFirstMissingMerge() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
@@ -364,7 +365,7 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
-public void testMergeAwayAllValues() throws IOException {
+public void testSortedSetMergeAwayAllValues() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);

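Every renamed test above follows the same shape: index a SortedSetDocValuesField, reopen, and walk the per-document ords. A minimal sketch of that shape (the method name testSortedSetSketch is hypothetical; getOnlySegmentReader is the usual LuceneTestCase helper):

public void testSortedSetSketch() throws IOException {
  Directory directory = newDirectory();
  RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
  Document doc = new Document();
  doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
  iwriter.addDocument(doc);
  DirectoryReader ireader = iwriter.getReader();
  iwriter.close();

  // single segment, single doc: ord 0 must round-trip back to "hello"
  SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
  dv.setDocument(0);
  assertEquals(0, dv.nextOrd());
  assertEquals(NO_MORE_ORDS, dv.nextOrd());
  BytesRef bytes = new BytesRef();
  dv.lookupOrd(0, bytes);
  assertEquals(new BytesRef("hello"), bytes);

  ireader.close();
  directory.close();
}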

@@ -41,6 +41,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
class CheapBastardDocValuesProducer extends DocValuesProducer {
private final Map<Integer,NumericEntry> numerics;
private final Map<Integer,NumericEntry> ords;
+private final Map<Integer,NumericEntry> ordIndexes;
private final Map<Integer,BinaryEntry> binaries;
private final IndexInput data;
@@ -55,6 +56,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
DiskDocValuesFormat.VERSION_START);
numerics = new HashMap<Integer,NumericEntry>();
ords = new HashMap<Integer,NumericEntry>();
+ordIndexes = new HashMap<Integer,NumericEntry>();
binaries = new HashMap<Integer,BinaryEntry>();
readFields(in);
success = true;
@@ -101,6 +103,36 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
}
NumericEntry n = readNumericEntry(meta);
ords.put(fieldNumber, n);
+} else if (type == DiskDocValuesFormat.SORTED_SET) {
+// sortedset = binary + numeric + ordIndex
+if (meta.readVInt() != fieldNumber) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+if (meta.readByte() != DiskDocValuesFormat.BINARY) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+BinaryEntry b = readBinaryEntry(meta);
+binaries.put(fieldNumber, b);
+if (meta.readVInt() != fieldNumber) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+NumericEntry n1 = readNumericEntry(meta);
+ords.put(fieldNumber, n1);
+if (meta.readVInt() != fieldNumber) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+}
+NumericEntry n2 = readNumericEntry(meta);
+ordIndexes.put(fieldNumber, n2);
} else {
throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta);
}
fieldNumber = meta.readVInt();
}
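Read in order, the new branch implies the following per-field layout in the docvalues metadata stream (a reconstruction from the reads above, not an authoritative format description):

// fieldNumber (vint), SORTED_SET (byte)              -- dispatched by the outer loop
// fieldNumber (vint), BINARY  (byte), BinaryEntry    -- the term bytes
// fieldNumber (vint), NUMERIC (byte), NumericEntry   -- the per-document ord stream
// fieldNumber (vint), NUMERIC (byte), NumericEntry   -- the ord index (one end offset per doc)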
@@ -135,15 +167,15 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
return getNumeric(field, entry);
}
-private NumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException {
+private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException {
final IndexInput data = this.data.clone();
data.seek(entry.offset);
final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-return new NumericDocValues() {
+return new LongNumericDocValues() {
@Override
-public long get(int docID) {
-return reader.get(docID);
+public long get(long id) {
+return reader.get(id);
}
};
}
@@ -161,10 +193,10 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
final IndexInput data = this.data.clone();
-return new BinaryDocValues() {
+return new LongBinaryDocValues() {
@Override
-public void get(int docID, BytesRef result) {
-long address = bytes.offset + docID * (long)bytes.maxLength;
+public void get(long id, BytesRef result) {
+long address = bytes.offset + id * bytes.maxLength;
try {
data.seek(address);
// NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource)
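In the fixed-length case above no address table is needed; the slot is computed directly. For example, with an assumed maxLength of 4, id 3 starts at bytes.offset + 12 and spans exactly 4 bytes.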
@@ -186,11 +218,11 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
data.seek(bytes.addressesOffset);
final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
-return new BinaryDocValues() {
+return new LongBinaryDocValues() {
@Override
-public void get(int docID, BytesRef result) {
-long startAddress = bytes.offset + (docID == 0 ? 0 : + addresses.get(docID-1));
-long endAddress = bytes.offset + addresses.get(docID);
+public void get(long id, BytesRef result) {
+long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1));
+long endAddress = bytes.offset + addresses.get(id);
int length = (int) (endAddress - startAddress);
try {
data.seek(startAddress);
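The addresses reader stores one cumulative end offset per value, so two adjacent entries bracket each value's byte range. A worked example with assumed lengths:

// assume three values of lengths 3, 5, 2 => the addresses stream holds 3, 8, 10
// id 0: [bytes.offset + 0, bytes.offset + 3)   -> length 3  (special-cased: there is no addresses.get(-1))
// id 1: [bytes.offset + 3, bytes.offset + 8)   -> length 5
// id 2: [bytes.offset + 8, bytes.offset + 10)  -> length 2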
@@ -234,7 +266,45 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
-throw new UnsupportedOperationException(); // nocommit
+final long valueCount = binaries.get(field.number).count;
+final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number));
+NumericEntry entry = ordIndexes.get(field.number);
+IndexInput data = this.data.clone();
+data.seek(entry.offset);
+final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
+return new SortedSetDocValues() {
+long offset;
+long endOffset;
+@Override
+public long nextOrd() {
+if (offset == endOffset) {
+return NO_MORE_ORDS;
+} else {
+long ord = ordinals.get(offset);
+offset++;
+return ord;
+}
+}
+@Override
+public void setDocument(int docID) {
+offset = (docID == 0 ? 0 : ordIndex.get(docID-1));
+endOffset = ordIndex.get(docID);
+}
+@Override
+public void lookupOrd(long ord, BytesRef result) {
+binary.get(ord, result);
+}
+@Override
+public long getValueCount() {
+return valueCount;
+}
+};
+}
}
@Override
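Callers drive the returned SortedSetDocValues one document at a time: setDocument() positions offset/endOffset via the ord index, nextOrd() streams ords until NO_MORE_ORDS, and lookupOrd() resolves a term. A consumption sketch, with dv and maxDoc assumed to be in scope:

// sketch: enumerate every (doc, term) pair
BytesRef scratch = new BytesRef();
for (int docID = 0; docID < maxDoc; docID++) {
  dv.setDocument(docID);  // loads offset/endOffset from the ord index
  long ord;
  while ((ord = dv.nextOrd()) != NO_MORE_ORDS) {
    dv.lookupOrd(ord, scratch);  // resolves the ord through the binary values
    // scratch now holds the term bytes for this ord
  }
}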
@@ -260,4 +330,23 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
int packedIntsVersion;
int blockSize;
}
+// internally we compose complex dv (sorted/sortedset) from other ones
+static abstract class LongNumericDocValues extends NumericDocValues {
+@Override
+public final long get(int docID) {
+return get((long) docID);
+}
+abstract long get(long id);
+}
+static abstract class LongBinaryDocValues extends BinaryDocValues {
+@Override
+public final void get(int docID, BytesRef result) {
+get((long)docID, result);
+}
+abstract void get(long id, BytesRef result);
+}
}
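The two bridge classes exist because ords are long-addressed (getSortedSet() above calls binary.get(ord, result) with a long ord), while the public NumericDocValues/BinaryDocValues API is int-addressed by docID. A small illustration of the widening delegation (a fragment; field and the cast are as in getSortedSet() above, and the literals are illustrative):

LongBinaryDocValues terms = (LongBinaryDocValues) getBinary(field);
BytesRef scratch = new BytesRef();
terms.get(5, scratch);            // public int path: the final get(int) widens to get(5L, scratch)
terms.get(8589934592L, scratch);  // long path used by lookupOrd(), beyond the int range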