mirror of https://github.com/apache/lucene.git

LUCENE-5159: prefix-code the sorted/sortedset value dictionaries in DiskDV

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512543 13f79535-47bb-0310-9956-ffa450edef68

commit 9cb38dd42a
parent b2070fefbc
@@ -168,6 +168,9 @@ Optimizations
 * LUCENE-5150: Make WAH8DocIdSet able to inverse its encoding in order to
   compress dense sets efficiently as well. (Adrien Grand)
 
+* LUCENE-5159: Prefix-code the sorted/sortedset value dictionaries in DiskDV.
+  (Robert Muir)
+
 Documentation
 
 * LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into
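The gist of the change: because a sorted/sortedset value dictionary is written in sorted order, consecutive terms tend to share long prefixes, so each term can be stored as a (shared-prefix length, suffix) pair instead of its full bytes. A minimal standalone sketch of that encoding, with illustrative names only (this is not the DiskDV code itself):

import java.nio.charset.StandardCharsets;

// Minimal sketch of prefix-coding a sorted dictionary: each entry stores how
// many leading bytes it shares with the previous entry, plus its new suffix.
public class PrefixCodeSketch {
  public static void main(String[] args) {
    String[] sorted = { "index", "indexed", "indexing", "inverted" };
    byte[] last = new byte[0];
    for (String s : sorted) {
      byte[] cur = s.getBytes(StandardCharsets.UTF_8);
      int shared = 0;
      while (shared < last.length && shared < cur.length && last[shared] == cur[shared]) {
        shared++;
      }
      // On disk this would be writeVInt(shared), writeVInt(cur.length - shared),
      // writeBytes(suffix); here we just print the encoding.
      System.out.println(shared + "+" + s.substring(shared));
      last = cur;
    }
    // prints: 0+index, 5+ed, 5+ing, 2+verted
  }
}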
@@ -27,9 +27,11 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.BlockPackedWriter;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
@@ -38,6 +40,7 @@ import org.apache.lucene.util.packed.PackedInts;
 public class DiskDocValuesConsumer extends DocValuesConsumer {
 
   static final int BLOCK_SIZE = 16384;
+  static final int ADDRESS_INTERVAL = 16;
 
   /** Compressed using packed blocks of ints. */
   public static final int DELTA_COMPRESSED = 0;
@@ -45,6 +48,13 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
   public static final int GCD_COMPRESSED = 1;
   /** Compressed by giving IDs to unique values. */
   public static final int TABLE_COMPRESSED = 2;
+
+  /** Uncompressed binary, written directly (fixed length). */
+  public static final int BINARY_FIXED_UNCOMPRESSED = 0;
+  /** Uncompressed binary, written directly (variable length). */
+  public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
+  /** Compressed binary with shared prefixes */
+  public static final int BINARY_PREFIX_COMPRESSED = 2;
 
   final IndexOutput data, meta;
   final int maxDoc;
@@ -173,7 +183,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
   }
 
   @Override
-  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+  public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
     // write the byte[] data
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.BINARY);
@@ -187,6 +197,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
       data.writeBytes(v.bytes, v.offset, v.length);
       count++;
     }
+    meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
     meta.writeVInt(minLength);
     meta.writeVInt(maxLength);
     meta.writeVLong(count);
@@ -208,12 +219,68 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
       writer.finish();
     }
   }
 
+  protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+    // first check if its a "fixed-length" terms dict
+    int minLength = Integer.MAX_VALUE;
+    int maxLength = Integer.MIN_VALUE;
+    for (BytesRef v : values) {
+      minLength = Math.min(minLength, v.length);
+      maxLength = Math.max(maxLength, v.length);
+    }
+    if (minLength == maxLength) {
+      // no index needed: direct addressing by mult
+      addBinaryField(field, values);
+    } else {
+      // header
+      meta.writeVInt(field.number);
+      meta.writeByte(DiskDocValuesFormat.BINARY);
+      meta.writeVInt(BINARY_PREFIX_COMPRESSED);
+      // now write the bytes: sharing prefixes within a block
+      final long startFP = data.getFilePointer();
+      // currently, we have to store the delta from expected for every 1/nth term
+      // we could avoid this, but its not much and less overall RAM than the previous approach!
+      RAMOutputStream addressBuffer = new RAMOutputStream();
+      MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
+      BytesRef lastTerm = new BytesRef();
+      long count = 0;
+      for (BytesRef v : values) {
+        if (count % ADDRESS_INTERVAL == 0) {
+          termAddresses.add(data.getFilePointer() - startFP);
+          // force the first term in a block to be abs-encoded
+          lastTerm.length = 0;
+        }
+
+        // prefix-code
+        int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
+        data.writeVInt(sharedPrefix);
+        data.writeVInt(v.length - sharedPrefix);
+        data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
+        lastTerm.copyBytes(v);
+        count++;
+      }
+      final long indexStartFP = data.getFilePointer();
+      // write addresses of indexed terms
+      termAddresses.finish();
+      addressBuffer.writeTo(data);
+      addressBuffer = null;
+      termAddresses = null;
+      meta.writeVInt(minLength);
+      meta.writeVInt(maxLength);
+      meta.writeVLong(count);
+      meta.writeLong(startFP);
+      meta.writeVInt(ADDRESS_INTERVAL);
+      meta.writeLong(indexStartFP);
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      meta.writeVInt(BLOCK_SIZE);
+    }
+  }
+
   @Override
   public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.SORTED);
-    addBinaryField(field, values);
+    addTermsDict(field, values);
     addNumericField(field, docToOrd, false);
   }
 
@@ -222,7 +289,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.SORTED_SET);
     // write the ord -> byte[] as a binary field
-    addBinaryField(field, values);
+    addTermsDict(field, values);
     // write the stream of ords as a numeric field
     // NOTE: we could return an iterator that delta-encodes these within a doc
     addNumericField(field, ords, false);
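addTermsDict above records an address for every ADDRESS_INTERVAL-th (16th) term, so a reader can jump near any ord and scan forward from the nearest absolute-encoded term. A small sketch of the arithmetic, mirroring the size computation the producer uses when loading that index (illustrative names, not Lucene API):

// How many index entries does the term index hold? One address per
// ADDRESS_INTERVAL terms, rounded up -- mirroring the reader's computation.
public class AddressIndexSizeSketch {
  static long indexSize(long count, long interval) {
    return (count % interval == 0) ? count / interval : 1 + count / interval;
  }
  public static void main(String[] args) {
    System.out.println(indexSize(100, 16)); // 7: terms 0,16,32,48,64,80,96 are indexed
    System.out.println(indexSize(32, 16));  // 2: terms 0 and 16
  }
}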
@@ -53,7 +53,8 @@ public final class DiskDocValuesFormat extends DocValuesFormat {
   public static final String META_CODEC = "DiskDocValuesMetadata";
   public static final String META_EXTENSION = "dvdm";
   public static final int VERSION_START = 0;
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_COMPRESSED_TERMS = 1;
+  public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS;
   public static final byte NUMERIC = 0;
   public static final byte BINARY = 1;
   public static final byte SORTED = 2;
@@ -21,7 +21,12 @@ import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED;
 import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED;
 import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED;
+
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED;
 
 import java.io.IOException;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -29,6 +34,8 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
@@ -36,7 +43,10 @@ import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.packed.BlockPackedReader;
@@ -62,7 +72,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
     final int version;
     try {
       version = CodecUtil.checkHeader(in, metaCodec,
-                                      DiskDocValuesFormat.VERSION_START,
+                                      DiskDocValuesFormat.VERSION_CURRENT,
                                       DiskDocValuesFormat.VERSION_CURRENT);
       numerics = new HashMap<Integer,NumericEntry>();
       ords = new HashMap<Integer,NumericEntry>();
@@ -84,7 +94,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.openInput(dataName, state.context);
       final int version2 = CodecUtil.checkHeader(data, dataCodec,
-                                                 DiskDocValuesFormat.VERSION_START,
+                                                 DiskDocValuesFormat.VERSION_CURRENT,
                                                  DiskDocValuesFormat.VERSION_CURRENT);
       if (version != version2) {
         throw new CorruptIndexException("Format versions mismatch");
@@ -196,14 +206,27 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
   static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
     BinaryEntry entry = new BinaryEntry();
+    entry.format = meta.readVInt();
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
     entry.count = meta.readVLong();
     entry.offset = meta.readLong();
-    if (entry.minLength != entry.maxLength) {
-      entry.addressesOffset = meta.readLong();
-      entry.packedIntsVersion = meta.readVInt();
-      entry.blockSize = meta.readVInt();
+    switch(entry.format) {
+      case BINARY_FIXED_UNCOMPRESSED:
+        break;
+      case BINARY_PREFIX_COMPRESSED:
+        entry.addressInterval = meta.readVInt();
+        entry.addressesOffset = meta.readLong();
+        entry.packedIntsVersion = meta.readVInt();
+        entry.blockSize = meta.readVInt();
+        break;
+      case BINARY_VARIABLE_UNCOMPRESSED:
+        entry.addressesOffset = meta.readLong();
+        entry.packedIntsVersion = meta.readVInt();
+        entry.blockSize = meta.readVInt();
+        break;
+      default:
+        throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
     }
     return entry;
   }
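For orientation, the per-field binary metadata that readBinaryEntry above decodes amounts to the following layout (a summary derived from the reads in the code, not an official format spec):

  format (vint), minLength (vint), maxLength (vint), count (vlong), offset (long)
  then, by format:
    BINARY_FIXED_UNCOMPRESSED: nothing further
    BINARY_VARIABLE_UNCOMPRESSED: addressesOffset (long), packedIntsVersion (vint), blockSize (vint)
    BINARY_PREFIX_COMPRESSED: addressInterval (vint), addressesOffset (long), packedIntsVersion (vint), blockSize (vint)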
@@ -255,10 +278,15 @@ class DiskDocValuesProducer extends DocValuesProducer {
   @Override
   public BinaryDocValues getBinary(FieldInfo field) throws IOException {
     BinaryEntry bytes = binaries.get(field.number);
-    if (bytes.minLength == bytes.maxLength) {
-      return getFixedBinary(field, bytes);
-    } else {
-      return getVariableBinary(field, bytes);
+    switch(bytes.format) {
+      case BINARY_FIXED_UNCOMPRESSED:
+        return getFixedBinary(field, bytes);
+      case BINARY_VARIABLE_UNCOMPRESSED:
+        return getVariableBinary(field, bytes);
+      case BINARY_PREFIX_COMPRESSED:
+        return getCompressedBinary(field, bytes);
+      default:
+        throw new AssertionError();
     }
   }
 
@@ -321,6 +349,30 @@ class DiskDocValuesProducer extends DocValuesProducer {
     };
   }
 
+  private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+    final IndexInput data = this.data.clone();
+    final long interval = bytes.addressInterval;
+
+    final MonotonicBlockPackedReader addresses;
+    synchronized (addressInstances) {
+      MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
+      if (addrInstance == null) {
+        data.seek(bytes.addressesOffset);
+        final long size;
+        if (bytes.count % interval == 0) {
+          size = bytes.count / interval;
+        } else {
+          size = 1L + bytes.count / interval;
+        }
+        addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
+        addressInstances.put(field.number, addrInstance);
+      }
+      addresses = addrInstance;
+    }
+
+    return new CompressedBinaryDocValues(bytes, addresses, data);
+  }
+
   @Override
   public SortedDocValues getSorted(FieldInfo field) throws IOException {
     final int valueCount = (int) binaries.get(field.number).count;
@@ -346,6 +398,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
       public int getValueCount() {
         return valueCount;
       }
+
+      @Override
+      public int lookupTerm(BytesRef key) {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key);
+        } else {
+          return super.lookupTerm(key);
+        }
+      }
+
+      @Override
+      public TermsEnum termsEnum() {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).getTermsEnum();
+        } else {
+          return super.termsEnum();
+        }
+      }
     };
   }
 
@@ -399,6 +469,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
       public long getValueCount() {
         return valueCount;
       }
+
+      @Override
+      public long lookupTerm(BytesRef key) {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).lookupTerm(key);
+        } else {
+          return super.lookupTerm(key);
+        }
+      }
+
+      @Override
+      public TermsEnum termsEnum() {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).getTermsEnum();
+        } else {
+          return super.termsEnum();
+        }
+      }
     };
   }
 
@@ -423,10 +511,12 @@ class DiskDocValuesProducer extends DocValuesProducer {
   static class BinaryEntry {
     long offset;
 
+    int format;
     long count;
     int minLength;
     int maxLength;
     long addressesOffset;
+    long addressInterval;
     int packedIntsVersion;
     int blockSize;
   }
@@ -449,4 +539,204 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
     abstract void get(long id, BytesRef Result);
   }
+
+  // in the compressed case, we add a few additional operations for
+  // more efficient reverse lookup and enumeration
+  static class CompressedBinaryDocValues extends LongBinaryDocValues {
+    final BinaryEntry bytes;
+    final long interval;
+    final long numValues;
+    final long numIndexValues;
+    final MonotonicBlockPackedReader addresses;
+    final IndexInput data;
+    final TermsEnum termsEnum;
+
+    public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException {
+      this.bytes = bytes;
+      this.interval = bytes.addressInterval;
+      this.addresses = addresses;
+      this.data = data;
+      this.numValues = bytes.count;
+      this.numIndexValues = addresses.size();
+      this.termsEnum = getTermsEnum(data);
+    }
+
+    @Override
+    public void get(long id, BytesRef result) {
+      try {
+        termsEnum.seekExact(id);
+        BytesRef term = termsEnum.term();
+        result.bytes = term.bytes;
+        result.offset = term.offset;
+        result.length = term.length;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    long lookupTerm(BytesRef key) {
+      try {
+        SeekStatus status = termsEnum.seekCeil(key);
+        if (status == SeekStatus.END) {
+          return -numValues-1;
+        } else if (status == SeekStatus.FOUND) {
+          return termsEnum.ord();
+        } else {
+          return -termsEnum.ord()-1;
+        }
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+    }
+
+    TermsEnum getTermsEnum() {
+      try {
+        return getTermsEnum(data.clone());
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private TermsEnum getTermsEnum(final IndexInput input) throws IOException {
+      input.seek(bytes.offset);
+
+      return new TermsEnum() {
+        private long currentOrd = -1;
+        // TODO: maxLength is negative when all terms are merged away...
+        private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength);
+        private final BytesRef term = new BytesRef(); // TODO: paranoia?
+
+        @Override
+        public BytesRef next() throws IOException {
+          if (doNext() == null) {
+            return null;
+          } else {
+            setTerm();
+            return term;
+          }
+        }
+
+        private BytesRef doNext() throws IOException {
+          if (++currentOrd >= numValues) {
+            return null;
+          } else {
+            int start = input.readVInt();
+            int suffix = input.readVInt();
+            input.readBytes(termBuffer.bytes, start, suffix);
+            termBuffer.length = start + suffix;
+            return termBuffer;
+          }
+        }
+
+        @Override
+        public SeekStatus seekCeil(BytesRef text) throws IOException {
+          // binary-search just the index values to find the block,
+          // then scan within the block
+          long low = 0;
+          long high = numIndexValues-1;
+
+          while (low <= high) {
+            long mid = (low + high) >>> 1;
+            doSeek(mid * interval);
+            int cmp = termBuffer.compareTo(text);
+
+            if (cmp < 0) {
+              low = mid + 1;
+            } else if (cmp > 0) {
+              high = mid - 1;
+            } else {
+              // we got lucky, found an indexed term
+              setTerm();
+              return SeekStatus.FOUND;
+            }
+          }
+
+          if (numIndexValues == 0) {
+            return SeekStatus.END;
+          }
+
+          // block before insertion point
+          long block = low-1;
+          doSeek(block < 0 ? -1 : block * interval);
+
+          while (doNext() != null) {
+            int cmp = termBuffer.compareTo(text);
+            if (cmp == 0) {
+              setTerm();
+              return SeekStatus.FOUND;
+            } else if (cmp > 0) {
+              setTerm();
+              return SeekStatus.NOT_FOUND;
+            }
+          }
+
+          return SeekStatus.END;
+        }
+
+        @Override
+        public void seekExact(long ord) throws IOException {
+          doSeek(ord);
+          setTerm();
+        }
+
+        private void doSeek(long ord) throws IOException {
+          long block = ord / interval;
+
+          if (ord >= currentOrd && block == currentOrd / interval) {
+            // seek within current block
+          } else {
+            // position before start of block
+            currentOrd = ord - ord % interval - 1;
+            input.seek(bytes.offset + addresses.get(block));
+          }
+
+          while (currentOrd < ord) {
+            doNext();
+          }
+        }
+
+        private void setTerm() {
+          // TODO: is there a cleaner way
+          term.bytes = new byte[termBuffer.length];
+          term.offset = 0;
+          term.copyBytes(termBuffer);
+        }
+
+        @Override
+        public BytesRef term() throws IOException {
+          return term;
+        }
+
+        @Override
+        public long ord() throws IOException {
+          return currentOrd;
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() {
+          return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
+
+        @Override
+        public int docFreq() throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public long totalTermFreq() throws IOException {
+          return -1;
+        }
+
+        @Override
+        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+  }
 }
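The TermsEnum above never materializes the dictionary: seekCeil binary-searches only the indexed terms (every addressInterval-th ord, which are absolute-encoded), then prefix-decodes forward within a single block. A self-contained sketch of that two-phase seek over an in-memory sorted array (illustrative and simplified: no prefix decoding, index entry every INTERVAL-th element; not Lucene API):

import java.util.Arrays;

// Binary-search only the sparse index, then scan forward within the block.
public class SparseIndexSeekSketch {
  static final int INTERVAL = 4;

  // returns the ord of the first value >= key, or -1 if none
  static int seekCeil(String[] sorted, String key) {
    // binary-search the indexed ords: 0, INTERVAL, 2*INTERVAL, ...
    int numIndexed = (sorted.length + INTERVAL - 1) / INTERVAL;
    int low = 0, high = numIndexed - 1;
    while (low <= high) {
      int mid = (low + high) >>> 1;
      int cmp = sorted[mid * INTERVAL].compareTo(key);
      if (cmp < 0) low = mid + 1;
      else if (cmp > 0) high = mid - 1;
      else return mid * INTERVAL; // found an indexed term
    }
    // scan the block before the insertion point
    int start = Math.max(0, (low - 1) * INTERVAL);
    for (int ord = start; ord < sorted.length; ord++) {
      if (sorted[ord].compareTo(key) >= 0) return ord;
    }
    return -1; // past the end (SeekStatus.END)
  }

  public static void main(String[] args) {
    String[] terms = { "ant", "bee", "cat", "dog", "eel", "fox", "gnu", "hen", "owl" };
    System.out.println(seekCeil(terms, "cow")); // 3 ("dog")
    System.out.println(seekCeil(terms, "eel")); // 4 (exact hit on an indexed term)
    System.out.println(seekCeil(terms, "zzz")); // -1 (END)
  }
}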
@@ -78,5 +78,10 @@ public final class MonotonicBlockPackedReader {
     final int idx = (int) (index & blockMask);
     return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
   }
 
+  /** Returns the number of values */
+  public long size() {
+    return valueCount;
+  }
+
 }
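The new size() accessor sits next to the decode formula shown in context above: value = minValue + (long)(idx * average) + zigZagDecode(delta), i.e. each block stores a base, a slope, and zig-zag-encoded deviations from that line. A worked example with made-up numbers (zigZagDecode is defined inline here for self-containment):

// Worked example of the monotonic block-packed decode formula.
public class MonotonicDecodeSketch {
  static long zigZagDecode(long n) {
    return (n >>> 1) ^ -(n & 1); // 0,1,2,3,... -> 0,-1,1,-2,...
  }
  public static void main(String[] args) {
    long minValue = 100;   // smallest value in the block
    float average = 2.5f;  // expected slope within the block
    // stored packed deltas are zig-zag encoded deviations from the line
    long[] packed = { 0, 1, 2, 4 }; // decode to 0, -1, 1, 2
    for (int idx = 0; idx < packed.length; idx++) {
      long value = minValue + (long) (idx * average) + zigZagDecode(packed[idx]);
      System.out.println(value); // 100, 101, 106, 109
    }
  }
}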
@@ -24,8 +24,10 @@ import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * DocValues format that keeps everything on disk.
@@ -53,7 +55,13 @@ public final class CheapBastardDocValuesFormat extends DocValuesFormat {
     return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC,
                                      DiskDocValuesFormat.DATA_EXTENSION,
                                      DiskDocValuesFormat.META_CODEC,
-                                     DiskDocValuesFormat.META_EXTENSION);
+                                     DiskDocValuesFormat.META_EXTENSION) {
+      // don't ever write an index, we dont want to use RAM :)
+      @Override
+      protected void addTermsDict(FieldInfo field, Iterable<BytesRef> values) throws IOException {
+        addBinaryField(field, values);
+      }
+    };
   }
 
   @Override
@@ -27,6 +27,7 @@ import java.util.Map;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
@@ -58,7 +59,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
     final int version;
     try {
       version = CodecUtil.checkHeader(in, metaCodec,
-                                      DiskDocValuesFormat.VERSION_START,
+                                      DiskDocValuesFormat.VERSION_CURRENT,
                                       DiskDocValuesFormat.VERSION_CURRENT);
       numerics = new HashMap<Integer,NumericEntry>();
       ords = new HashMap<Integer,NumericEntry>();
@@ -80,7 +81,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.openInput(dataName, state.context);
       final int version2 = CodecUtil.checkHeader(data, dataCodec,
-                                                 DiskDocValuesFormat.VERSION_START,
+                                                 DiskDocValuesFormat.VERSION_CURRENT,
                                                  DiskDocValuesFormat.VERSION_CURRENT);
       if (version != version2) {
         throw new CorruptIndexException("Versions mismatch");
@@ -193,6 +194,10 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
 
   static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
     BinaryEntry entry = new BinaryEntry();
+    int format = meta.readVInt();
+    if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
+      throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
+    }
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
     entry.count = meta.readVLong();
@@ -1350,6 +1350,57 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     dir.close();
   }
 
+  private void doTestSortedVsFieldCache(int minLength, int maxLength) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field indexedField = new StringField("indexed", "", Field.Store.NO);
+    Field dvField = new SortedDocValuesField("dv", new BytesRef());
+    doc.add(idField);
+    doc.add(indexedField);
+    doc.add(dvField);
+
+    // index some docs
+    int numDocs = atLeast(300);
+    for (int i = 0; i < numDocs; i++) {
+      idField.setStringValue(Integer.toString(i));
+      final int length;
+      if (minLength == maxLength) {
+        length = minLength; // fixed length
+      } else {
+        length = _TestUtil.nextInt(random(), minLength, maxLength);
+      }
+      String value = _TestUtil.randomSimpleString(random(), length);
+      indexedField.setStringValue(value);
+      dvField.setBytesValue(new BytesRef(value));
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs/10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+    writer.close();
+
+    // compare
+    DirectoryReader ir = DirectoryReader.open(dir);
+    for (AtomicReaderContext context : ir.leaves()) {
+      AtomicReader r = context.reader();
+      SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
+      SortedDocValues actual = r.getSortedDocValues("dv");
+      assertEquals(r.maxDoc(), expected, actual);
+    }
+    ir.close();
+    dir.close();
+  }
+
   public void testSortedFixedLengthVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
@@ -1358,6 +1409,21 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     }
   }
 
+  public void testSortedFixedLengthVsFieldCache() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      int fixedLength = _TestUtil.nextInt(random(), 1, 10);
+      doTestSortedVsFieldCache(fixedLength, fixedLength);
+    }
+  }
+
+  public void testSortedVariableLengthVsFieldCache() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedVsFieldCache(1, 10);
+    }
+  }
+
   public void testSortedVariableLengthVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
@@ -1905,6 +1971,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     }
   }
 
+  private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
+    assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual));
+  }
+
   private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
     // can be null for the segment if no docs actually had any SortedDocValues
     // in this case FC.getDocTermsOrds returns EMPTY
@@ -1932,6 +2002,74 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
       actual.lookupTerm(actualBytes);
       assertEquals(expectedBytes, actualBytes);
     }
+
+    // compare termsenum
+    assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
   }
+
+  private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
+    BytesRef ref;
+
+    // sequential next() through all terms
+    while ((ref = expected.next()) != null) {
+      assertEquals(ref, actual.next());
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+    assertNull(actual.next());
+
+    // sequential seekExact(ord) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      actual.seekExact(i);
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // sequential seekExact(BytesRef) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      assertTrue(actual.seekExact(expected.term()));
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // sequential seekCeil(BytesRef) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekExact(ord)
+    for (long i = 0; i < numOrds; i++) {
+      long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+      expected.seekExact(randomOrd);
+      actual.seekExact(randomOrd);
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekExact(BytesRef)
+    for (long i = 0; i < numOrds; i++) {
+      long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+      expected.seekExact(randomOrd);
+      actual.seekExact(expected.term());
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekCeil(BytesRef)
+    for (long i = 0; i < numOrds; i++) {
+      BytesRef target = new BytesRef(_TestUtil.randomUnicodeString(random()));
+      SeekStatus expectedStatus = expected.seekCeil(target);
+      assertEquals(expectedStatus, actual.seekCeil(target));
+      if (expectedStatus != SeekStatus.END) {
+        assertEquals(expected.ord(), actual.ord());
+        assertEquals(expected.term(), actual.term());
+      }
+    }
+  }
 
   private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {