LUCENE-5296: add DirectDocValuesFormat
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1537105 13f79535-47bb-0310-9956-ffa450edef68
parent fe4699b6b0
commit 47480a8496
lucene/CHANGES.txt
@@ -128,6 +128,9 @@ New Features
 
 * LUCENE-5302: Make StemmerOverrideMap's methods public (Alan Woodward)
 
+* LUCENE-5296: Add DirectDocValuesFormat, which holds all doc values
+  in heap as uncompressed java native arrays.  (Mike McCandless)
+
 Bug Fixes
 
 * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
DirectDocValuesConsumer.java (new file)
@@ -0,0 +1,299 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.VERSION_CURRENT;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.BYTES;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_SET;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.NUMBER;

/**
 * Writer for {@link DirectDocValuesFormat}
 */

class DirectDocValuesConsumer extends DocValuesConsumer {
  final IndexOutput data, meta;
  final int maxDoc;

  DirectDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(NUMBER);
    addNumericFieldValues(field, values);
  }

  private void addNumericFieldValues(FieldInfo field, Iterable<Number> values) throws IOException {
    meta.writeLong(data.getFilePointer());
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    boolean missing = false;

    long count = 0;
    for (Number nv : values) {
      if (nv != null) {
        long v = nv.longValue();
        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);
      } else {
        missing = true;
      }
      count++;
      if (count >= DirectDocValuesFormat.MAX_SORTED_SET_ORDS) {
        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + DirectDocValuesFormat.MAX_SORTED_SET_ORDS + " values/total ords");
      }
    }
    meta.writeInt((int) count);

    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    byte byteWidth;
    if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      byteWidth = 1;
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      byteWidth = 2;
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      byteWidth = 4;
    } else {
      byteWidth = 8;
    }
    meta.writeByte(byteWidth);

    for (Number nv : values) {
      long v;
      if (nv != null) {
        v = nv.longValue();
      } else {
        v = 0;
      }

      switch(byteWidth) {
      case 1:
        data.writeByte((byte) v);
        break;
      case 2:
        data.writeShort((short) v);
        break;
      case 4:
        data.writeInt((int) v);
        break;
      case 8:
        data.writeLong(v);
        break;
      }
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(BYTES);
    addBinaryFieldValues(field, values);
  }

  private void addBinaryFieldValues(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
    final long startFP = data.getFilePointer();
    boolean missing = false;
    long totalBytes = 0;
    int count = 0;
    for(BytesRef v : values) {
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
        totalBytes += v.length;
        if (totalBytes > DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH) {
          throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, cannot have more than DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH (" + DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH + ") bytes");
        }
      } else {
        missing = true;
      }
      count++;
    }

    meta.writeLong(startFP);
    meta.writeInt((int) totalBytes);
    meta.writeInt(count);
    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    int addr = 0;
    for (BytesRef v : values) {
      data.writeInt(addr);
      if (v != null) {
        addr += v.length;
      }
    }
    data.writeInt(addr);
  }

  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
  // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
  void writeMissingBitset(Iterable<?> values) throws IOException {
    long bits = 0;
    int count = 0;
    for (Object v : values) {
      if (count == 64) {
        data.writeLong(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      data.writeLong(bits);
    }
  }

  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(SORTED);

    // write the ordinals as numerics
    addNumericFieldValues(field, docToOrd);

    // write the values as binary
    addBinaryFieldValues(field, values);
  }

  // note: this might not be the most efficient... but it's fairly simple
  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(SORTED_SET);

    // First write docToOrdCounts, except we "aggregate" the
    // counts so they turn into addresses, and add a final
    // value = the total aggregate:
    addNumericFieldValues(field, new Iterable<Number>() {

      // Just aggregates the count values so they become
      // "addresses", and adds one more value in the end
      // (the final sum):

      @Override
      public Iterator<Number> iterator() {
        final Iterator<Number> iter = docToOrdCount.iterator();

        return new Iterator<Number>() {

          long sum;
          boolean ended;

          @Override
          public boolean hasNext() {
            return iter.hasNext() || !ended;
          }

          @Override
          public Number next() {
            long toReturn = sum;

            if (iter.hasNext()) {
              Number n = iter.next();
              if (n != null) {
                sum += n.longValue();
              }
            } else if (!ended) {
              ended = true;
            } else {
              assert false;
            }

            return toReturn;
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };
      }
    });

    // Write ordinals for all docs, appended into one big
    // numerics:
    addNumericFieldValues(field, ords);

    // write the values as binary
    addBinaryFieldValues(field, values);
  }
}
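The numeric path above makes two compact decisions worth seeing in isolation: it scans min/max once to pick the narrowest array width (1, 2, 4, or 8 bytes), and it packs one present/missing bit per document into 64-bit words. The standalone sketch below (not part of the commit; pickByteWidth and packMissingBitset are illustrative names, not Lucene APIs) mirrors that logic:

import java.util.ArrayList;
import java.util.List;

public class DirectEncodingSketch {

  // Mirrors addNumericFieldValues: pick the narrowest width covering the range.
  static byte pickByteWidth(long minValue, long maxValue) {
    if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      return 1;
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      return 2;
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      return 4;
    } else {
      return 8;
    }
  }

  // Mirrors writeMissingBitset: one bit per doc, flushed every 64 docs.
  static List<Long> packMissingBitset(Long[] values) {
    List<Long> words = new ArrayList<Long>();
    long bits = 0;
    int count = 0;
    for (Long v : values) {
      if (count == 64) {
        words.add(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      words.add(bits);
    }
    return words;
  }

  public static void main(String[] args) {
    // {-3, 100, 20000} overflows byte range but fits a short[]: width 2.
    System.out.println(pickByteWidth(-3, 20000));                     // 2
    // docs 0 and 2 present, doc 1 missing: a single word 0b101 = 5.
    System.out.println(packMissingBitset(new Long[] {7L, null, 9L})); // [5]
  }
}

Packing 64 docs per long keeps the missing bitset at roughly maxDoc/8 bytes, which is why the producer below can slurp it back with length >> 3 readLong calls.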
DirectDocValuesFormat.java (new file)
@@ -0,0 +1,78 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.ArrayUtil;

/** In-memory docvalues format that does no (or very little)
 *  compression. Indexed values are stored on disk, but
 *  then at search time all values are loaded into memory as
 *  simple java arrays. For numeric values, it uses
 *  byte[], short[], int[], long[] as necessary to fit the
 *  range of the values. For binary values, there is an int
 *  (4 bytes) overhead per value.
 *
 *  <p>Limitations:
 *  <ul>
 *    <li>For binary and sorted fields the total space
 *        required for all binary values cannot exceed about
 *        2.1 GB (see {@link #MAX_TOTAL_BYTES_LENGTH}).</li>
 *
 *    <li>For sorted set fields, the sum of the size of each
 *        document's set of values cannot exceed about 2.1 B
 *        (see {@link #MAX_SORTED_SET_ORDS}).</li>
 *  </ul> */

public class DirectDocValuesFormat extends DocValuesFormat {

  /** The sum of all byte lengths for binary field, or for
   *  the unique values in sorted or sorted set fields, cannot
   *  exceed this. */
  public final static int MAX_TOTAL_BYTES_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;

  /** The sum of the number of values across all documents
   *  in a sorted set field cannot exceed this. */
  public final static int MAX_SORTED_SET_ORDS = ArrayUtil.MAX_ARRAY_LENGTH;

  /** Sole constructor. */
  public DirectDocValuesFormat() {
    super("Direct");
  }

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    return new DirectDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    return new DirectDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  static final String DATA_CODEC = "DirectDocValuesData";
  static final String DATA_EXTENSION = "dvdd";
  static final String METADATA_CODEC = "DirectDocValuesMetadata";
  static final String METADATA_EXTENSION = "dvdm";
}
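As a usage note, not part of this commit: one plausible way to route a single field through the new format is Lucene's per-field codec override. The sketch below assumes the Lucene45Codec override hook available on trunk at the time, plus a made-up field name "popularity"; treat it as an illustration, not the commit's own wiring:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene45.Lucene45Codec;
import org.apache.lucene.codecs.memory.DirectDocValuesFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.Version;

public class DirectFormatUsageSketch {
  static IndexWriterConfig newConfig() {
    final DocValuesFormat direct = new DirectDocValuesFormat();
    Codec codec = new Lucene45Codec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // Only the (hypothetical) "popularity" field pays the heap cost;
        // every other field keeps the codec default:
        return "popularity".equals(field) ? direct : super.getDocValuesFormatForField(field);
      }
    };
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_45);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    iwc.setCodec(codec);
    return iwc;
  }
}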
DirectDocValuesProducer.java (new file)
@@ -0,0 +1,478 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Reader for {@link DirectDocValuesFormat}
 */

class DirectDocValuesProducer extends DocValuesProducer {
  // metadata maps (just file pointers and minimal stuff)
  private final Map<Integer,NumericEntry> numerics = new HashMap<Integer,NumericEntry>();
  private final Map<Integer,BinaryEntry> binaries = new HashMap<Integer,BinaryEntry>();
  private final Map<Integer,SortedEntry> sorteds = new HashMap<Integer,SortedEntry>();
  private final Map<Integer,SortedSetEntry> sortedSets = new HashMap<Integer,SortedSetEntry>();
  private final IndexInput data;

  // ram instances we have already loaded
  private final Map<Integer,NumericDocValues> numericInstances =
      new HashMap<Integer,NumericDocValues>();
  private final Map<Integer,BinaryDocValues> binaryInstances =
      new HashMap<Integer,BinaryDocValues>();
  private final Map<Integer,SortedDocValues> sortedInstances =
      new HashMap<Integer,SortedDocValues>();
  private final Map<Integer,SortedSetRawValues> sortedSetInstances =
      new HashMap<Integer,SortedSetRawValues>();
  private final Map<Integer,Bits> docsWithFieldInstances = new HashMap<Integer,Bits>();

  private final int maxDoc;

  static final byte NUMBER = 0;
  static final byte BYTES = 1;
  static final byte SORTED = 2;
  static final byte SORTED_SET = 3;

  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  DirectDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.getDocCount();
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    // read in the entries from the metadata file.
    IndexInput in = state.directory.openInput(metaName, state.context);
    boolean success = false;
    final int version;
    try {
      version = CodecUtil.checkHeader(in, metaCodec,
                                      VERSION_START,
                                      VERSION_CURRENT);
      readFields(in);

      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
    }

    success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.openInput(dataName, state.context);
      final int version2 = CodecUtil.checkHeader(data, dataCodec,
                                                 VERSION_START,
                                                 VERSION_CURRENT);
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this.data);
      }
    }
  }

  private NumericEntry readNumericEntry(IndexInput meta) throws IOException {
    NumericEntry entry = new NumericEntry();
    entry.offset = meta.readLong();
    entry.count = meta.readInt();
    entry.missingOffset = meta.readLong();
    if (entry.missingOffset != -1) {
      entry.missingBytes = meta.readLong();
    } else {
      entry.missingBytes = 0;
    }
    entry.byteWidth = meta.readByte();

    return entry;
  }

  private BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
    BinaryEntry entry = new BinaryEntry();
    entry.offset = meta.readLong();
    entry.numBytes = meta.readInt();
    entry.count = meta.readInt();
    entry.missingOffset = meta.readLong();
    if (entry.missingOffset != -1) {
      entry.missingBytes = meta.readLong();
    } else {
      entry.missingBytes = 0;
    }

    return entry;
  }

  private SortedEntry readSortedEntry(IndexInput meta) throws IOException {
    SortedEntry entry = new SortedEntry();
    entry.docToOrd = readNumericEntry(meta);
    entry.values = readBinaryEntry(meta);
    return entry;
  }

  private SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
    SortedSetEntry entry = new SortedSetEntry();
    entry.docToOrdAddress = readNumericEntry(meta);
    entry.ords = readNumericEntry(meta);
    entry.values = readBinaryEntry(meta);
    return entry;
  }

  private void readFields(IndexInput meta) throws IOException {
    int fieldNumber = meta.readVInt();
    while (fieldNumber != -1) {
      int fieldType = meta.readByte();
      if (fieldType == NUMBER) {
        numerics.put(fieldNumber, readNumericEntry(meta));
      } else if (fieldType == BYTES) {
        binaries.put(fieldNumber, readBinaryEntry(meta));
      } else if (fieldType == SORTED) {
        sorteds.put(fieldNumber, readSortedEntry(meta));
      } else if (fieldType == SORTED_SET) {
        sortedSets.put(fieldNumber, readSortedSetEntry(meta));
      } else {
        throw new CorruptIndexException("invalid entry type: " + fieldType + ", input=" + meta);
      }
      fieldNumber = meta.readVInt();
    }
  }

  @Override
  public long ramBytesUsed() {
    // TODO: optimize me
    return RamUsageEstimator.sizeOf(this);
  }

  @Override
  public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException {
    NumericDocValues instance = numericInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadNumeric(numerics.get(field.number));
      numericInstances.put(field.number, instance);
    }
    return instance;
  }

  private NumericDocValues loadNumeric(NumericEntry entry) throws IOException {
    data.seek(entry.offset + entry.missingBytes);
    switch (entry.byteWidth) {
    case 1:
      {
        final byte[] values = new byte[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readByte();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 2:
      {
        final short[] values = new short[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readShort();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 4:
      {
        final int[] values = new int[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readInt();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    case 8:
      {
        final long[] values = new long[entry.count];
        for(int i=0;i<entry.count;i++) {
          values[i] = data.readLong();
        }
        return new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
      }

    default:
      throw new AssertionError();
    }
  }

  @Override
  public synchronized BinaryDocValues getBinary(FieldInfo field) throws IOException {
    BinaryDocValues instance = binaryInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadBinary(binaries.get(field.number));
      binaryInstances.put(field.number, instance);
    }
    return instance;
  }

  private BinaryDocValues loadBinary(BinaryEntry entry) throws IOException {
    data.seek(entry.offset);
    final byte[] bytes = new byte[entry.numBytes];
    data.readBytes(bytes, 0, entry.numBytes);
    data.seek(entry.offset + entry.numBytes + entry.missingBytes);

    final int[] address = new int[entry.count+1];
    for(int i=0;i<entry.count;i++) {
      address[i] = data.readInt();
    }
    address[entry.count] = data.readInt();

    return new BinaryDocValues() {
      @Override
      public void get(int docID, BytesRef result) {
        result.bytes = bytes;
        result.offset = address[docID];
        result.length = address[docID+1] - result.offset;
      }
    };
  }

  @Override
  public synchronized SortedDocValues getSorted(FieldInfo field) throws IOException {
    SortedDocValues instance = sortedInstances.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadSorted(field);
      sortedInstances.put(field.number, instance);
    }
    return instance;
  }

  private SortedDocValues loadSorted(FieldInfo field) throws IOException {
    final SortedEntry entry = sorteds.get(field.number);
    final NumericDocValues docToOrd = loadNumeric(entry.docToOrd);
    final BinaryDocValues values = loadBinary(entry.values);

    return new SortedDocValues() {

      @Override
      public int getOrd(int docID) {
        return (int) docToOrd.get(docID);
      }

      @Override
      public void lookupOrd(int ord, BytesRef result) {
        values.get(ord, result);
      }

      @Override
      public int getValueCount() {
        return entry.values.count;
      }

      // Leave lookupTerm to super's binary search

      // Leave termsEnum to super
    };
  }

  @Override
  public synchronized SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    SortedSetRawValues instance = sortedSetInstances.get(field.number);
    final SortedSetEntry entry = sortedSets.get(field.number);
    if (instance == null) {
      // Lazy load
      instance = loadSortedSet(entry);
      sortedSetInstances.put(field.number, instance);
    }

    final NumericDocValues docToOrdAddress = instance.docToOrdAddress;
    final NumericDocValues ords = instance.ords;
    final BinaryDocValues values = instance.values;

    // Must make a new instance since the iterator has state:
    return new SortedSetDocValues() {
      int ordUpto;
      int ordLimit;

      @Override
      public long nextOrd() {
        if (ordUpto == ordLimit) {
          return NO_MORE_ORDS;
        } else {
          return ords.get(ordUpto++);
        }
      }

      @Override
      public void setDocument(int docID) {
        ordUpto = (int) docToOrdAddress.get(docID);
        ordLimit = (int) docToOrdAddress.get(docID+1);
      }

      @Override
      public void lookupOrd(long ord, BytesRef result) {
        values.get((int) ord, result);
      }

      @Override
      public long getValueCount() {
        return entry.values.count;
      }

      // Leave lookupTerm to super's binary search

      // Leave termsEnum to super
    };
  }

  private SortedSetRawValues loadSortedSet(SortedSetEntry entry) throws IOException {
    SortedSetRawValues instance = new SortedSetRawValues();
    instance.docToOrdAddress = loadNumeric(entry.docToOrdAddress);
    instance.ords = loadNumeric(entry.ords);
    instance.values = loadBinary(entry.values);
    return instance;
  }

  private Bits getMissingBits(int fieldNumber, final long offset, final long length) throws IOException {
    if (offset == -1) {
      return new Bits.MatchAllBits(maxDoc);
    } else {
      Bits instance;
      synchronized(this) {
        instance = docsWithFieldInstances.get(fieldNumber);
        if (instance == null) {
          IndexInput data = this.data.clone();
          data.seek(offset);
          assert length % 8 == 0;
          long bits[] = new long[(int) length >> 3];
          for (int i = 0; i < bits.length; i++) {
            bits[i] = data.readLong();
          }
          instance = new FixedBitSet(bits, maxDoc);
          docsWithFieldInstances.put(fieldNumber, instance);
        }
      }
      return instance;
    }
  }

  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    switch(field.getDocValuesType()) {
    case SORTED_SET:
      return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
    case SORTED:
      return new SortedDocsWithField(getSorted(field), maxDoc);
    case BINARY:
      BinaryEntry be = binaries.get(field.number);
      return getMissingBits(field.number, be.missingOffset, be.missingBytes);
    case NUMERIC:
      NumericEntry ne = numerics.get(field.number);
      return getMissingBits(field.number, ne.missingOffset, ne.missingBytes);
    default:
      throw new AssertionError();
    }
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  static class SortedSetRawValues {
    NumericDocValues docToOrdAddress;
    NumericDocValues ords;
    BinaryDocValues values;
  }

  static class NumericEntry {
    long offset;
    int count;
    long missingOffset;
    long missingBytes;
    byte byteWidth;
    int packedIntsVersion;
  }

  static class BinaryEntry {
    long offset;
    long missingOffset;
    long missingBytes;
    int count;
    int numBytes;
    int minLength;
    int maxLength;
    int packedIntsVersion;
    int blockSize;
  }

  static class SortedEntry {
    NumericEntry docToOrd;
    BinaryEntry values;
  }

  static class SortedSetEntry {
    NumericEntry docToOrdAddress;
    NumericEntry ords;
    BinaryEntry values;
  }

  static class FSTEntry {
    long offset;
    long numOrds;
  }
}
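For a rough sense of the heap cost once these loadNumeric/loadBinary calls run: a numeric field costs about count * byteWidth bytes, and a binary field adds an int[count+1] address table (about 4 bytes per document) on top of the concatenated values. The figures in the sketch below are illustrative assumptions, not measurements:

public class DirectHeapEstimateSketch {
  public static void main(String[] args) {
    long docs = 10000000L;  // assume a 10M-doc segment
    long byteWidth = 2;     // assume the writer chose short[] for the value range
    // Numeric field: one array element per document.
    System.out.println("numeric field: ~" + (docs * byteWidth) + " bytes");
    // Binary field: concatenated values plus the per-doc address table.
    long totalValueBytes = 50000000L;  // assume ~5 bytes of value per doc
    System.out.println("binary field: ~" + (totalValueBytes + 4 * (docs + 1)) + " bytes");
  }
}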
META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -15,4 +15,5 @@
 org.apache.lucene.codecs.diskdv.DiskDocValuesFormat
 org.apache.lucene.codecs.memory.MemoryDocValuesFormat
+org.apache.lucene.codecs.memory.DirectDocValuesFormat
 org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat
 
TestDirectDocValuesFormat.java (new file)
@@ -0,0 +1,34 @@
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseDocValuesFormatTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Tests DirectDocValuesFormat
 */
public class TestDirectDocValuesFormat extends BaseDocValuesFormatTestCase {
  private final Codec codec = _TestUtil.alwaysDocValuesFormat(new DirectDocValuesFormat());

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
BinaryDocValuesWriter.java
@@ -24,11 +24,12 @@ import java.util.NoSuchElementException;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer;
 import org.apache.lucene.util.packed.PackedInts;
 
@@ -36,12 +37,8 @@ import org.apache.lucene.util.packed.PackedInts;
  * segment flushes. */
 class BinaryDocValuesWriter extends DocValuesWriter {
 
-  /** Maximum length for a binary field; we set this to "a
-   * bit" below Integer.MAX_VALUE because the exact max
-   * allowed byte[] is JVM dependent, so we want to avoid
-   * a case where a large value worked in one JVM but
-   * failed later at search time with a different JVM. */
-  private static final int MAX_LENGTH = Integer.MAX_VALUE-256;
+  /** Maximum length for a binary field. */
+  private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
 
   // 32 KB block sizes for PagedBytes storage:
   private final static int BLOCK_BITS = 15;
ArrayUtil.java
@@ -28,6 +28,14 @@ import java.util.Comparator;
 
 public final class ArrayUtil {
 
+  /** Maximum length for an array; we set this to "a
+   * bit" below Integer.MAX_VALUE because the exact max
+   * allowed byte[] is JVM dependent, so we want to avoid
+   * a case where a large value worked during indexing on
+   * one JVM but failed later at search time with a
+   * different JVM. */
+  public static final int MAX_ARRAY_LENGTH = Integer.MAX_VALUE - 256;
+
   private ArrayUtil() {} // no instance
 
   /*
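A hedged sketch of how calling code might lean on the new constant; requireArraySize is an illustrative helper, not a Lucene API:

import org.apache.lucene.util.ArrayUtil;

public class MaxArrayLengthSketch {
  // Fail fast with a clear message instead of a JVM-specific OutOfMemoryError
  // when a requested allocation creeps up on Integer.MAX_VALUE.
  static int requireArraySize(long needed) {
    if (needed > ArrayUtil.MAX_ARRAY_LENGTH) {
      throw new IllegalArgumentException(
          "needed " + needed + " exceeds ArrayUtil.MAX_ARRAY_LENGTH");
    }
    return (int) needed;
  }

  public static void main(String[] args) {
    byte[] buf = new byte[requireArraySize(1024)];
    System.out.println(buf.length);
  }
}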
BaseDocValuesFormatTestCase.java
@@ -186,6 +186,44 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     directory.close();
   }
 
+  public void testTwoBinaryValues() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
+    doc.add(newTextField("fieldname", text, Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", new BytesRef(longTerm)));
+    doc.add(new BinaryDocValuesField("dv2", new BytesRef(text)));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader = DirectoryReader.open(directory); // read-only=true
+    IndexSearcher isearcher = new IndexSearcher(ireader);
+
+    assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits);
+    Query query = new TermQuery(new Term("fieldname", "text"));
+    TopDocs hits = isearcher.search(query, null, 1);
+    assertEquals(1, hits.totalHits);
+    // Iterate through the results:
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      StoredDocument hitDoc = isearcher.doc(hits.scoreDocs[i].doc);
+      assertEquals(text, hitDoc.get("fieldname"));
+      assert ireader.leaves().size() == 1;
+      BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1");
+      BytesRef scratch = new BytesRef();
+      dv.get(hits.scoreDocs[i].doc, scratch);
+      assertEquals(new BytesRef(longTerm), scratch);
+      dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv2");
+      dv.get(hits.scoreDocs[i].doc, scratch);
+      assertEquals(new BytesRef(text), scratch);
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
   public void testTwoFieldsMixed() throws IOException {
     Directory directory = newDirectory();
     RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
BaseDocValuesFormatTestCase.java
@@ -2943,12 +2981,12 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
       if (values.length > 0) {
         assertNotNull(sortedSet);
         sortedSet.setDocument(j);
-        for (int i = 0; i < values.length; i++) {
+        for (int k = 0; k < values.length; k++) {
           long ord = sortedSet.nextOrd();
           assertTrue(ord != SortedSetDocValues.NO_MORE_ORDS);
           BytesRef value = new BytesRef();
           sortedSet.lookupOrd(ord, value);
-          assertEquals(values[i], value.utf8ToString());
+          assertEquals(values[k], value.utf8ToString());
         }
         assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd());
         assertTrue(sortedSetBits.get(j));