4.0 VAR_DEREF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438119 13f79535-47bb-0310-9956-ffa450edef68
2013-01-24 18:31:28 +00:00 · 2013-01-24 18:31:28 +00:00 · 48719dcfba
parent b8c2238a46
commit 48719dcfba
2 changed files with 98 additions and 4 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
@ -401,7 +401,50 @@ class Lucene40DocValuesReader extends DocValuesProducer {
  }
  
  private BinaryDocValues loadBytesVarDeref(FieldInfo field) throws IOException {
-    throw new AssertionError(); // nocommit
+    String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "dat");
+    String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "idx");
+    IndexInput data = null;
+    IndexInput index = null;
+    boolean success = false;
+    try {
+      data = dir.openInput(dataName, state.context);
+      CodecUtil.checkHeader(data, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT, 
+                                  Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START, 
+                                  Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
+      index = dir.openInput(indexName, state.context);
+      CodecUtil.checkHeader(index, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX, 
+                                   Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START, 
+                                   Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
+      
+      final long totalBytes = index.readLong();
+      // nocommit? can the current impl even handle > 2G?
+      final byte bytes[] = new byte[(int)totalBytes];
+      data.readBytes(bytes, 0, bytes.length);
+      final PackedInts.Reader reader = PackedInts.getReader(index);
+      success = true;
+      return new BinaryDocValues() {
+        @Override
+        public void get(int docID, BytesRef result) {
+          int startAddress = (int)reader.get(docID);
+          result.bytes = bytes;
+          result.offset = startAddress;
+          if ((bytes[startAddress] & 128) == 0) {
+            // length is 1 byte
+            result.offset++;
+            result.length = bytes[startAddress];
+          } else {
+            result.offset += 2;
+            result.length = ((bytes[startAddress] & 0x7f) << 8) | ((bytes[startAddress+1] & 0xff));
+          }
+        }
+      };
+    } finally {
+      if (success) {
+        IOUtils.close(data, index);
+      } else {
+        IOUtils.closeWhileHandlingException(data, index);
+      }
+    }
  }

  @Override
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java
@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene40;
 */

 import java.io.IOException;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.TreeSet;

@ -168,8 +169,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
    
    int maxDoc = state.segmentInfo.getDocCount();
    final boolean fixed = minLength == maxLength;
-    // nocommit
-    final boolean dedup = fixed && (uniqueValues != null && uniqueValues.size() * 2 < maxDoc);
+    final boolean dedup = uniqueValues != null && uniqueValues.size() * 2 < maxDoc;
    
    if (dedup) {
      // we will deduplicate and deref values
@ -184,7 +184,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
        if (fixed) {
          addFixedDerefBytesField(field, data, index, values, minLength);
        } else {
-          assert false; // nocommit
+          addVarDerefBytesField(field, data, index, values);
        }
        success = true;
      } finally {
@ -323,6 +323,57 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
    }
    w.finish();
  }
+  
+  private void addVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values) throws IOException {
+    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.name());
+
+    CodecUtil.writeHeader(data, 
+                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
+                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
+    
+    CodecUtil.writeHeader(index, 
+                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
+                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
+    
+    // deduplicate
+    TreeSet<BytesRef> dictionary = new TreeSet<BytesRef>();
+    for (BytesRef v : values) {
+      dictionary.add(BytesRef.deepCopyOf(v));
+    }
+    
+    /* values */
+    long startPosition = data.getFilePointer();
+    long currentAddress = 0;
+    HashMap<BytesRef,Long> valueToAddress = new HashMap<BytesRef,Long>();
+    for (BytesRef v : dictionary) {
+      currentAddress = data.getFilePointer() - startPosition;
+      valueToAddress.put(v, currentAddress);
+      writeVShort(data, v.length);
+      data.writeBytes(v.bytes, v.offset, v.length);
+    }
+    
+    /* ordinals */
+    long totalBytes = data.getFilePointer() - startPosition;
+    index.writeLong(totalBytes);
+    final int maxDoc = state.segmentInfo.getDocCount();
+    final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT);
+
+    for (BytesRef v : values) {
+      w.add(valueToAddress.get(v));
+    }
+    w.finish();
+  }
+  
+  // the little vint encoding used for var-deref
+  private static void writeVShort(IndexOutput o, int i) throws IOException {
+    assert i >= 0 && i <= Short.MAX_VALUE;
+    if (i < 128) {
+      o.writeByte((byte)i);
+    } else {
+      o.writeByte((byte) (0x80 | (i >> 8)));
+      o.writeByte((byte) (i & 0xff));
+    }
+  }

  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {