LUCENE-5743: Add Lucene49NormsFormat

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1601606 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-06-10 11:35:48 +00:00
parent 84e1847228
commit 6384ae9fb7
14 changed files with 955 additions and 3 deletions

View File

@@ -129,6 +129,9 @@ New Features
from Directory. Add Lucene49Codec and Lucene49DocValuesFormat that make
use of these. (Robert Muir)
* LUCENE-5743: Add Lucene49NormsFormat, which can compress in some cases
such as very short fields. (Ryan Ernst, Adrien Grand, Robert Muir)
Changes in Backwards Compatibility Policy
* LUCENE-5634: Add reuse argument to IndexableField.tokenStream. This

View File

@@ -0,0 +1,32 @@
package org.apache.lucene.codecs.diskdv;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/** Tests DiskNormsFormat */
public class TestDiskNormsFormat extends BaseNormsFormatTestCase {
private final Codec codec = new CheapBastardCodec();
@Override
protected Codec getCodec() {
return codec;
}
}

View File

@@ -0,0 +1,31 @@
package org.apache.lucene.codecs.simpletext;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/** Tests SimpleTextNormsFormat */
public class TestSimpleTextNormsFormat extends BaseNormsFormatTestCase {
private final Codec codec = new SimpleTextCodec();
@Override
protected Codec getCodec() {
return codec;
}
}

View File

@@ -131,7 +131,7 @@ public class Lucene49Codec extends Codec {
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene49");
-private final NormsFormat normsFormat = new Lucene42NormsFormat();
+private final NormsFormat normsFormat = new Lucene49NormsFormat();
@Override
public final NormsFormat normsFormat() {

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene49;
import java.io.Closeable; // javadocs
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -199,6 +200,7 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Clos
break;
case TABLE_COMPRESSED:
final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
Arrays.sort(decode);
final HashMap<Long,Integer> encode = new HashMap<>();
meta.writeVInt(decode.length);
for (int i = 0; i < decode.length; i++) {

View File

@@ -0,0 +1,208 @@
package org.apache.lucene.codecs.lucene49;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_CURRENT;
/**
* Writer for {@link Lucene49NormsFormat}
*/
class Lucene49NormsConsumer extends DocValuesConsumer {
static final byte DELTA_COMPRESSED = 0;
static final byte TABLE_COMPRESSED = 1;
static final byte CONST_COMPRESSED = 2;
static final byte UNCOMPRESSED = 3;
static final int BLOCK_SIZE = 16384;
IndexOutput data, meta;
final int maxDoc;
Lucene49NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}
// we explicitly use only certain bits per value and a specified format, so we statically check this will work
static {
assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(1);
assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(2);
assert PackedInts.Format.PACKED_SINGLE_BLOCK.isSupported(4);
}
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
meta.writeVInt(field.number);
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
// TODO: more efficient?
HashSet<Long> uniqueValues = new HashSet<>();
long count = 0;
for (Number nv : values) {
if (nv == null) {
throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
}
final long v = nv.longValue();
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
if (uniqueValues != null) {
if (uniqueValues.add(v)) {
if (uniqueValues.size() > 256) {
uniqueValues = null;
}
}
}
++count;
}
if (count != maxDoc) {
throw new IllegalStateException("illegal norms data for field " + field.name + ", expected " + maxDoc + " values, got " + count);
}
if (uniqueValues != null && uniqueValues.size() == 1) {
// 0 bpv
meta.writeByte(CONST_COMPRESSED);
meta.writeLong(minValue);
} else if (uniqueValues != null) {
// small number of unique values: this is the typical case:
// we only use bpv=1,2,4,8
PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
if (bitsPerValue == 3) {
bitsPerValue = 4;
} else if (bitsPerValue > 4) {
bitsPerValue = 8;
}
if (bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
meta.writeByte(UNCOMPRESSED); // uncompressed byte[]
meta.writeLong(data.getFilePointer());
for (Number nv : values) {
data.writeByte(nv == null ? 0 : (byte) nv.longValue());
}
} else {
meta.writeByte(TABLE_COMPRESSED); // table-compressed
meta.writeLong(data.getFilePointer());
data.writeVInt(PackedInts.VERSION_CURRENT);
Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
Arrays.sort(decode);
final HashMap<Long,Integer> encode = new HashMap<>();
// upgrade to power of two sized array
int size = 1 << bitsPerValue;
data.writeVInt(size);
for (int i = 0; i < decode.length; i++) {
data.writeLong(decode[i]);
encode.put(decode[i], i);
}
for (int i = decode.length; i < size; i++) {
data.writeLong(0);
}
data.writeVInt(format.getId());
data.writeVInt(bitsPerValue);
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, format, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
for(Number nv : values) {
writer.add(encode.get(nv.longValue()));
}
writer.finish();
}
} else {
meta.writeByte(DELTA_COMPRESSED); // delta-compressed
meta.writeLong(data.getFilePointer());
data.writeVInt(PackedInts.VERSION_CURRENT);
data.writeVInt(BLOCK_SIZE);
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
for (Number nv : values) {
writer.add(nv.longValue());
}
writer.finish();
}
}
@Override
public void close() throws IOException {
boolean success = false;
try {
if (meta != null) {
meta.writeVInt(-1); // write EOF marker
CodecUtil.writeFooter(meta); // write checksum
}
if (data != null) {
CodecUtil.writeFooter(data); // write checksum
}
success = true;
} finally {
if (success) {
IOUtils.close(data, meta);
} else {
IOUtils.closeWhileHandlingException(data, meta);
}
meta = data = null;
}
}
@Override
public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
throw new UnsupportedOperationException();
}
}
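For illustration, the strategy selection in addNumericField above can be condensed into a few lines. A minimal standalone sketch, not part of this commit (the class name NormsStrategySketch and its main() driver are invented for the example):

import org.apache.lucene.util.packed.PackedInts;

/** Hypothetical sketch: mirrors the decision tree of Lucene49NormsConsumer.addNumericField. */
public class NormsStrategySketch {
  static String chooseStrategy(int numUnique, long min, long max) {
    if (numUnique == 1) {
      return "CONST_COMPRESSED"; // single value for the whole field: metadata only, no data
    }
    if (numUnique <= 256) {
      int bpv = PackedInts.bitsRequired(numUnique - 1);
      // only 1, 2, 4 and 8 bits per value are used, so round up to a supported width
      if (bpv == 3) {
        bpv = 4;
      } else if (bpv > 4) {
        bpv = 8;
      }
      if (bpv == 8 && min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) {
        return "UNCOMPRESSED"; // 8 bpv and values fit in a byte: plain byte[maxDoc]
      }
      return "TABLE_COMPRESSED (" + bpv + " bits/doc)"; // ordinals into a lookup table
    }
    return "DELTA_COMPRESSED"; // more than 256 distinct values: block-packed deltas
  }

  public static void main(String[] args) {
    System.out.println(chooseStrategy(1, 42, 42));      // CONST_COMPRESSED
    System.out.println(chooseStrategy(3, 1000, 5000));  // TABLE_COMPRESSED (2 bits/doc)
    System.out.println(chooseStrategy(200, -100, 100)); // UNCOMPRESSED
    System.out.println(chooseStrategy(5000, 0, 99999)); // DELTA_COMPRESSED
  }
}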

View File

@@ -0,0 +1,121 @@
package org.apache.lucene.codecs.lucene49;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 4.9 Score normalization format.
* <p>
* Encodes normalization values with these strategies:
* <p>
* <ul>
* <li>Uncompressed: when values fit into a single byte and would require more than 4 bits
* per value, they are just encoded as an uncompressed byte array.
* <li>Constant: when there is only one value present for the entire field, no actual data
* is written: this constant is encoded in the metadata
* <li>Table-compressed: when the number of unique values is small (at most 256), and
* when there are unused "gaps" in the range of values used (such as with {@link SmallFloat}),
* a lookup table is written instead. Each per-document entry is instead the ordinal
* to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
* <li>Delta-compressed: per-document integers written as deltas from the minimum value,
* compressed with bitpacking. For more information, see {@link BlockPackedWriter}.
* This is only used when norms larger than one byte are present.
* </ul>
* <p>
* Files:
* <ol>
* <li><tt>.nvd</tt>: Norms data</li>
* <li><tt>.nvm</tt>: Norms metadata</li>
* </ol>
* <ol>
* <li><a name="nvm" id="nvm"></a>
* <p>The Norms metadata or .nvm file.</p>
* <p>For each norms field, this stores metadata, such as the offset into the
* Norms data (.nvd)</p>
* <p>Norms metadata (.nvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>Entry --&gt; FieldNumber,Type,Offset</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeVInt vInt}</li>
* <li>Type --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>Offset --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>FieldNumber of -1 indicates the end of metadata.</p>
* <p>Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value for Constant</p>
* <p>Type indicates how Numeric values will be compressed:
* <ul>
* <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded
* from the minimum value within the block.
* <li>1 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
* a lookup table of unique values is written, followed by the ordinal for each document.
* <li>2 --&gt; constant. When there is a single value for the entire field.
* <li>3 --&gt; uncompressed: Values written as a simple byte[].
* </ul>
* <li><a name="nvd" id="nvd"></a>
* <p>The Norms data or .nvd file.</p>
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
* <p>Norms data (.nvd) --&gt; Header,&lt;Uncompressed | TableCompressed | DeltaCompressed&gt;<sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>Uncompressed --&gt; {@link DataOutput#writeByte Byte}<sup>maxDoc</sup></li>
* <li>TableCompressed --&gt; PackedIntsVersion,Table,BitPackedData</li>
* <li>Table --&gt; TableSize, {@link DataOutput#writeLong int64}<sup>TableSize</sup></li>
* <li>BitPackedData --&gt; {@link PackedInts}</li>
* <li>DeltaCompressed --&gt; PackedIntsVersion,BlockSize,DeltaCompressedData</li>
* <li>DeltaCompressedData --&gt; {@link BlockPackedWriter BlockPackedWriter(blockSize=16k)}</li>
* <li>PackedIntsVersion,BlockSize,TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* </ol>
* @lucene.experimental
*/
public class Lucene49NormsFormat extends NormsFormat {
/** Sole constructor. */
public Lucene49NormsFormat() {}
@Override
public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
return new Lucene49NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
}
@Override
public DocValuesProducer normsProducer(SegmentReadState state) throws IOException {
return new Lucene49NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
}
private static final String DATA_CODEC = "Lucene49NormsData";
private static final String DATA_EXTENSION = "nvd";
private static final String METADATA_CODEC = "Lucene49NormsMetadata";
private static final String METADATA_EXTENSION = "nvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
}
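As a usage note, the format is not instantiated directly; it comes in through the codec. A minimal sketch assuming the Lucene 4.9 APIs touched by this commit (the class name NormsUsageSketch is invented for the example; Lucene49Codec is the default codec in 4.9, so the setCodec call is only for emphasis):

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.lucene49.Lucene49Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/** Hypothetical sketch: indexes one text field so its norms go through Lucene49NormsFormat. */
public class NormsUsageSketch {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_4_9,
        new StandardAnalyzer(Version.LUCENE_4_9));
    cfg.setCodec(new Lucene49Codec()); // norms are written by Lucene49NormsConsumer (.nvd/.nvm)
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new TextField("body", "hello world", Field.Store.NO)); // indexed text field: has norms
    writer.addDocument(doc);
    writer.close();
    dir.close();
  }
}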

View File

@@ -0,0 +1,233 @@
package org.apache.lucene.codecs.lucene49;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.BlockPackedReader;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_START;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.CONST_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.DELTA_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.TABLE_COMPRESSED;
import static org.apache.lucene.codecs.lucene49.Lucene49NormsConsumer.UNCOMPRESSED;
/**
* Reader for {@link Lucene49NormsFormat}
*/
class Lucene49NormsProducer extends DocValuesProducer {
// metadata maps (just file pointers and minimal stuff)
private final Map<Integer,NormsEntry> norms = new HashMap<>();
private final IndexInput data;
private final int version;
// ram instances we have already loaded
final Map<Integer,NumericDocValues> instances = new HashMap<>();
private final int maxDoc;
private final AtomicLong ramBytesUsed;
Lucene49NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
maxDoc = state.segmentInfo.getDocCount();
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
// read in the entries from the metadata file.
ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context);
boolean success = false;
ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
try {
version = CodecUtil.checkHeader(in, metaCodec, VERSION_START, VERSION_CURRENT);
readFields(in, state.fieldInfos);
CodecUtil.checkFooter(in);
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
}
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
this.data = state.directory.openInput(dataName, state.context);
success = false;
try {
final int version2 = CodecUtil.checkHeader(data, dataCodec, VERSION_START, VERSION_CURRENT);
if (version != version2) {
throw new CorruptIndexException("Format versions mismatch");
}
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this.data);
}
}
}
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
int fieldNumber = meta.readVInt();
while (fieldNumber != -1) {
FieldInfo info = infos.fieldInfo(fieldNumber);
if (info == null) {
throw new CorruptIndexException("Invalid field number: " + fieldNumber + " (resource=" + meta + ")");
} else if (!info.hasNorms()) {
throw new CorruptIndexException("Invalid field: " + info.name + " (resource=" + meta + ")");
}
NormsEntry entry = new NormsEntry();
entry.format = meta.readByte();
entry.offset = meta.readLong();
switch(entry.format) {
case CONST_COMPRESSED:
case UNCOMPRESSED:
case TABLE_COMPRESSED:
case DELTA_COMPRESSED:
break;
default:
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
}
norms.put(fieldNumber, entry);
fieldNumber = meta.readVInt();
}
}
@Override
public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException {
NumericDocValues instance = instances.get(field.number);
if (instance == null) {
instance = loadNorms(field);
instances.put(field.number, instance);
}
return instance;
}
@Override
public long ramBytesUsed() {
return ramBytesUsed.get();
}
@Override
public void checkIntegrity() throws IOException {
CodecUtil.checksumEntireFile(data);
}
private NumericDocValues loadNorms(FieldInfo field) throws IOException {
NormsEntry entry = norms.get(field.number);
switch(entry.format) {
case CONST_COMPRESSED:
final long v = entry.offset;
return new NumericDocValues() {
@Override
public long get(int docID) {
return v;
}
};
case UNCOMPRESSED:
data.seek(entry.offset);
final byte bytes[] = new byte[maxDoc];
data.readBytes(bytes, 0, bytes.length);
ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
return new NumericDocValues() {
@Override
public long get(int docID) {
return bytes[docID];
}
};
case DELTA_COMPRESSED:
data.seek(entry.offset);
int packedIntsVersion = data.readVInt();
int blockSize = data.readVInt();
final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, maxDoc, false);
ramBytesUsed.addAndGet(reader.ramBytesUsed());
return reader;
case TABLE_COMPRESSED:
data.seek(entry.offset);
int packedVersion = data.readVInt();
int size = data.readVInt();
if (size > 256) {
throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + data);
}
final long decode[] = new long[size];
for (int i = 0; i < decode.length; i++) {
decode[i] = data.readLong();
}
final int formatID = data.readVInt();
final int bitsPerValue = data.readVInt();
final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedVersion, maxDoc, bitsPerValue);
ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed());
return new NumericDocValues() {
@Override
public long get(int docID) {
return decode[(int)ordsReader.get(docID)];
}
};
default:
throw new AssertionError();
}
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
throw new IllegalStateException();
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
throw new IllegalStateException();
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
throw new IllegalStateException();
}
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
throw new IllegalStateException();
}
@Override
public void close() throws IOException {
data.close();
}
static class NormsEntry {
byte format;
long offset;
}
}
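On the read side the producer is likewise hidden behind the reader API; norms surface as NumericDocValues. A minimal sketch assuming an index written as in the previous example (the helper name dumpNorms is invented):

import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.store.Directory;

/** Hypothetical sketch: reads back per-document norms through the public reader API. */
public class NormsReadSketch {
  static void dumpNorms(Directory dir) throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    try {
      for (AtomicReaderContext ctx : reader.leaves()) {
        // per segment, Lucene49NormsProducer lazily loads and caches one instance per field
        NumericDocValues norms = ctx.reader().getNormValues("body");
        if (norms == null) {
          continue; // field indexed without norms, or absent in this segment
        }
        for (int docID = 0; docID < ctx.reader().maxDoc(); docID++) {
          System.out.println("doc " + docID + " norm=" + norms.get(docID));
        }
      }
    } finally {
      reader.close();
    }
  }
}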

View File

@@ -173,7 +173,7 @@ term occurs in each document. Note that this will not exist if all fields in
all documents omit position data.
</li>
<li>
-{@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Normalization factors}.
+{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Normalization factors}.
For each field in each document, a value is stored
that is multiplied into the score for hits on that field.
</li>
@@ -289,7 +289,7 @@ systems that frequently run out of file handles.</td>
<td>Stores additional per-position metadata information such as character offsets and user payloads</td>
</tr>
<tr>
-<td>{@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Norms}</td>
+<td>{@link org.apache.lucene.codecs.lucene49.Lucene49NormsFormat Norms}</td>
<td>.nvd, .nvm</td>
<td>Encodes length and boost factors for docs and fields</td>
</tr>

View File

@@ -0,0 +1,38 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.junit.BeforeClass;
/** Tests Lucene40's norms format */
public class TestLucene40NormsFormat extends BaseNormsFormatTestCase {
final Codec codec = new Lucene40RWCodec();
@Override
protected Codec getCodec() {
return codec;
}
@BeforeClass
public static void beforeClass() {
OLD_FORMAT_IMPERSONATION_IS_ACTIVE = true; // explicitly instantiates ancient codec
}
}

View File

@@ -0,0 +1,38 @@
package org.apache.lucene.codecs.lucene42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
import org.junit.BeforeClass;
/** Tests Lucene42's norms format */
public class TestLucene42NormsFormat extends BaseNormsFormatTestCase {
final Codec codec = new Lucene42RWCodec();
@Override
protected Codec getCodec() {
return codec;
}
@BeforeClass
public static void beforeClass() {
OLD_FORMAT_IMPERSONATION_IS_ACTIVE = true; // explicitly instantiates ancient codec
}
}

View File

@@ -0,0 +1,33 @@
package org.apache.lucene.codecs.lucene49;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/**
* Tests Lucene49NormsFormat
*/
public class TestLucene49NormsFormat extends BaseNormsFormatTestCase {
final Codec codec = new Lucene49Codec();
@Override
protected Codec getCodec() {
return codec;
}
}

View File

@@ -0,0 +1,30 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
/** Tests the codec configuration defined by LuceneTestCase randomly. */
public class TestNormsFormat extends BaseNormsFormatTestCase {
@Override
protected Codec getCodec() {
return Codec.getDefault();
}
}

View File

@@ -0,0 +1,183 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;
/**
* Abstract class to do basic tests for a norms format.
* NOTE: This test focuses on the norms impl, nothing else.
* The [stretch] goal is for this test to be
* so thorough in testing a new NormsFormat that if this
* test passes, then all Lucene/Solr tests should also pass. That is,
* if there is some bug in a given NormsFormat that this
* test fails to catch then this test needs to be improved! */
public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {
public void testByteRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
doTestNormsVersusStoredFields(new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
}
});
}
}
public void testLongRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
doTestNormsVersusStoredFields(new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
}
});
}
}
public void testFewValues() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
doTestNormsVersusStoredFields(new LongProducer() {
@Override
long next() {
return r.nextBoolean() ? 20 : 3;
}
});
}
}
public void testAllZeros() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
doTestNormsVersusStoredFields(new LongProducer() {
@Override
long next() {
return 0;
}
});
}
}
private void doTestNormsVersusStoredFields(LongProducer longs) throws Exception {
int numDocs = atLeast(500);
long norms[] = new long[numDocs];
for (int i = 0; i < numDocs; i++) {
norms[i] = longs.next();
}
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
conf.setSimilarity(new CannedNormSimilarity(norms));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
Document doc = new Document();
Field idField = new StringField("id", "", Field.Store.NO);
Field storedField = newTextField("stored", "", Field.Store.YES);
doc.add(idField);
doc.add(storedField);
for (int i = 0; i < numDocs; i++) {
idField.setStringValue(Integer.toString(i));
long value = norms[i];
storedField.setStringValue(Long.toString(value));
writer.addDocument(doc);
if (random().nextInt(31) == 0) {
writer.commit();
}
}
// delete some docs
int numDeletions = random().nextInt(numDocs/10);
for (int i = 0; i < numDeletions; i++) {
int id = random().nextInt(numDocs);
writer.deleteDocuments(new Term("id", Integer.toString(id)));
}
writer.shutdown();
// compare
DirectoryReader ir = DirectoryReader.open(dir);
for (AtomicReaderContext context : ir.leaves()) {
AtomicReader r = context.reader();
NumericDocValues docValues = r.getNormValues("stored");
for (int i = 0; i < r.maxDoc(); i++) {
long storedValue = Long.parseLong(r.document(i).get("stored"));
assertEquals(storedValue, docValues.get(i));
}
}
ir.close();
dir.close();
}
static abstract class LongProducer {
abstract long next();
}
static class CannedNormSimilarity extends Similarity {
final long norms[];
int index = 0;
CannedNormSimilarity(long norms[]) {
this.norms = norms;
}
@Override
public long computeNorm(FieldInvertState state) {
return norms[index++];
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
throw new UnsupportedOperationException();
}
@Override
public SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
throw new UnsupportedOperationException();
}
}
@Override
protected void addRandomFields(Document doc) {
// TODO: improve
doc.add(new TextField("foobar", "boo", Field.Store.NO));
}
}