HBASE-3857 New files

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1153640 13f79535-47bb-0310-9956-ffa450edef68
Zhihong Yu 2011-08-03 20:18:03 +00:00
parent b354e4ea98
commit 8e2b477566
6 changed files with 2869 additions and 0 deletions


@@ -0,0 +1,475 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.RawComparator;
import static org.apache.hadoop.hbase.io.hfile.HFile.MIN_FORMAT_VERSION;
import static org.apache.hadoop.hbase.io.hfile.HFile.MAX_FORMAT_VERSION;
import com.google.common.io.NullOutputStream;
/**
* The {@link HFile} has a fixed trailer which contains offsets to other
* variable parts of the file. Also includes basic metadata on this file. The
* trailer size is fixed within a given {@link HFile} format version only, but
* we always store the version number as the last four-byte integer of the file.
*/
public class FixedFileTrailer {
private static final Log LOG = LogFactory.getLog(FixedFileTrailer.class);
/**
* We store the comparator class name as a fixed-length field in the trailer.
*/
private static final int MAX_COMPARATOR_NAME_LENGTH = 128;
/**
* Offset to the fileinfo data, a small block of vitals. Necessary in v1 but
* only potentially useful for pretty-printing in v2.
*/
private long fileInfoOffset;
/**
* In version 1, the offset to the data block index. Starting from version 2,
* the meaning of this field is the offset to the section of the file that
* should be loaded at the time the file is being opened, and as of the time
* of writing, this happens to be the offset of the file info section.
*/
private long loadOnOpenDataOffset;
/** The number of entries in the root data index. */
private int dataIndexCount;
/** Total uncompressed size of all blocks of the data index */
private long uncompressedDataIndexSize;
/** The number of entries in the meta index */
private int metaIndexCount;
/** The total uncompressed size of keys/values stored in the file. */
private long totalUncompressedBytes;
/**
* The number of key/value pairs in the file. This field was int in version 1,
* but is now long.
*/
private long entryCount;
/** The compression codec used for all blocks. */
private Compression.Algorithm compressionCodec = Compression.Algorithm.NONE;
/**
* The number of levels in the potentially multi-level data index. Used from
* version 2 onwards.
*/
private int numDataIndexLevels;
/** The offset of the first data block. */
private long firstDataBlockOffset;
/**
* It is guaranteed that no key/value data blocks start after this offset in
* the file.
*/
private long lastDataBlockOffset;
/** Raw key comparator class name in version 2 */
private String comparatorClassName = RawComparator.class.getName();
/** The {@link HFile} format version. */
private final int version;
FixedFileTrailer(int version) {
this.version = version;
HFile.checkFormatVersion(version);
}
private static int[] computeTrailerSizeByVersion() {
int versionToSize[] = new int[HFile.MAX_FORMAT_VERSION + 1];
for (int version = MIN_FORMAT_VERSION;
version <= MAX_FORMAT_VERSION;
++version) {
FixedFileTrailer fft = new FixedFileTrailer(version);
DataOutputStream dos = new DataOutputStream(new NullOutputStream());
try {
fft.serialize(dos);
} catch (IOException ex) {
// The above has no reason to fail.
throw new RuntimeException(ex);
}
versionToSize[version] = dos.size();
}
return versionToSize;
}
private static int getMaxTrailerSize() {
int maxSize = 0;
for (int version = MIN_FORMAT_VERSION;
version <= MAX_FORMAT_VERSION;
++version)
maxSize = Math.max(getTrailerSize(version), maxSize);
return maxSize;
}
private static final int TRAILER_SIZE[] = computeTrailerSizeByVersion();
private static final int MAX_TRAILER_SIZE = getMaxTrailerSize();
static int getTrailerSize(int version) {
return TRAILER_SIZE[version];
}
public int getTrailerSize() {
return getTrailerSize(version);
}
/**
* Write the trailer to a data stream. We support writing version 1 for
* testing and for determining the version 1 trailer size. It also makes it
* easy to see which fields changed in version 2.
*
* @param outputStream
* @throws IOException
*/
void serialize(DataOutputStream outputStream) throws IOException {
HFile.checkFormatVersion(version);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutput baosDos = new DataOutputStream(baos);
BlockType.TRAILER.write(baosDos);
baosDos.writeLong(fileInfoOffset);
baosDos.writeLong(loadOnOpenDataOffset);
baosDos.writeInt(dataIndexCount);
if (version == 1) {
// This used to be metaIndexOffset, but it was not used in version 1.
baosDos.writeLong(0);
} else {
baosDos.writeLong(uncompressedDataIndexSize);
}
baosDos.writeInt(metaIndexCount);
baosDos.writeLong(totalUncompressedBytes);
if (version == 1) {
baosDos.writeInt((int) Math.min(Integer.MAX_VALUE, entryCount));
} else {
// This field is long from version 2 onwards.
baosDos.writeLong(entryCount);
}
baosDos.writeInt(compressionCodec.ordinal());
if (version > 1) {
baosDos.writeInt(numDataIndexLevels);
baosDos.writeLong(firstDataBlockOffset);
baosDos.writeLong(lastDataBlockOffset);
Bytes.writeStringFixedSize(baosDos, comparatorClassName,
MAX_COMPARATOR_NAME_LENGTH);
}
baosDos.writeInt(version);
outputStream.write(baos.toByteArray());
}
/**
* Deserialize the fixed file trailer from the given stream. The version needs
* to already be specified. Make sure this is consistent with
* {@link #serialize(DataOutputStream)}.
*
* @param inputStream
* @param version
* @throws IOException
*/
void deserialize(DataInputStream inputStream) throws IOException {
HFile.checkFormatVersion(version);
BlockType.TRAILER.readAndCheck(inputStream);
fileInfoOffset = inputStream.readLong();
loadOnOpenDataOffset = inputStream.readLong();
dataIndexCount = inputStream.readInt();
if (version == 1) {
inputStream.readLong(); // Read and skip metaIndexOffset.
} else {
uncompressedDataIndexSize = inputStream.readLong();
}
metaIndexCount = inputStream.readInt();
totalUncompressedBytes = inputStream.readLong();
entryCount = version == 1 ? inputStream.readInt() : inputStream.readLong();
compressionCodec = Compression.Algorithm.values()[inputStream.readInt()];
if (version > 1) {
numDataIndexLevels = inputStream.readInt();
firstDataBlockOffset = inputStream.readLong();
lastDataBlockOffset = inputStream.readLong();
comparatorClassName =
Bytes.readStringFixedSize(inputStream, MAX_COMPARATOR_NAME_LENGTH);
}
expectVersion(inputStream.readInt());
}
private void append(StringBuilder sb, String s) {
if (sb.length() > 0)
sb.append(", ");
sb.append(s);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
append(sb, "fileinfoOffset=" + fileInfoOffset);
append(sb, "loadOnOpenDataOffset=" + loadOnOpenDataOffset);
append(sb, "dataIndexCount=" + dataIndexCount);
append(sb, "metaIndexCount=" + metaIndexCount);
append(sb, "totalUncomressedBytes=" + totalUncompressedBytes);
append(sb, "entryCount=" + entryCount);
append(sb, "compressionCodec=" + compressionCodec);
if (version == 2) {
append(sb, "uncompressedDataIndexSize=" + uncompressedDataIndexSize);
append(sb, "numDataIndexLevels=" + numDataIndexLevels);
append(sb, "firstDataBlockOffset=" + firstDataBlockOffset);
append(sb, "lastDataBlockOffset=" + lastDataBlockOffset);
append(sb, "comparatorClassName=" + comparatorClassName);
}
append(sb, "version=" + version);
return sb.toString();
}
/**
* Reads a file trailer from the given file.
*
* @param istream the input stream with the ability to seek. Does not have to
* be buffered, as only one read operation is made.
* @param fileSize the file size. Can be obtained using
* {@link org.apache.hadoop.fs.FileSystem#getFileStatus(
* org.apache.hadoop.fs.Path)}.
* @return the fixed file trailer read
* @throws IOException if failed to read from the underlying stream, or the
* trailer is corrupted, or the version of the trailer is
* unsupported
*/
public static FixedFileTrailer readFromStream(FSDataInputStream istream,
long fileSize) throws IOException {
int bufferSize = MAX_TRAILER_SIZE;
long seekPoint = fileSize - bufferSize;
if (seekPoint < 0) {
// It is hard to imagine such a small HFile.
seekPoint = 0;
bufferSize = (int) fileSize;
}
istream.seek(seekPoint);
ByteBuffer buf = ByteBuffer.allocate(bufferSize);
istream.readFully(buf.array(), buf.arrayOffset(),
buf.arrayOffset() + buf.limit());
// Read the version from the last int of the file.
buf.position(buf.limit() - Bytes.SIZEOF_INT);
int version = buf.getInt();
try {
HFile.checkFormatVersion(version);
} catch (IllegalArgumentException iae) {
// In this context, an invalid version might indicate a corrupt HFile.
throw new IOException(iae);
}
int trailerSize = getTrailerSize(version);
FixedFileTrailer fft = new FixedFileTrailer(version);
fft.deserialize(new DataInputStream(new ByteArrayInputStream(buf.array(),
buf.arrayOffset() + bufferSize - trailerSize, trailerSize)));
return fft;
}
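// Illustrative usage (a sketch, not part of this class): the Configuration,
// FileSystem and Path values below are assumptions made for the example.
//
//   FileSystem fs = FileSystem.get(conf);
//   Path hfilePath = new Path("/tmp/example.hfile");
//   long fileSize = fs.getFileStatus(hfilePath).getLen();
//   FSDataInputStream in = fs.open(hfilePath);
//   try {
//     FixedFileTrailer trailer = FixedFileTrailer.readFromStream(in, fileSize);
//     System.out.println("HFile format version " + trailer.getVersion()
//         + ": " + trailer);
//   } finally {
//     in.close();
//   }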
public void expectVersion(int expected) {
if (version != expected) {
throw new IllegalArgumentException("Invalid HFile version: " + version
+ " (expected: " + expected + ")");
}
}
public void expectAtLeastVersion(int lowerBound) {
if (version < lowerBound) {
throw new IllegalArgumentException("Invalid HFile version: " + version
+ " (expected: " + lowerBound + " or higher).");
}
}
public long getFileInfoOffset() {
return fileInfoOffset;
}
public void setFileInfoOffset(long fileInfoOffset) {
this.fileInfoOffset = fileInfoOffset;
}
public long getLoadOnOpenDataOffset() {
return loadOnOpenDataOffset;
}
public void setLoadOnOpenOffset(long loadOnOpenDataOffset) {
this.loadOnOpenDataOffset = loadOnOpenDataOffset;
}
public int getDataIndexCount() {
return dataIndexCount;
}
public void setDataIndexCount(int dataIndexCount) {
this.dataIndexCount = dataIndexCount;
}
public int getMetaIndexCount() {
return metaIndexCount;
}
public void setMetaIndexCount(int metaIndexCount) {
this.metaIndexCount = metaIndexCount;
}
public long getTotalUncompressedBytes() {
return totalUncompressedBytes;
}
public void setTotalUncompressedBytes(long totalUncompressedBytes) {
this.totalUncompressedBytes = totalUncompressedBytes;
}
public long getEntryCount() {
return entryCount;
}
public void setEntryCount(long newEntryCount) {
if (version == 1) {
int intEntryCount = (int) Math.min(Integer.MAX_VALUE, newEntryCount);
if (intEntryCount != newEntryCount) {
LOG.info("Warning: entry count is " + newEntryCount + " but writing "
+ intEntryCount + " into the version " + version + " trailer");
}
entryCount = intEntryCount;
return;
}
entryCount = newEntryCount;
}
public Compression.Algorithm getCompressionCodec() {
return compressionCodec;
}
public void setCompressionCodec(Compression.Algorithm compressionCodec) {
this.compressionCodec = compressionCodec;
}
public int getNumDataIndexLevels() {
expectAtLeastVersion(2);
return numDataIndexLevels;
}
public void setNumDataIndexLevels(int numDataIndexLevels) {
expectAtLeastVersion(2);
this.numDataIndexLevels = numDataIndexLevels;
}
public long getLastDataBlockOffset() {
expectAtLeastVersion(2);
return lastDataBlockOffset;
}
public void setLastDataBlockOffset(long lastDataBlockOffset) {
expectAtLeastVersion(2);
this.lastDataBlockOffset = lastDataBlockOffset;
}
public long getFirstDataBlockOffset() {
expectAtLeastVersion(2);
return firstDataBlockOffset;
}
public void setFirstDataBlockOffset(long firstDataBlockOffset) {
expectAtLeastVersion(2);
this.firstDataBlockOffset = firstDataBlockOffset;
}
public int getVersion() {
return version;
}
@SuppressWarnings("rawtypes")
public void setComparatorClass(Class<? extends RawComparator> klass) {
expectAtLeastVersion(2);
comparatorClassName = klass.getName();
}
@SuppressWarnings("unchecked")
private static Class<? extends RawComparator<byte[]>> getComparatorClass(
String comparatorClassName) throws IOException {
try {
return (Class<? extends RawComparator<byte[]>>)
Class.forName(comparatorClassName);
} catch (ClassNotFoundException ex) {
throw new IOException(ex);
}
}
public static RawComparator<byte[]> createComparator(
String comparatorClassName) throws IOException {
try {
return getComparatorClass(comparatorClassName).newInstance();
} catch (InstantiationException e) {
throw new IOException(e);
} catch (IllegalAccessException e) {
throw new IOException(e);
}
}
RawComparator<byte[]> createComparator() throws IOException {
expectAtLeastVersion(2);
return createComparator(comparatorClassName);
}
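// Illustrative round trip (a sketch; it assumes KeyValue.KeyComparator is a
// RawComparator with the public no-argument constructor that
// createComparator(String) requires):
//
//   FixedFileTrailer t = new FixedFileTrailer(2);
//   t.setComparatorClass(KeyValue.KeyComparator.class);
//   RawComparator<byte[]> cmp = t.createComparator();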
public long getUncompressedDataIndexSize() {
if (version == 1)
return 0;
return uncompressedDataIndexSize;
}
public void setUncompressedDataIndexSize(
long uncompressedDataIndexSize) {
expectAtLeastVersion(2);
this.uncompressedDataIndexSize = uncompressedDataIndexSize;
}
}


@@ -0,0 +1,666 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.io.hfile.HFile.Reader;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.RawComparator;
import com.google.common.base.Preconditions;
/**
* {@link HFile} reader for version 1.
*/
public class HFileReaderV1 extends AbstractHFileReader {
private static final Log LOG = LogFactory.getLog(HFileReaderV1.class);
private volatile boolean fileInfoLoaded = false;
/**
* Opens an HFile. You must load the index by calling {@link #loadFileInfo()}
* before you can use this reader.
*
* @param fsdis input stream. Caller is responsible for closing the passed
* stream.
* @param size Length of the stream.
* @param blockCache block cache. Pass null if none.
* @param inMemory whether blocks should be marked as in-memory in cache
* @param evictOnClose whether blocks in cache should be evicted on close
* @throws IOException
*/
public HFileReaderV1(Path path, FixedFileTrailer trailer,
final FSDataInputStream fsdis, final long size,
final boolean closeIStream,
final BlockCache blockCache, final boolean inMemory,
final boolean evictOnClose) {
super(path, trailer, fsdis, size, closeIStream, blockCache, inMemory,
evictOnClose);
trailer.expectVersion(1);
fsBlockReader = new HFileBlock.FSReaderV1(fsdis, compressAlgo, fileSize);
}
private byte[] readAllIndex(final FSDataInputStream in,
final long indexOffset, final int indexSize) throws IOException {
byte[] allIndex = new byte[indexSize];
in.seek(indexOffset);
IOUtils.readFully(in, allIndex, 0, allIndex.length);
return allIndex;
}
/**
* Read in the index and file info.
*
* @return A map of fileinfo data.
* @see {@link Writer#appendFileInfo(byte[], byte[])}.
* @throws IOException
*/
@Override
public FileInfo loadFileInfo() throws IOException {
if (fileInfoLoaded)
return fileInfo;
// Read in the fileinfo and get what we need from it.
istream.seek(trailer.getFileInfoOffset());
fileInfo = new FileInfo();
fileInfo.readFields(istream);
lastKey = fileInfo.get(FileInfo.LASTKEY);
avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
// Comparator is stored in the file info in version 1.
String clazzName = Bytes.toString(fileInfo.get(FileInfo.COMPARATOR));
comparator = getComparator(clazzName);
dataBlockIndexReader =
new HFileBlockIndex.BlockIndexReader(comparator, 1);
metaBlockIndexReader =
new HFileBlockIndex.BlockIndexReader(Bytes.BYTES_RAWCOMPARATOR, 1);
int sizeToLoadOnOpen = (int) (fileSize - trailer.getLoadOnOpenDataOffset() -
trailer.getTrailerSize());
byte[] dataAndMetaIndex = readAllIndex(istream,
trailer.getLoadOnOpenDataOffset(), sizeToLoadOnOpen);
ByteArrayInputStream bis = new ByteArrayInputStream(dataAndMetaIndex);
DataInputStream dis = new DataInputStream(bis);
// Read in the data index.
if (trailer.getDataIndexCount() > 0)
BlockType.INDEX_V1.readAndCheck(dis);
dataBlockIndexReader.readRootIndex(dis, trailer.getDataIndexCount());
// Read in the metadata index.
if (trailer.getMetaIndexCount() > 0)
BlockType.INDEX_V1.readAndCheck(dis);
metaBlockIndexReader.readRootIndex(dis, trailer.getMetaIndexCount());
fileInfoLoaded = true;
return fileInfo;
}
/**
* Creates comparator from the given class name.
*
* @param clazzName the comparator class name read from the trailer
* @return an instance of the comparator to use
* @throws IOException in case comparator class name is invalid
*/
@SuppressWarnings("unchecked")
private RawComparator<byte[]> getComparator(final String clazzName)
throws IOException {
if (clazzName == null || clazzName.length() == 0) {
return null;
}
try {
return (RawComparator<byte[]>)Class.forName(clazzName).newInstance();
} catch (InstantiationException e) {
throw new IOException(e);
} catch (IllegalAccessException e) {
throw new IOException(e);
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
/**
* Create a Scanner on this file. No seeks or reads are done on creation. Call
* {@link HFileScanner#seekTo(byte[])} to position and start the read. There is
* nothing to clean up in a Scanner. Letting go of your references to the
* scanner is sufficient.
*
* @param cacheBlocks True if we should cache blocks read in by this scanner.
* @param pread Use positional read rather than seek+read if true (pread is
* better for random reads, seek+read is better scanning).
* @param isCompaction is scanner being used for a compaction?
* @return Scanner on this file.
*/
@Override
public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
final boolean isCompaction) {
return new ScannerV1(this, cacheBlocks, pread, isCompaction);
}
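// Illustrative usage of a keyed seek (a sketch; the reader variable and the
// key bytes are assumptions made for the example):
//
//   HFileScanner scanner = reader.getScanner(true, false, false);
//   int result = scanner.seekTo(Bytes.toBytes("some-key"));
//   if (result == 0) {
//     KeyValue kv = scanner.getKeyValue(); // exact match, positioned on key
//   } else if (result == 1) {
//     // positioned on the last key that sorts before the requested key
//   } else { // result == -1
//     // the requested key sorts before the first key of the file
//   }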
/**
* @param key Key to search.
* @return Block number of the block containing the key or -1 if not in this
* file.
*/
protected int blockContainingKey(final byte[] key, int offset, int length) {
Preconditions.checkState(!dataBlockIndexReader.isEmpty(),
"Block index not loaded");
return dataBlockIndexReader.rootBlockContainingKey(key, offset, length);
}
/**
* @param metaBlockName
* @param cacheBlock Add block to cache, if found
* @return Block wrapped in a ByteBuffer
* @throws IOException
*/
@Override
public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
throws IOException {
if (trailer.getMetaIndexCount() == 0) {
return null; // there are no meta blocks
}
if (metaBlockIndexReader == null) {
throw new IOException("Meta index not loaded");
}
byte[] nameBytes = Bytes.toBytes(metaBlockName);
int block = metaBlockIndexReader.rootBlockContainingKey(nameBytes, 0,
nameBytes.length);
if (block == -1)
return null;
long offset = metaBlockIndexReader.getRootBlockOffset(block);
long nextOffset;
if (block == metaBlockIndexReader.getRootBlockCount() - 1) {
nextOffset = trailer.getFileInfoOffset();
} else {
nextOffset = metaBlockIndexReader.getRootBlockOffset(block + 1);
}
long startTimeNs = System.nanoTime();
String cacheKey = HFile.getBlockCacheKey(name, offset);
// Per meta key from any given file, synchronize reads for said block
synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
metaLoads.incrementAndGet();
// Check cache for block. If found return.
if (blockCache != null) {
HFileBlock cachedBlock = (HFileBlock) blockCache.getBlock(cacheKey,
true);
if (cachedBlock != null) {
cacheHits.incrementAndGet();
return cachedBlock.getBufferWithoutHeader();
}
// Cache Miss, please load.
}
HFileBlock hfileBlock = fsBlockReader.readBlockData(offset,
nextOffset - offset, metaBlockIndexReader.getRootBlockDataSize(block),
true);
hfileBlock.expectType(BlockType.META);
HFile.readTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.readOps.incrementAndGet();
// Cache the block
if (cacheBlock && blockCache != null) {
blockCache.cacheBlock(cacheKey, hfileBlock, inMemory);
}
return hfileBlock.getBufferWithoutHeader();
}
}
/**
* Read in a file block.
* @param block Index of block to read.
* @param pread Use positional read instead of seek+read (positional is
* better doing random reads whereas seek+read is better scanning).
* @param isCompaction is this block being read as part of a compaction
* @return Block wrapped in a ByteBuffer.
* @throws IOException
*/
ByteBuffer readBlockBuffer(int block, boolean cacheBlock,
final boolean pread, final boolean isCompaction) throws IOException {
if (dataBlockIndexReader == null) {
throw new IOException("Block index not loaded");
}
if (block < 0 || block >= dataBlockIndexReader.getRootBlockCount()) {
throw new IOException("Requested block is out of range: " + block +
", max: " + dataBlockIndexReader.getRootBlockCount());
}
long offset = dataBlockIndexReader.getRootBlockOffset(block);
String cacheKey = HFile.getBlockCacheKey(name, offset);
// For any given block from any given file, synchronize reads for said
// block.
// Without a cache, this synchronizing is needless overhead, but really
// the other choice is to duplicate work (which the cache would prevent you
// from doing).
synchronized (dataBlockIndexReader.getRootBlockKey(block)) {
blockLoads.incrementAndGet();
// Check cache for block. If found return.
if (blockCache != null) {
HFileBlock cachedBlock = (HFileBlock) blockCache.getBlock(cacheKey,
true);
if (cachedBlock != null) {
cacheHits.incrementAndGet();
return cachedBlock.getBufferWithoutHeader();
}
// Carry on, please load.
}
// Load block from filesystem.
long startTimeNs = System.nanoTime();
long nextOffset;
if (block == dataBlockIndexReader.getRootBlockCount() - 1) {
// Last block! The end of the data blocks is the first meta block if there
// is one, or the fileinfo offset if there isn't.
nextOffset = (metaBlockIndexReader.getRootBlockCount() == 0) ?
this.trailer.getFileInfoOffset() :
metaBlockIndexReader.getRootBlockOffset(0);
} else {
nextOffset = dataBlockIndexReader.getRootBlockOffset(block + 1);
}
HFileBlock hfileBlock = fsBlockReader.readBlockData(offset, nextOffset
- offset, dataBlockIndexReader.getRootBlockDataSize(block), pread);
hfileBlock.expectType(BlockType.DATA);
ByteBuffer buf = hfileBlock.getBufferWithoutHeader();
HFile.readTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.readOps.incrementAndGet();
// Cache the block
if (cacheBlock && blockCache != null) {
blockCache.cacheBlock(cacheKey, hfileBlock, inMemory);
}
return buf;
}
}
/**
* @return Last key in the file. May be null if file has no entries.
* Note that this is not the last rowkey, but rather the byte form of
* the last KeyValue.
*/
public byte[] getLastKey() {
if (!fileInfoLoaded) {
throw new RuntimeException("Load file info first");
}
return dataBlockIndexReader.isEmpty() ? null : lastKey;
}
/**
* @return Midkey for this file. We work at block boundaries only, so the
* returned midkey is only an approximation.
*
* @throws IOException
*/
@Override
public byte[] midkey() throws IOException {
Preconditions.checkState(isFileInfoLoaded(), "File info is not loaded");
Preconditions.checkState(!dataBlockIndexReader.isEmpty(),
"Data block index is not loaded or is empty");
return dataBlockIndexReader.midkey();
}
@Override
public void close() throws IOException {
if (evictOnClose && this.blockCache != null) {
int numEvicted = 0;
for (int i = 0; i < dataBlockIndexReader.getRootBlockCount(); i++) {
if (blockCache.evictBlock(HFile.getBlockCacheKey(name,
dataBlockIndexReader.getRootBlockOffset(i))))
numEvicted++;
}
LOG.debug("On close of file " + name + " evicted " + numEvicted
+ " block(s) of " + dataBlockIndexReader.getRootBlockCount()
+ " total blocks");
}
if (this.closeIStream && this.istream != null) {
this.istream.close();
this.istream = null;
}
}
/**
* Implementation of {@link HFileScanner} interface.
*/
protected static class ScannerV1 extends AbstractHFileReader.Scanner {
private final HFileReaderV1 readerV1;
private int currBlock;
public ScannerV1(HFileReaderV1 reader, boolean cacheBlocks,
final boolean pread, final boolean isCompaction) {
super(reader, cacheBlocks, pread, isCompaction);
readerV1 = reader;
}
@Override
public KeyValue getKeyValue() {
if (blockBuffer == null) {
return null;
}
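// The key length and value length ints (2 * 4 bytes) have already been
// consumed from the buffer, so back up 8 bytes to point the KeyValue at the
// start of this entry.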
return new KeyValue(blockBuffer.array(), blockBuffer.arrayOffset()
+ blockBuffer.position() - 8);
}
@Override
public ByteBuffer getKey() {
Preconditions.checkState(blockBuffer != null && currKeyLen > 0,
"you need to seekTo() before calling getKey()");
ByteBuffer keyBuff = blockBuffer.slice();
keyBuff.limit(currKeyLen);
keyBuff.rewind();
// Do keyBuff.asReadOnly()?
return keyBuff;
}
@Override
public ByteBuffer getValue() {
if (blockBuffer == null || currKeyLen == 0) {
throw new RuntimeException(
"you need to seekTo() before calling getValue()");
}
// TODO: Could this be done with one ByteBuffer rather than create two?
ByteBuffer valueBuff = blockBuffer.slice();
valueBuff.position(currKeyLen);
valueBuff = valueBuff.slice();
valueBuff.limit(currValueLen);
valueBuff.rewind();
return valueBuff;
}
@Override
public boolean next() throws IOException {
if (blockBuffer == null) {
throw new IOException("Next called on non-seeked scanner");
}
try {
blockBuffer.position(blockBuffer.position() + currKeyLen
+ currValueLen);
} catch (IllegalArgumentException e) {
LOG.error("Current pos = " + blockBuffer.position() +
"; currKeyLen = " + currKeyLen +
"; currValLen = " + currValueLen +
"; block limit = " + blockBuffer.limit() +
"; HFile name = " + reader.getName() +
"; currBlock id = " + currBlock, e);
throw e;
}
if (blockBuffer.remaining() <= 0) {
// LOG.debug("Fetch next block");
currBlock++;
if (currBlock >= reader.getDataBlockIndexReader().getRootBlockCount()) {
// damn we are at the end
currBlock = 0;
blockBuffer = null;
return false;
}
blockBuffer = readerV1.readBlockBuffer(currBlock, cacheBlocks, pread,
isCompaction);
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
blockFetches++;
return true;
}
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
return true;
}
@Override
public int seekTo(byte[] key) throws IOException {
return seekTo(key, 0, key.length);
}
@Override
public int seekTo(byte[] key, int offset, int length) throws IOException {
int b = readerV1.blockContainingKey(key, offset, length);
if (b < 0) return -1; // falls before the beginning of the file! :-(
// Avoid re-reading the same block (that'd be dumb).
loadBlock(b, true);
return blockSeek(key, offset, length, false);
}
@Override
public int reseekTo(byte[] key) throws IOException {
return reseekTo(key, 0, key.length);
}
@Override
public int reseekTo(byte[] key, int offset, int length)
throws IOException {
if (blockBuffer != null && currKeyLen != 0) {
ByteBuffer bb = getKey();
int compared = reader.getComparator().compare(key, offset,
length, bb.array(), bb.arrayOffset(), bb.limit());
if (compared <= 0) {
// If the required key is less than or equal to current key, then
// don't do anything.
return compared;
}
}
int b = readerV1.blockContainingKey(key, offset, length);
if (b < 0) {
return -1;
}
loadBlock(b, false);
return blockSeek(key, offset, length, false);
}
/**
* Within a loaded block, seek looking for the first key
* that is smaller than (or equal to?) the key we are interested in.
*
* A note on seekBefore: if seekBefore = true AND the first key in the
* block equals the given key, an exception will be thrown.
* @param key the key to find
* @param seekBefore find the key before the exact match.
* @return 0 in case of an exact match, 1 otherwise
*/
private int blockSeek(byte[] key, int offset, int length,
boolean seekBefore) {
int klen, vlen;
int lastLen = 0;
do {
klen = blockBuffer.getInt();
vlen = blockBuffer.getInt();
int comp = reader.getComparator().compare(key, offset, length,
blockBuffer.array(),
blockBuffer.arrayOffset() + blockBuffer.position(), klen);
if (comp == 0) {
if (seekBefore) {
blockBuffer.position(blockBuffer.position() - lastLen - 16);
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
return 1; // non exact match.
}
currKeyLen = klen;
currValueLen = vlen;
return 0; // indicate exact match
}
if (comp < 0) {
// go back one key:
blockBuffer.position(blockBuffer.position() - lastLen - 16);
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
return 1;
}
blockBuffer.position(blockBuffer.position() + klen + vlen);
lastLen = klen + vlen;
} while (blockBuffer.remaining() > 0);
// OK, we are at the end, so go back to the start of the last key/value.
// The 8 below is intentionally different from the 16s above: inside the
// loop we had already consumed the next entry's two 4-byte length fields,
// whereas here we only need to skip back over the last entry's data
// (lastLen) plus its own two length fields.
blockBuffer.position(blockBuffer.position() - lastLen - 8);
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
return 1; // didn't exactly find it.
}
@Override
public boolean seekBefore(byte[] key) throws IOException {
return seekBefore(key, 0, key.length);
}
@Override
public boolean seekBefore(byte[] key, int offset, int length)
throws IOException {
int b = readerV1.blockContainingKey(key, offset, length);
if (b < 0)
return false; // key is before the start of the file.
// Question: does this block begin with 'key'?
byte[] firstKey = reader.getDataBlockIndexReader().getRootBlockKey(b);
if (reader.getComparator().compare(firstKey, 0, firstKey.length,
key, offset, length) == 0) {
// Ok the key we're interested in is the first of the block, so go back
// by one.
if (b == 0) {
// we have a 'problem', the key we want is the first of the file.
return false;
}
b--;
// TODO shortcut: seek forward in this block to the last key of the
// block.
}
loadBlock(b, true);
blockSeek(key, offset, length, true);
return true;
}
@Override
public String getKeyString() {
return Bytes.toStringBinary(blockBuffer.array(),
blockBuffer.arrayOffset() + blockBuffer.position(), currKeyLen);
}
@Override
public String getValueString() {
return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset() +
blockBuffer.position() + currKeyLen, currValueLen);
}
@Override
public Reader getReader() {
return reader;
}
@Override
public boolean seekTo() throws IOException {
if (reader.getDataBlockIndexReader().isEmpty()) {
return false;
}
if (blockBuffer != null && currBlock == 0) {
blockBuffer.rewind();
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
return true;
}
currBlock = 0;
blockBuffer = readerV1.readBlockBuffer(currBlock, cacheBlocks, pread,
isCompaction);
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
blockFetches++;
return true;
}
private void loadBlock(int bloc, boolean rewind) throws IOException {
if (blockBuffer == null) {
blockBuffer = readerV1.readBlockBuffer(bloc, cacheBlocks, pread,
isCompaction);
currBlock = bloc;
blockFetches++;
} else {
if (bloc != currBlock) {
blockBuffer = readerV1.readBlockBuffer(bloc, cacheBlocks, pread,
isCompaction);
currBlock = bloc;
blockFetches++;
} else {
// we are already in the same block, just rewind to seek again.
if (rewind) {
blockBuffer.rewind();
}
else {
// Go back over the key length and value length ints (2 * 4 = 8 bytes).
blockBuffer.position(blockBuffer.position() - 8);
}
}
}
}
}
@Override
public HFileBlock readBlock(long offset, int onDiskBlockSize,
boolean cacheBlock, boolean pread, boolean isCompaction) {
throw new UnsupportedOperationException();
}
@Override
public DataInput getBloomFilterMetadata() throws IOException {
ByteBuffer buf = getMetaBlock(HFileWriterV1.BLOOM_FILTER_META_KEY, false);
if (buf == null)
return null;
ByteArrayInputStream bais = new ByteArrayInputStream(buf.array(),
buf.arrayOffset(), buf.limit());
return new DataInputStream(bais);
}
@Override
public boolean isFileInfoLoaded() {
return fileInfoLoaded;
}
}


@@ -0,0 +1,732 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.DataInput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.IdLock;
/**
* {@link HFile} reader for version 2.
*/
public class HFileReaderV2 extends AbstractHFileReader implements
HFileBlock.BasicReader {
private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
/**
* The size of a (key length, value length) tuple that prefixes each entry in
* a data block.
*/
private static final int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
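// Each entry in a data block is laid out as
//   [int key length][int value length][key bytes][value bytes]
// so KEY_VALUE_LEN_SIZE is the size of the two length fields that prefix
// every key/value pair.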
/**
* A "sparse lock" implementation allowing to lock on a particular block
* identified by offset. The purpose of this is to avoid two clients loading
* the same block, and have all but one client wait to get the block from the
* cache.
*/
private IdLock offsetLock = new IdLock();
/**
* Blocks read from the load-on-open section, excluding data root index, meta
* index, and file info.
*/
private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
/**
* Opens an HFile. You must load the index by calling {@link #loadFileInfo()}
* before you can use this reader.
*
* @param fsdis input stream. Caller is responsible for closing the passed
* stream.
* @param size Length of the stream.
* @param blockCache block cache. Pass null if none.
* @param inMemory whether blocks should be marked as in-memory in cache
* @param evictOnClose whether blocks in cache should be evicted on close
* @throws IOException
*/
public HFileReaderV2(Path path, FixedFileTrailer trailer,
final FSDataInputStream fsdis, final long size,
final boolean closeIStream, final BlockCache blockCache,
final boolean inMemory, final boolean evictOnClose) throws IOException {
super(path, trailer, fsdis, size, closeIStream, blockCache, inMemory,
evictOnClose);
trailer.expectVersion(2);
fsBlockReader = new HFileBlock.FSReaderV2(fsdis, compressAlgo,
fileSize);
// Comparator class name is stored in the trailer in version 2.
comparator = trailer.createComparator();
dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
trailer.getNumDataIndexLevels(), this);
metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
Bytes.BYTES_RAWCOMPARATOR, 1);
// Parse load-on-open data.
HFileBlock.BlockIterator blockIter = fsBlockReader.blockRange(
trailer.getLoadOnOpenDataOffset(),
fileSize - trailer.getTrailerSize());
// Data index. We also read statistics about the block index written after
// the root level.
dataBlockIndexReader.readMultiLevelIndexRoot(
blockIter.nextBlockAsStream(BlockType.ROOT_INDEX),
trailer.getDataIndexCount());
// Meta index.
metaBlockIndexReader.readRootIndex(
blockIter.nextBlockAsStream(BlockType.ROOT_INDEX),
trailer.getMetaIndexCount());
// File info
fileInfo = new FileInfo();
fileInfo.readFields(blockIter.nextBlockAsStream(BlockType.FILE_INFO));
lastKey = fileInfo.get(FileInfo.LASTKEY);
avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
// Store all other load-on-open blocks for further consumption.
HFileBlock b;
while ((b = blockIter.nextBlock()) != null) {
loadOnOpenBlocks.add(b);
}
}
/**
* Create a Scanner on this file. No seeks or reads are done on creation. Call
* {@link HFileScanner#seekTo(byte[])} to position and start the read. There is
* nothing to clean up in a Scanner. Letting go of your references to the
* scanner is sufficient.
*
* @param cacheBlocks True if we should cache blocks read in by this scanner.
* @param pread Use positional read rather than seek+read if true (pread is
* better for random reads, seek+read is better scanning).
* @param isCompaction is scanner being used for a compaction?
* @return Scanner on this file.
*/
@Override
public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
final boolean isCompaction) {
return new ScannerV2(this, cacheBlocks, pread, isCompaction);
}
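// Illustrative usage for a full scan (a sketch; the reader variable is an
// assumption made for the example):
//
//   HFileScanner scanner = reader.getScanner(true, false, false);
//   if (scanner.seekTo()) {        // position at the first key/value
//     do {
//       KeyValue kv = scanner.getKeyValue();
//       // ... process kv ...
//     } while (scanner.next());    // advance until the last key/value
//   }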
/**
* @param metaBlockName
* @param cacheBlock Add block to cache, if found
* @return block wrapped in a ByteBuffer, with header skipped
* @throws IOException
*/
@Override
public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
throws IOException {
if (trailer.getMetaIndexCount() == 0) {
return null; // there are no meta blocks
}
if (metaBlockIndexReader == null) {
throw new IOException("Meta index not loaded");
}
byte[] mbname = Bytes.toBytes(metaBlockName);
int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
mbname.length);
if (block == -1)
return null;
long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
long startTimeNs = System.nanoTime();
// Per meta key from any given file, synchronize reads for said block. This
// is OK to do for meta blocks because the meta block index is always
// single-level.
synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
metaLoads.incrementAndGet();
// Check cache for block. If found return.
long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
String cacheKey = HFile.getBlockCacheKey(name, metaBlockOffset);
if (blockCache != null) {
HFileBlock cachedBlock = (HFileBlock) blockCache.getBlock(cacheKey,
true);
if (cachedBlock != null) {
// Return a distinct 'shallow copy' of the block,
// so pos does not get messed by the scanner
cacheHits.incrementAndGet();
return cachedBlock.getBufferWithoutHeader();
}
// Cache Miss, please load.
}
HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
blockSize, -1, true);
HFile.readTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.readOps.incrementAndGet();
// Cache the block
if (cacheBlock && blockCache != null) {
blockCache.cacheBlock(cacheKey, metaBlock, inMemory);
}
return metaBlock.getBufferWithoutHeader();
}
}
/**
* Implements the "basic block reader" API, used mainly by
* {@link HFileBlockIndex.BlockIndexReader} in
* {@link HFileBlockIndex.BlockIndexReader#seekToDataBlock(byte[], int, int,
* HFileBlock)} in a random-read access pattern.
*/
@Override
public HFileBlock readBlockData(long offset, long onDiskSize,
int uncompressedSize, boolean pread) throws IOException {
if (onDiskSize >= Integer.MAX_VALUE) {
throw new IOException("Invalid on-disk size: " + onDiskSize);
}
// Assuming we are not doing a compaction.
return readBlock(offset, (int) onDiskSize, true, pread, false);
}
/**
* Read in a file block.
*
* @param dataBlockOffset offset to read.
* @param onDiskSize size of the block
* @param pread Use positional read instead of seek+read (positional is better
* doing random reads whereas seek+read is better scanning).
* @param isCompaction is this block being read as part of a compaction
* @return Block wrapped in a ByteBuffer.
* @throws IOException
*/
@Override
public HFileBlock readBlock(long dataBlockOffset, int onDiskBlockSize,
boolean cacheBlock, boolean pread, final boolean isCompaction)
throws IOException {
if (dataBlockIndexReader == null) {
throw new IOException("Block index not loaded");
}
if (dataBlockOffset < 0
|| dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
throw new IOException("Requested block is out of range: "
+ dataBlockOffset + ", lastDataBlockOffset: "
+ trailer.getLastDataBlockOffset());
}
// For any given block from any given file, synchronize reads for said
// block.
// Without a cache, this synchronizing is needless overhead, but really
// the other choice is to duplicate work (which the cache would prevent you
// from doing).
String cacheKey = HFile.getBlockCacheKey(name, dataBlockOffset);
IdLock.Entry lockEntry = offsetLock.getLockEntry(dataBlockOffset);
try {
blockLoads.incrementAndGet();
// Check cache for block. If found return.
if (blockCache != null) {
HFileBlock cachedBlock = (HFileBlock) blockCache.getBlock(cacheKey,
true);
if (cachedBlock != null) {
cacheHits.incrementAndGet();
return cachedBlock;
}
// Carry on, please load.
}
// Load block from filesystem.
long startTimeNs = System.nanoTime();
HFileBlock dataBlock = fsBlockReader.readBlockData(dataBlockOffset,
onDiskBlockSize, -1, pread);
HFile.readTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.readOps.incrementAndGet();
// Cache the block
if (cacheBlock && blockCache != null) {
blockCache.cacheBlock(cacheKey, dataBlock, inMemory);
}
return dataBlock;
} finally {
offsetLock.releaseLockEntry(lockEntry);
}
}
/**
* @return Last key in the file. May be null if file has no entries. Note that
* this is not the last row key, but rather the byte form of the last
* KeyValue.
*/
@Override
public byte[] getLastKey() {
return dataBlockIndexReader.isEmpty() ? null : lastKey;
}
/**
* @return Midkey for this file. We work at block boundaries only, so the
* returned midkey is only an approximation.
* @throws IOException
*/
@Override
public byte[] midkey() throws IOException {
return dataBlockIndexReader.midkey();
}
@Override
public void close() throws IOException {
if (evictOnClose && blockCache != null) {
int numEvicted = blockCache.evictBlocksByPrefix(name
+ HFile.CACHE_KEY_SEPARATOR);
LOG.debug("On close of file " + name + " evicted " + numEvicted
+ " block(s)");
}
if (closeIStream && istream != null) {
istream.close();
istream = null;
}
}
/**
* Implementation of {@link HFileScanner} interface.
*/
protected static class ScannerV2 extends AbstractHFileReader.Scanner {
private HFileBlock block;
public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
final boolean pread, final boolean isCompaction) {
super(r, cacheBlocks, pread, isCompaction);
}
@Override
public KeyValue getKeyValue() {
if (!isSeeked())
return null;
return new KeyValue(blockBuffer.array(), blockBuffer.arrayOffset()
+ blockBuffer.position());
}
@Override
public ByteBuffer getKey() {
assertSeeked();
return ByteBuffer.wrap(
blockBuffer.array(),
blockBuffer.arrayOffset() + blockBuffer.position()
+ KEY_VALUE_LEN_SIZE, currKeyLen).slice();
}
@Override
public ByteBuffer getValue() {
assertSeeked();
return ByteBuffer.wrap(
blockBuffer.array(),
blockBuffer.arrayOffset() + blockBuffer.position()
+ KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
}
private void setNonSeekedState() {
block = null;
blockBuffer = null;
currKeyLen = 0;
currValueLen = 0;
}
/**
* Go to the next key/value in the block section. Loads the next block if
* necessary. If successful, {@link #getKey()} and {@link #getValue()} can
* be called.
*
* @return true if successfully navigated to the next key/value
*/
@Override
public boolean next() throws IOException {
assertSeeked();
try {
blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
+ currKeyLen + currValueLen);
} catch (IllegalArgumentException e) {
LOG.error("Current pos = " + blockBuffer.position()
+ "; currKeyLen = " + currKeyLen + "; currValLen = "
+ currValueLen + "; block limit = " + blockBuffer.limit()
+ "; HFile name = " + reader.getName()
+ "; currBlock currBlockOffset = " + block.getOffset());
throw e;
}
if (blockBuffer.remaining() <= 0) {
long lastDataBlockOffset =
reader.getTrailer().getLastDataBlockOffset();
if (block.getOffset() >= lastDataBlockOffset) {
setNonSeekedState();
return false;
}
// read the next block
HFileBlock nextBlock = readNextDataBlock();
if (nextBlock == null) {
setNonSeekedState();
return false;
}
updateCurrBlock(nextBlock);
return true;
}
// We are still in the same block.
readKeyValueLen();
return true;
}
/**
* Scans blocks in the "scanned" section of the {@link HFile} until the next
* data block is found.
*
* @return the next block, or null if there are no more data blocks
* @throws IOException
*/
private HFileBlock readNextDataBlock() throws IOException {
long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
if (block == null)
return null;
HFileBlock curBlock = block;
do {
if (curBlock.getOffset() >= lastDataBlockOffset)
return null;
if (curBlock.getOffset() < 0) {
throw new IOException("Invalid block file offset: " + block);
}
curBlock = reader.readBlock(curBlock.getOffset()
+ curBlock.getOnDiskSizeWithHeader(),
curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
isCompaction);
} while (!curBlock.getBlockType().equals(BlockType.DATA));
return curBlock;
}
/**
* Positions this scanner at the start of the file.
*
* @return false if empty file; i.e. a call to next would return false and
* the current key and value are undefined.
* @throws IOException
*/
@Override
public boolean seekTo() throws IOException {
if (reader == null) {
return false;
}
if (reader.getTrailer().getEntryCount() == 0) {
// No data blocks.
return false;
}
long firstDataBlockOffset =
reader.getTrailer().getFirstDataBlockOffset();
if (block != null && block.getOffset() == firstDataBlockOffset) {
blockBuffer.rewind();
readKeyValueLen();
return true;
}
block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
isCompaction);
if (block.getOffset() < 0) {
throw new IOException("Invalid block offset: " + block.getOffset());
}
updateCurrBlock(block);
return true;
}
@Override
public int seekTo(byte[] key) throws IOException {
return seekTo(key, 0, key.length);
}
/**
* An internal API function. Seek to the given key, optionally rewinding to
* the first key of the block before doing the seek.
*
* @param key key byte array
* @param offset key offset in the key byte array
* @param length key length
* @param rewind whether to rewind to the first key of the block before
* doing the seek. If this is false, we are assuming we never go
* back, otherwise the result is undefined.
* @return -1 if the key is earlier than the first key of the file,
* 0 if we are at the given key, and 1 if we are past the given key
* @throws IOException
*/
private int seekTo(byte[] key, int offset, int length, boolean rewind)
throws IOException {
HFileBlock seekToBlock =
((HFileReaderV2) reader).getDataBlockIndexReader().seekToDataBlock(
key, offset, length, block);
if (seekToBlock == null) {
// This happens if the key e.g. falls before the beginning of the file.
return -1;
}
return loadBlockAndSeekToKey(seekToBlock, rewind, key, offset, length,
false);
}
@Override
public int seekTo(byte[] key, int offset, int length) throws IOException {
// Always rewind to the first key of the block, because the given key
// might be before or after the current key.
return seekTo(key, offset, length, true);
}
@Override
public int reseekTo(byte[] key) throws IOException {
return reseekTo(key, 0, key.length);
}
@Override
public int reseekTo(byte[] key, int offset, int length) throws IOException {
if (isSeeked()) {
ByteBuffer bb = getKey();
int compared = reader.getComparator().compare(key, offset,
length, bb.array(), bb.arrayOffset(), bb.limit());
if (compared < 1) {
// If the required key is less than or equal to current key, then
// don't do anything.
return compared;
}
}
// Don't rewind on a reseek operation, because reseek implies that we are
// always going forward in the file.
return seekTo(key, offset, length, false);
}
private int loadBlockAndSeekToKey(HFileBlock seekToBlock, boolean rewind,
byte[] key, int offset, int length, boolean seekBefore)
throws IOException {
if (block == null || block.getOffset() != seekToBlock.getOffset()) {
updateCurrBlock(seekToBlock);
} else if (rewind) {
blockBuffer.rewind();
}
return blockSeek(key, offset, length, seekBefore);
}
/**
* Updates the current block to be the given {@link HFileBlock}. Seeks to
* the first key/value pair.
*
* @param newBlock the block to make current
*/
private void updateCurrBlock(HFileBlock newBlock) {
block = newBlock;
blockBuffer = block.getBufferWithoutHeader();
readKeyValueLen();
blockFetches++;
}
private final void readKeyValueLen() {
blockBuffer.mark();
currKeyLen = blockBuffer.getInt();
currValueLen = blockBuffer.getInt();
blockBuffer.reset();
if (currKeyLen < 0 || currValueLen < 0
|| currKeyLen > blockBuffer.limit()
|| currValueLen > blockBuffer.limit()) {
throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
+ " or currValueLen " + currValueLen + ". Block offset: "
+ block.getOffset() + ", block length: " + blockBuffer.limit()
+ ", position: " + blockBuffer.position() + " (without header).");
}
}
/**
* Within a loaded block, seek looking for the first key that is smaller
* than (or equal to?) the key we are interested in.
*
* A note on seekBefore: if seekBefore = true AND the first key in the block
* equals the given key, an exception will be thrown. The caller has to check
* for that case and load the previous block as appropriate.
*
* @param key the key to find
* @param seekBefore find the key before the given key in case of exact
* match.
* @return 0 in case of an exact key match, 1 in case of an inexact match
*/
private int blockSeek(byte[] key, int offset, int length,
boolean seekBefore) {
int klen, vlen;
int lastKeyValueSize = -1;
do {
blockBuffer.mark();
klen = blockBuffer.getInt();
vlen = blockBuffer.getInt();
blockBuffer.reset();
int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
+ KEY_VALUE_LEN_SIZE;
int comp = reader.getComparator().compare(key, offset, length,
blockBuffer.array(), keyOffset, klen);
if (comp == 0) {
if (seekBefore) {
if (lastKeyValueSize < 0) {
throw new IllegalStateException("blockSeek with seekBefore "
+ "at the first key of the block: key="
+ Bytes.toStringBinary(key) + ", blockOffset="
+ block.getOffset() + ", onDiskSize="
+ block.getOnDiskSizeWithHeader());
}
blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
readKeyValueLen();
return 1; // non exact match.
}
currKeyLen = klen;
currValueLen = vlen;
return 0; // indicate exact match
}
if (comp < 0) {
if (lastKeyValueSize > 0)
blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
readKeyValueLen();
return 1;
}
// The size of this key/value tuple, including key/value length fields.
lastKeyValueSize = klen + vlen + KEY_VALUE_LEN_SIZE;
blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
} while (blockBuffer.remaining() > 0);
// Seek to the last key we successfully read. This will happen if this is
// the last key/value pair in the file, in which case the following call
// to next() has to return false.
blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
readKeyValueLen();
return 1; // didn't exactly find it.
}
@Override
public boolean seekBefore(byte[] key) throws IOException {
return seekBefore(key, 0, key.length);
}
private ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
ByteBuffer buffer = curBlock.getBufferWithoutHeader();
// It is safe to manipulate this buffer because we own the buffer object.
buffer.rewind();
int klen = buffer.getInt();
buffer.getInt();
ByteBuffer keyBuff = buffer.slice();
keyBuff.limit(klen);
keyBuff.rewind();
return keyBuff;
}
@Override
public boolean seekBefore(byte[] key, int offset, int length)
throws IOException {
HFileReaderV2 reader2 = (HFileReaderV2) reader;
HFileBlock seekToBlock =
reader2.getDataBlockIndexReader().seekToDataBlock(
key, offset, length, block);
if (seekToBlock == null) {
return false;
}
ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
if (reader.getComparator().compare(firstKey.array(),
firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0)
{
long previousBlockOffset = seekToBlock.getPrevBlockOffset();
// The key we are interested in is the first key of this block, so go back
// one block.
if (previousBlockOffset == -1) {
// we have a 'problem', the key we want is the first of the file.
return false;
}
// It is important that we compute and pass onDiskSize to the block
// reader so that it does not have to read the header separately to
// figure out the size.
seekToBlock = reader2.fsBlockReader.readBlockData(previousBlockOffset,
seekToBlock.getOffset() - previousBlockOffset, -1, pread);
// TODO shortcut: seek forward in this block to the last key of the
// block.
}
loadBlockAndSeekToKey(seekToBlock, true, key, offset, length, true);
return true;
}
@Override
public String getKeyString() {
return Bytes.toStringBinary(blockBuffer.array(),
blockBuffer.arrayOffset() + blockBuffer.position()
+ KEY_VALUE_LEN_SIZE, currKeyLen);
}
@Override
public String getValueString() {
return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
+ blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
currValueLen);
}
}
/**
* Returns a buffer with the Bloom filter metadata. The caller takes
* ownership of the buffer.
*/
@Override
public DataInput getBloomFilterMetadata() throws IOException {
for (HFileBlock b : loadOnOpenBlocks)
if (b.getBlockType() == BlockType.BLOOM_META)
return b.getByteStream();
return null;
}
@Override
public boolean isFileInfoLoaded() {
return true; // We load file info in constructor in version 2.
}
}


@@ -0,0 +1,483 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyComparator;
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.Compressor;
/**
 * Writes version 1 HFiles. Mainly used for testing backwards-compatibility.
*/
public class HFileWriterV1 extends AbstractHFileWriter {
/** Meta data block name for bloom filter parameters. */
static final String BLOOM_FILTER_META_KEY = "BLOOM_FILTER_META";
/** Meta data block name for bloom filter bits. */
public static final String BLOOM_FILTER_DATA_KEY = "BLOOM_FILTER_DATA";
private static final Log LOG = LogFactory.getLog(HFileWriterV1.class);
// A stream made per block written.
private DataOutputStream out;
// Offset where the current block began.
private long blockBegin;
// First keys of every block.
private ArrayList<byte[]> blockKeys = new ArrayList<byte[]>();
// Block offset in backing stream.
private ArrayList<Long> blockOffsets = new ArrayList<Long>();
// Raw (decompressed) data size.
private ArrayList<Integer> blockDataSizes = new ArrayList<Integer>();
private Compressor compressor;
// Additional byte array output stream used to fill block cache
private ByteArrayOutputStream baos;
private DataOutputStream baosDos;
private int blockNumber = 0;
static class WriterFactoryV1 extends HFile.WriterFactory {
WriterFactoryV1(Configuration conf) { super(conf); }
@Override
public Writer createWriter(FileSystem fs, Path path) throws IOException {
return new HFileWriterV1(conf, fs, path);
}
@Override
public Writer createWriter(FileSystem fs, Path path, int blockSize,
Compression.Algorithm compressAlgo, final KeyComparator comparator)
throws IOException {
return new HFileWriterV1(conf, fs, path, blockSize,
compressAlgo, comparator);
}
@Override
public Writer createWriter(FileSystem fs, Path path, int blockSize,
String compressAlgoName,
final KeyComparator comparator) throws IOException {
return new HFileWriterV1(conf, fs, path, blockSize,
compressAlgoName, comparator);
}
@Override
public Writer createWriter(final FSDataOutputStream ostream,
final int blockSize, final String compress,
final KeyComparator comparator) throws IOException {
return new HFileWriterV1(conf, ostream, blockSize, compress, comparator);
}
@Override
public Writer createWriter(final FSDataOutputStream ostream,
final int blockSize, final Compression.Algorithm compress,
final KeyComparator c) throws IOException {
return new HFileWriterV1(conf, ostream, blockSize, compress, c);
}
}
/** Constructor that uses all defaults for compression and block size. */
public HFileWriterV1(Configuration conf, FileSystem fs, Path path)
throws IOException {
this(conf, fs, path, HFile.DEFAULT_BLOCKSIZE,
HFile.DEFAULT_COMPRESSION_ALGORITHM,
null);
}
/**
* Constructor that takes a path, creates and closes the output stream. Takes
* compression algorithm name as string.
*/
public HFileWriterV1(Configuration conf, FileSystem fs, Path path,
int blockSize, String compressAlgoName,
final KeyComparator comparator) throws IOException {
this(conf, fs, path, blockSize,
compressionByName(compressAlgoName), comparator);
}
/** Constructor that takes a path, creates and closes the output stream. */
public HFileWriterV1(Configuration conf, FileSystem fs, Path path,
int blockSize, Compression.Algorithm compress,
final KeyComparator comparator) throws IOException {
super(conf, createOutputStream(conf, fs, path), path,
blockSize, compress, comparator);
}
/** Constructor that takes a stream. */
public HFileWriterV1(Configuration conf,
final FSDataOutputStream outputStream, final int blockSize,
final String compressAlgoName, final KeyComparator comparator)
throws IOException {
this(conf, outputStream, blockSize,
Compression.getCompressionAlgorithmByName(compressAlgoName),
comparator);
}
/** Constructor that takes a stream. */
public HFileWriterV1(Configuration conf,
final FSDataOutputStream outputStream, final int blockSize,
final Compression.Algorithm compress, final KeyComparator comparator)
throws IOException {
super(conf, outputStream, null, blockSize, compress, comparator);
}
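  /*
   * A minimal usage sketch (assumes an existing Configuration, FileSystem and
   * Path; key/value contents are arbitrary examples):
   *
   *   HFile.Writer writer = new HFileWriterV1(conf, fs, path);
   *   writer.append(Bytes.toBytes("row1"), Bytes.toBytes("value1"));
   *   writer.append(Bytes.toBytes("row2"), Bytes.toBytes("value2"));
   *   writer.close();
   *
   * Keys must be appended in the order defined by the comparator; close()
   * writes the meta blocks, file info, block index and trailer.
   */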
/**
   * If at a block boundary, finishes the current block and opens a new one.
*
* @throws IOException
*/
private void checkBlockBoundary() throws IOException {
if (this.out != null && this.out.size() < blockSize)
return;
finishBlock();
newBlock();
}
/**
   * Finishes up the current block, if one is open.
*
* @throws IOException
*/
private void finishBlock() throws IOException {
if (this.out == null)
return;
long startTimeNs = System.nanoTime();
int size = releaseCompressingStream(this.out);
this.out = null;
blockKeys.add(firstKeyInBlock);
blockOffsets.add(Long.valueOf(blockBegin));
blockDataSizes.add(Integer.valueOf(size));
this.totalUncompressedBytes += size;
HFile.writeTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.writeOps.incrementAndGet();
if (cacheDataBlocksOnWrite) {
baosDos.flush();
byte[] bytes = baos.toByteArray();
blockCache.cacheBlock(HFile.getBlockCacheKey(name, blockBegin),
new HFileBlock(BlockType.DATA,
(int) (outputStream.getPos() - blockBegin), bytes.length, -1,
ByteBuffer.wrap(bytes, 0, bytes.length), true, blockBegin));
baosDos.close();
}
blockNumber++;
}
/**
* Ready a new block for writing.
*
* @throws IOException
*/
private void newBlock() throws IOException {
// This is where the next block begins.
blockBegin = outputStream.getPos();
this.out = getCompressingStream();
BlockType.DATA.write(out);
firstKeyInBlock = null;
if (cacheDataBlocksOnWrite) {
this.baos = new ByteArrayOutputStream();
this.baosDos = new DataOutputStream(baos);
baosDos.write(HFileBlock.DUMMY_HEADER);
}
}
/**
   * Sets up a compressor and creates a compression stream on top of
   * this.outputStream. Obtain a new one for each block written.
   *
   * @return A compressing stream; if 'none' compression, the returned stream
   *         does not compress.
   *
   * @throws IOException
   *
   * @see #releaseCompressingStream(DataOutputStream)
*/
private DataOutputStream getCompressingStream() throws IOException {
this.compressor = compressAlgo.getCompressor();
    // Get a new DOS compression stream. In tfile, the DOS is not closed, just
    // finished, and that seems to be fine over there. TODO: check that the
    // DOS does not retain memory. Should we disable 'flush' on the DOS the
    // way BCFile over in tfile does? BCFile arranges for flushes not to go
    // through to the underlying compressed stream, so that the compressed
    // downstream is only flushed when done. However, it looks like when we
    // call flush here it is a legitimate flush that should go through to the
    // compressor.
OutputStream os = this.compressAlgo.createCompressionStream(
this.outputStream, this.compressor, 0);
return new DataOutputStream(os);
}
/**
   * Releases the block compressor and the compressing stream obtained from
   * {@link #getCompressingStream()}.
   *
   * @param dos the compressing stream to release
   *
   * @return the number of bytes written to this stream since it was obtained
*
* @see #getCompressingStream()
*
* @throws IOException
*/
private int releaseCompressingStream(final DataOutputStream dos)
throws IOException {
dos.flush();
this.compressAlgo.returnCompressor(this.compressor);
this.compressor = null;
return dos.size();
}
/**
   * Add a meta block to the end of the file. Call before close(). Metadata
   * blocks are expensive: fill one with a batch of serialized data rather
   * than writing a separate meta block per metadata instance. If the metadata
   * is small, consider adding it to the file info instead, using
   * {@link #appendFileInfo(byte[], byte[])}.
*
* @param metaBlockName
* name of the block
* @param content
* will call readFields to get data later (DO NOT REUSE)
*/
public void appendMetaBlock(String metaBlockName, Writable content) {
byte[] key = Bytes.toBytes(metaBlockName);
int i;
for (i = 0; i < metaNames.size(); ++i) {
// stop when the current key is greater than our own
byte[] cur = metaNames.get(i);
if (Bytes.BYTES_RAWCOMPARATOR.compare(cur, 0, cur.length, key, 0,
key.length) > 0) {
break;
}
}
metaNames.add(i, key);
metaData.add(i, content);
}
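  /*
   * Example (a sketch; "statsWritable" stands for any application-level
   * Writable):
   *
   *   writer.appendMetaBlock("MY_STATS", statsWritable);
   *
   * Meta block names are kept sorted with Bytes.BYTES_RAWCOMPARATOR so that
   * the meta block index written at close() is in key order.
   */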
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param kv
* KeyValue to add. Cannot be empty nor null.
* @throws IOException
*/
public void append(final KeyValue kv) throws IOException {
append(kv.getBuffer(), kv.getKeyOffset(), kv.getKeyLength(),
kv.getBuffer(), kv.getValueOffset(), kv.getValueLength());
}
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param key
* Key to add. Cannot be empty nor null.
* @param value
* Value to add. Cannot be empty nor null.
* @throws IOException
*/
public void append(final byte[] key, final byte[] value) throws IOException {
append(key, 0, key.length, value, 0, value.length);
}
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param key
* @param koffset
* @param klength
* @param value
* @param voffset
* @param vlength
* @throws IOException
*/
private void append(final byte[] key, final int koffset, final int klength,
final byte[] value, final int voffset, final int vlength)
throws IOException {
boolean dupKey = checkKey(key, koffset, klength);
checkValue(value, voffset, vlength);
if (!dupKey) {
checkBlockBoundary();
}
// Write length of key and value and then actual key and value bytes.
this.out.writeInt(klength);
totalKeyLength += klength;
this.out.writeInt(vlength);
totalValueLength += vlength;
this.out.write(key, koffset, klength);
this.out.write(value, voffset, vlength);
// Are we the first key in this block?
if (this.firstKeyInBlock == null) {
// Copy the key.
this.firstKeyInBlock = new byte[klength];
System.arraycopy(key, koffset, this.firstKeyInBlock, 0, klength);
}
this.lastKeyBuffer = key;
this.lastKeyOffset = koffset;
this.lastKeyLength = klength;
this.entryCount++;
// If we are pre-caching blocks on write, fill byte array stream
if (cacheDataBlocksOnWrite) {
this.baosDos.writeInt(klength);
this.baosDos.writeInt(vlength);
this.baosDos.write(key, koffset, klength);
this.baosDos.write(value, voffset, vlength);
}
}
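  /*
   * For reference, the record layout written above for each cell is:
   *
   *   int    key length
   *   int    value length
   *   byte[] key
   *   byte[] value
   */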
public void close() throws IOException {
if (this.outputStream == null) {
return;
}
    // Write out the end of the data blocks, then write the meta data blocks,
    // followed by the file info, data block index and meta block index.
finishBlock();
FixedFileTrailer trailer = new FixedFileTrailer(1);
// Write out the metadata blocks if any.
ArrayList<Long> metaOffsets = null;
ArrayList<Integer> metaDataSizes = null;
if (metaNames.size() > 0) {
metaOffsets = new ArrayList<Long>(metaNames.size());
metaDataSizes = new ArrayList<Integer>(metaNames.size());
for (int i = 0; i < metaNames.size(); ++i) {
// store the beginning offset
long curPos = outputStream.getPos();
metaOffsets.add(curPos);
// write the metadata content
DataOutputStream dos = getCompressingStream();
BlockType.META.write(dos);
metaData.get(i).write(dos);
int size = releaseCompressingStream(dos);
// store the metadata size
metaDataSizes.add(size);
}
}
writeFileInfo(trailer, outputStream);
// Write the data block index.
trailer.setLoadOnOpenOffset(writeBlockIndex(this.outputStream,
this.blockKeys, this.blockOffsets, this.blockDataSizes));
LOG.info("Wrote a version 1 block index with " + this.blockKeys.size()
+ " keys");
if (metaNames.size() > 0) {
// Write the meta index.
writeBlockIndex(this.outputStream, metaNames, metaOffsets, metaDataSizes);
}
// Now finish off the trailer.
trailer.setDataIndexCount(blockKeys.size());
finishClose(trailer);
}
@Override
protected void finishFileInfo() throws IOException {
super.finishFileInfo();
// In version 1, we store comparator name in the file info.
fileInfo.append(FileInfo.COMPARATOR,
Bytes.toBytes(comparator.getClass().getName()), false);
}
@Override
public void addInlineBlockWriter(InlineBlockWriter bloomWriter) {
// Inline blocks only exist in HFile format version 2.
throw new UnsupportedOperationException();
}
/**
* Version 1 Bloom filters are stored in two meta blocks with two different
* keys.
*/
@Override
public void addBloomFilter(BloomFilterWriter bfw) {
appendMetaBlock(BLOOM_FILTER_META_KEY,
bfw.getMetaWriter());
Writable dataWriter = bfw.getDataWriter();
if (dataWriter != null) {
appendMetaBlock(BLOOM_FILTER_DATA_KEY, dataWriter);
}
}
/**
   * Writes out the block index in the legacy version 1 format, which can
   * still be read by
   * {@link HFileBlockIndex.BlockIndexReader#readRootIndex(java.io.DataInputStream,
   * int)}.
   *
   * @param out the stream to write to
   * @param keys the keys identifying the indexed blocks (first keys for the
   *          data index, block names for the meta index)
   * @param offsets offsets of the indexed blocks in the output stream
   * @param uncompressedSizes in contrast with the version 2 root index format,
   *          the sizes stored in version 1 are uncompressed sizes
   * @return the offset in the output stream at which the index starts
* @throws IOException
*/
private static long writeBlockIndex(final FSDataOutputStream out,
final List<byte[]> keys, final List<Long> offsets,
final List<Integer> uncompressedSizes) throws IOException {
long pos = out.getPos();
// Don't write an index if nothing in the index.
if (keys.size() > 0) {
BlockType.INDEX_V1.write(out);
// Write the index.
for (int i = 0; i < keys.size(); ++i) {
out.writeLong(offsets.get(i).longValue());
out.writeInt(uncompressedSizes.get(i).intValue());
byte[] key = keys.get(i);
Bytes.writeByteArray(out, key);
}
}
return pos;
}
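  /*
   * For reference, the on-disk layout written above is the INDEX_V1 block
   * magic followed, for each indexed block, by:
   *
   *   long   offset of the block in the file
   *   int    uncompressed size of the block
   *   vInt   key length, then the key bytes (as written by
   *          Bytes.writeByteArray)
   *
   * The entry count itself is stored in the fixed file trailer, not in the
   * index block.
   */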
}


@@ -0,0 +1,452 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyComparator;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.io.hfile.HFileBlock.BlockWritable;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;
/**
* Writes HFile format version 2.
*/
public class HFileWriterV2 extends AbstractHFileWriter {
/** Inline block writers for multi-level block index and compound Blooms. */
private List<InlineBlockWriter> inlineBlockWriters =
new ArrayList<InlineBlockWriter>();
/** Unified version 2 block writer */
private HFileBlock.Writer fsBlockWriter;
private HFileBlockIndex.BlockIndexWriter dataBlockIndexWriter;
private HFileBlockIndex.BlockIndexWriter metaBlockIndexWriter;
/** The offset of the first data block or -1 if the file is empty. */
private long firstDataBlockOffset = -1;
/** The offset of the last data block or 0 if the file is empty. */
private long lastDataBlockOffset;
/** Additional data items to be written to the "load-on-open" section. */
private List<BlockWritable> additionalLoadOnOpenData =
new ArrayList<BlockWritable>();
static class WriterFactoryV2 extends HFile.WriterFactory {
WriterFactoryV2(Configuration conf) { super(conf); }
@Override
public Writer createWriter(FileSystem fs, Path path)
throws IOException {
return new HFileWriterV2(conf, fs, path);
}
@Override
public Writer createWriter(FileSystem fs, Path path, int blockSize,
Compression.Algorithm compress,
final KeyComparator comparator) throws IOException {
return new HFileWriterV2(conf, fs, path, blockSize,
compress, comparator);
}
@Override
public Writer createWriter(FileSystem fs, Path path, int blockSize,
String compress, final KeyComparator comparator)
throws IOException {
return new HFileWriterV2(conf, fs, path, blockSize,
compress, comparator);
}
@Override
public Writer createWriter(final FSDataOutputStream ostream,
final int blockSize, final String compress,
final KeyComparator comparator) throws IOException {
return new HFileWriterV2(conf, ostream, blockSize, compress, comparator);
}
@Override
public Writer createWriter(final FSDataOutputStream ostream,
final int blockSize, final Compression.Algorithm compress,
final KeyComparator c) throws IOException {
return new HFileWriterV2(conf, ostream, blockSize, compress, c);
}
}
/** Constructor that uses all defaults for compression and block size. */
public HFileWriterV2(Configuration conf, FileSystem fs, Path path)
throws IOException {
this(conf, fs, path, HFile.DEFAULT_BLOCKSIZE,
HFile.DEFAULT_COMPRESSION_ALGORITHM, null);
}
/**
* Constructor that takes a path, creates and closes the output stream. Takes
* compression algorithm name as string.
*/
public HFileWriterV2(Configuration conf, FileSystem fs, Path path,
int blockSize, String compressAlgoName,
final KeyComparator comparator) throws IOException {
this(conf, fs, path, blockSize,
compressionByName(compressAlgoName), comparator);
}
/** Constructor that takes a path, creates and closes the output stream. */
public HFileWriterV2(Configuration conf, FileSystem fs, Path path,
int blockSize, Compression.Algorithm compressAlgo,
final KeyComparator comparator) throws IOException {
super(conf, createOutputStream(conf, fs, path), path,
blockSize, compressAlgo, comparator);
finishInit(conf);
}
/** Constructor that takes a stream. */
public HFileWriterV2(final Configuration conf,
final FSDataOutputStream outputStream, final int blockSize,
final String compressAlgoName, final KeyComparator comparator)
throws IOException {
this(conf, outputStream, blockSize,
Compression.getCompressionAlgorithmByName(compressAlgoName),
comparator);
}
/** Constructor that takes a stream. */
public HFileWriterV2(final Configuration conf,
final FSDataOutputStream outputStream, final int blockSize,
final Compression.Algorithm compress, final KeyComparator comparator)
throws IOException {
super(conf, outputStream, null, blockSize, compress, comparator);
finishInit(conf);
}
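  /*
   * A minimal usage sketch (assumes an existing Configuration, FileSystem and
   * Path; "gz" stands for any supported compression algorithm name and kv for
   * a KeyValue to be stored):
   *
   *   HFile.Writer writer = new HFileWriterV2(conf, fs, path, 64 * 1024,
   *       "gz", KeyValue.KEY_COMPARATOR);
   *   writer.append(kv);   // KeyValues must arrive in comparator order
   *   writer.close();      // writes the indexes, file info and trailer
   */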
/** Additional initialization steps */
private void finishInit(final Configuration conf) {
if (fsBlockWriter != null)
throw new IllegalStateException("finishInit called twice");
// HFile filesystem-level (non-caching) block writer
fsBlockWriter = new HFileBlock.Writer(compressAlgo);
// Data block index writer
dataBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter(fsBlockWriter,
cacheIndexBlocksOnWrite ? blockCache : null,
cacheIndexBlocksOnWrite ? name : null);
dataBlockIndexWriter.setMaxChunkSize(
HFileBlockIndex.getMaxChunkSize(conf));
inlineBlockWriters.add(dataBlockIndexWriter);
// Meta data block index writer
metaBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter();
}
/**
   * At a block boundary, finishes the current block, writes any pending
   * inline blocks, and opens a new data block.
*
* @throws IOException
*/
private void checkBlockBoundary() throws IOException {
if (fsBlockWriter.blockSizeWritten() < blockSize)
return;
finishBlock();
writeInlineBlocks(false);
newBlock();
}
/** Clean up the current block */
private void finishBlock() throws IOException {
if (!fsBlockWriter.isWriting() || fsBlockWriter.blockSizeWritten() == 0)
return;
long startTimeNs = System.nanoTime();
// Update the first data block offset for scanning.
if (firstDataBlockOffset == -1)
firstDataBlockOffset = outputStream.getPos();
// Update the last data block offset
lastDataBlockOffset = outputStream.getPos();
fsBlockWriter.writeHeaderAndData(outputStream);
int onDiskSize = fsBlockWriter.getOnDiskSizeWithHeader();
dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset,
onDiskSize);
totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
HFile.writeTimeNano.addAndGet(System.nanoTime() - startTimeNs);
HFile.writeOps.incrementAndGet();
if (cacheDataBlocksOnWrite) {
blockCache.cacheBlock(HFile.getBlockCacheKey(name, lastDataBlockOffset),
fsBlockWriter.getBlockForCaching());
}
}
/** Gives inline block writers an opportunity to contribute blocks. */
private void writeInlineBlocks(boolean closing) throws IOException {
for (InlineBlockWriter ibw : inlineBlockWriters) {
while (ibw.shouldWriteBlock(closing)) {
long offset = outputStream.getPos();
boolean cacheThisBlock = ibw.cacheOnWrite();
ibw.writeInlineBlock(fsBlockWriter.startWriting(
ibw.getInlineBlockType(), cacheThisBlock));
fsBlockWriter.writeHeaderAndData(outputStream);
ibw.blockWritten(offset, fsBlockWriter.getOnDiskSizeWithHeader(),
fsBlockWriter.getUncompressedSizeWithoutHeader());
if (cacheThisBlock) {
// Cache this block on write.
blockCache.cacheBlock(HFile.getBlockCacheKey(name, offset),
fsBlockWriter.getBlockForCaching());
}
}
}
}
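  /*
   * For reference: this is called at every data block boundary and once more
   * with closing == true from close(). The data block index writer registered
   * in finishInit() is one such inline writer; it presumably asks to flush an
   * intermediate index chunk once its accumulated entries exceed the max
   * chunk size configured via setMaxChunkSize above. Compound Bloom filter
   * writers added through addInlineBlockWriter(InlineBlockWriter) are handled
   * the same way.
   */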
/**
* Ready a new block for writing.
*
* @throws IOException
*/
private void newBlock() throws IOException {
// This is where the next block begins.
fsBlockWriter.startWriting(BlockType.DATA, cacheDataBlocksOnWrite);
firstKeyInBlock = null;
}
/**
   * Add a meta block to the end of the file. Call before close(). Metadata
   * blocks are expensive: fill one with a batch of serialized data rather
   * than writing a separate meta block per metadata instance. If the metadata
   * is small, consider adding it to the file info instead, using
   * {@link #appendFileInfo(byte[], byte[])}.
*
* @param metaBlockName
* name of the block
* @param content
* will call readFields to get data later (DO NOT REUSE)
*/
@Override
public void appendMetaBlock(String metaBlockName, Writable content) {
byte[] key = Bytes.toBytes(metaBlockName);
int i;
for (i = 0; i < metaNames.size(); ++i) {
// stop when the current key is greater than our own
byte[] cur = metaNames.get(i);
if (Bytes.BYTES_RAWCOMPARATOR.compare(cur, 0, cur.length, key, 0,
key.length) > 0) {
break;
}
}
metaNames.add(i, key);
metaData.add(i, content);
}
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param kv
* KeyValue to add. Cannot be empty nor null.
* @throws IOException
*/
@Override
public void append(final KeyValue kv) throws IOException {
append(kv.getBuffer(), kv.getKeyOffset(), kv.getKeyLength(),
kv.getBuffer(), kv.getValueOffset(), kv.getValueLength());
}
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param key
* Key to add. Cannot be empty nor null.
* @param value
* Value to add. Cannot be empty nor null.
* @throws IOException
*/
@Override
public void append(final byte[] key, final byte[] value) throws IOException {
append(key, 0, key.length, value, 0, value.length);
}
/**
* Add key/value to file. Keys must be added in an order that agrees with the
* Comparator passed on construction.
*
* @param key
* @param koffset
* @param klength
* @param value
* @param voffset
* @param vlength
* @throws IOException
*/
private void append(final byte[] key, final int koffset, final int klength,
final byte[] value, final int voffset, final int vlength)
throws IOException {
boolean dupKey = checkKey(key, koffset, klength);
checkValue(value, voffset, vlength);
if (!dupKey) {
checkBlockBoundary();
}
if (!fsBlockWriter.isWriting())
newBlock();
// Write length of key and value and then actual key and value bytes.
{
DataOutputStream out = fsBlockWriter.getUserDataStream();
out.writeInt(klength);
totalKeyLength += klength;
out.writeInt(vlength);
totalValueLength += vlength;
out.write(key, koffset, klength);
out.write(value, voffset, vlength);
}
// Are we the first key in this block?
if (firstKeyInBlock == null) {
// Copy the key.
firstKeyInBlock = new byte[klength];
System.arraycopy(key, koffset, firstKeyInBlock, 0, klength);
}
lastKeyBuffer = key;
lastKeyOffset = koffset;
lastKeyLength = klength;
entryCount++;
}
@Override
public void close() throws IOException {
if (outputStream == null) {
return;
}
    // Write out the end of the data blocks, then write the meta data blocks,
    // followed by the file info, data block index and meta block index.
finishBlock();
writeInlineBlocks(true);
FixedFileTrailer trailer = new FixedFileTrailer(2);
// Write out the metadata blocks if any.
if (!metaNames.isEmpty()) {
for (int i = 0; i < metaNames.size(); ++i) {
// store the beginning offset
long offset = outputStream.getPos();
// write the metadata content
DataOutputStream dos = fsBlockWriter.startWriting(BlockType.META,
cacheDataBlocksOnWrite);
metaData.get(i).write(dos);
fsBlockWriter.writeHeaderAndData(outputStream);
// Add the new meta block to the meta index.
metaBlockIndexWriter.addEntry(metaNames.get(i), offset,
fsBlockWriter.getOnDiskSizeWithHeader());
}
}
// Load-on-open section.
// Data block index.
//
// In version 2, this section of the file starts with the root level data
// block index. We call a function that writes intermediate-level blocks
// first, then root level, and returns the offset of the root level block
// index.
long rootIndexOffset = dataBlockIndexWriter.writeIndexBlocks(outputStream);
trailer.setLoadOnOpenOffset(rootIndexOffset);
// Meta block index.
metaBlockIndexWriter.writeSingleLevelIndex(fsBlockWriter.startWriting(
BlockType.ROOT_INDEX, false), "meta");
fsBlockWriter.writeHeaderAndData(outputStream);
// File info
writeFileInfo(trailer, fsBlockWriter.startWriting(BlockType.FILE_INFO,
false));
fsBlockWriter.writeHeaderAndData(outputStream);
// Load-on-open data supplied by higher levels, e.g. Bloom filters.
for (BlockWritable w : additionalLoadOnOpenData)
fsBlockWriter.writeBlock(w, outputStream);
// Now finish off the trailer.
trailer.setNumDataIndexLevels(dataBlockIndexWriter.getNumLevels());
trailer.setUncompressedDataIndexSize(
dataBlockIndexWriter.getTotalUncompressedSize());
trailer.setFirstDataBlockOffset(firstDataBlockOffset);
trailer.setLastDataBlockOffset(lastDataBlockOffset);
trailer.setComparatorClass(comparator.getClass());
trailer.setDataIndexCount(dataBlockIndexWriter.getNumRootEntries());
finishClose(trailer);
fsBlockWriter.releaseCompressor();
}
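  /*
   * For reference, the layout produced by close() above, after the last data
   * block, is:
   *
   *   1. Any remaining inline blocks (writeInlineBlocks(true))
   *   2. Meta blocks, if any
   *   3. Intermediate-level index blocks followed by the root-level data
   *      block index (writeIndexBlocks)
   *   4. Meta block index (a single-level ROOT_INDEX block)
   *   5. File info block
   *   6. Additional load-on-open blocks, e.g. Bloom filter metadata
   *   7. Fixed file trailer (finishClose(trailer))
   */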
@Override
public void addInlineBlockWriter(InlineBlockWriter ibw) {
inlineBlockWriters.add(ibw);
if (blockCache == null && ibw.cacheOnWrite())
initBlockCache();
}
@Override
public void addBloomFilter(final BloomFilterWriter bfw) {
if (bfw.getKeyCount() <= 0)
return;
additionalLoadOnOpenData.add(new BlockWritable() {
@Override
public BlockType getBlockType() {
return BlockType.BLOOM_META;
}
@Override
public void writeToBlock(DataOutput out) throws IOException {
bfw.getMetaWriter().write(out);
Writable dataWriter = bfw.getDataWriter();
if (dataWriter != null)
dataWriter.write(out);
}
});
}
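  /*
   * Note the contrast with version 1: HFileWriterV1 stores the Bloom filter
   * in two meta blocks (BLOOM_FILTER_META_KEY and BLOOM_FILTER_DATA_KEY),
   * whereas here the metadata and data are serialized into a single
   * BLOOM_META block in the load-on-open section, which the version 2 reader
   * returns from getBloomFilterMetadata().
   */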
}


@@ -0,0 +1,61 @@
/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.io.Writable;
/**
* Specifies methods needed to add elements to a Bloom filter and serialize the
* resulting Bloom filter as a sequence of bytes.
*/
public interface BloomFilterWriter extends BloomFilterBase {
  /** Allocate memory for the Bloom filter data. */
void allocBloom();
/** Compact the Bloom filter before writing metadata & data to disk. */
void compactBloom();
  /**
   * Get a writable interface into the Bloom filter metadata.
   *
   * @return a writable instance that can later be written to a stream
   */
  Writable getMetaWriter();
  /**
   * Get a writable interface into the Bloom filter data (the actual Bloom
   * bits). Not used for compound Bloom filters.
   *
   * @return a writable instance that can later be written to a stream
   */
Writable getDataWriter();
  /**
   * Add the specified bytes to the Bloom filter.
   *
   * @param buf data to be added to the Bloom filter
   * @param offset offset into the data to be added
   * @param len length of the data to be added
   */
void add(byte[] buf, int offset, int len);
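  /*
   * A typical write-side sequence, as far as it can be inferred from the
   * HFile writers above: allocBloom(), then add(...) for every key, then
   * compactBloom(), after which the HFile writer serializes getMetaWriter()
   * and, if non-null, getDataWriter() (see HFileWriterV1.addBloomFilter and
   * HFileWriterV2.addBloomFilter).
   */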
}