From f0731894479d90deace11d4560e2218776a8b2ac Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 25 Feb 2009 05:59:26 +0000 Subject: [PATCH] HBASE-61 Create an HBase-specific MapFile implementation git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@747672 13f79535-47bb-0310-9956-ffa450edef68 --- conf/hbase-default.xml | 16 + .../hadoop/hbase/io/HalfHFileReader.java | 198 ++ .../hadoop/hbase/io/hfile/BlockCache.java | 42 + .../io/hfile/BoundedRangeFileInputStream.java | 143 ++ .../hadoop/hbase/io/hfile/Compression.java | 324 +++ .../apache/hadoop/hbase/io/hfile/HFile.java | 1423 +++++++++++ .../hadoop/hbase/io/hfile/HFileScanner.java | 112 + .../hbase/io/hfile/SimpleBlockCache.java | 56 + .../apache/hadoop/hbase/io/hfile/package.html | 25 + .../hadoop/hbase/regionserver/HStore.java | 2207 ----------------- .../hadoop/hbase/regionserver/HStoreFile.java | 558 ----- .../hadoop/hbase/regionserver/Store.java | 1807 ++++++++++++++ .../hadoop/hbase/regionserver/StoreFile.java | 443 ++++ .../{HStoreScanner.java => StoreScanner.java} | 36 +- .../hbase/HFilePerformanceEvaluation.java | 365 +++ .../hbase/PerformanceEvaluationCommons.java | 78 + .../apache/hadoop/hbase/TestHStoreKey.java | 128 + .../hadoop/hbase/io/hfile/KVGenerator.java | 111 + .../hadoop/hbase/io/hfile/KeySampler.java | 64 + .../hadoop/hbase/io/hfile/NanoTimer.java | 198 ++ .../hbase/io/hfile/RandomDistribution.java | 271 ++ .../hadoop/hbase/io/hfile/RandomSeek.java | 124 + .../hadoop/hbase/io/hfile/TestHFile.java | 248 ++ .../hbase/io/hfile/TestHFilePerformance.java | 387 +++ .../hadoop/hbase/io/hfile/TestHFileSeek.java | 510 ++++ .../hadoop/hbase/io/hfile/TestSeekTo.java | 146 ++ .../hbase/regionserver/TestHStoreFile.java | 388 --- .../hbase/regionserver/TestStoreFile.java | 295 +++ 28 files changed, 7528 insertions(+), 3175 deletions(-) create mode 100644 src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/Compression.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/HFile.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/package.html delete mode 100644 src/java/org/apache/hadoop/hbase/regionserver/HStore.java delete mode 100644 src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java create mode 100644 src/java/org/apache/hadoop/hbase/regionserver/Store.java create mode 100644 src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java rename src/java/org/apache/hadoop/hbase/regionserver/{HStoreScanner.java => StoreScanner.java} (91%) create mode 100644 src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java create mode 100644 src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java create mode 100644 src/test/org/apache/hadoop/hbase/TestHStoreKey.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java create mode 100644 
src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestHFileSeek.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java delete mode 100644 src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java create mode 100644 src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java diff --git a/conf/hbase-default.xml b/conf/hbase-default.xml index cbc51518b05..30b4f422ffa 100644 --- a/conf/hbase-default.xml +++ b/conf/hbase-default.xml @@ -323,6 +323,22 @@ if you have small cells and want faster random-access of individual cells. + + hfile.min.blocksize.size + 65536 + Minimum store file block size. The smaller you make this, the + bigger your index and the less you fetch on a random-access. Set size down + if you have small cells and want faster random-access of individual cells. + + + + hfile.min.blocksize.size + 65536 + Minimum store file block size. The smaller you make this, the + bigger your index and the less you fetch on a random-access. Set size down + if you have small cells and want faster random-access of individual cells. + + hbase.io.seqfile.compression.type NONE diff --git a/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java b/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java new file mode 100644 index 00000000000..7794cfa51a9 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java @@ -0,0 +1,198 @@ +/** + * Copyright 2008 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.BlockCache; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * A facade for a {@link org.apache.hadoop.io.hfile.HFile.Reader} that serves up + * either the top or bottom half of a HFile where 'bottom' is the first half + * of the file containing the keys that sort lowest and 'top' is the second half + * of the file with keys that sort greater than those of the bottom half. + * The top includes the split files midkey, of the key that follows if it does + * not exist in the file. + * + *

This type works in tandem with the {@link Reference} type. This class + * is used when reading, while Reference is used when writing. + * + *

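A minimal usage sketch (an editor's illustration, not code from this patch; the Hadoop Configuration, the store file path, and the Reference instance 'r' are assumed to exist) showing how a reader over one half of a store file is obtained and scanned with the HFileScanner it returns:

  // Open the half of the store file described by Reference 'r'; whether it is
  // the top or bottom half is carried by r.getFileRegion().
  FileSystem fs = FileSystem.get(conf);                  // conf: assumed Hadoop Configuration
  Path p = new Path("/hbase/table/region/family/file");  // hypothetical store file path
  HalfHFileReader half = new HalfHFileReader(fs, p, null, r); // null = no block cache
  half.loadFileInfo();                                    // required before reading, as with HFile.Reader
  HFileScanner scanner = half.getScanner();
  if (scanner.seekTo()) {                                 // position at the first key of this half
    do {
      ByteBuffer key = scanner.getKey();
      ByteBuffer value = scanner.getValue();
      // ... use key/value ...
    } while (scanner.next());
  }
  half.close();
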
This file is not splitable. Calls to {@link #midKey()} return null. + */ +public class HalfHFileReader extends HFile.Reader { + static final Log LOG = LogFactory.getLog(HalfHFileReader.class); + private final boolean top; + // This is the key we split around. Its the first possible entry on a row: + // i.e. empty column and a timestamp of LATEST_TIMESTAMP. + private final byte [] splitkey; + + /** + * @param fs + * @param f + * @param c + * @param r + * @throws IOException + */ + public HalfHFileReader(final FileSystem fs, final Path p, final BlockCache c, + final Reference r) + throws IOException { + super(fs, p, c); + // This is not actual midkey for this half-file; its just border + // around which we split top and bottom. Have to look in files to find + // actual last and first keys for bottom and top halves. Half-files don't + // have an actual midkey themselves. No midkey is how we indicate file is + // not splittable. + this.splitkey = r.getSplitKey(); + // Is it top or bottom half? + this.top = Reference.isTopFileRegion(r.getFileRegion()); + } + + public HFileScanner getScanner() { + final HFileScanner s = super.getScanner(); + return new HFileScanner() { + final HFileScanner delegate = s; + + public ByteBuffer getKey() { + return delegate.getKey(); + } + + public String getKeyString() { + return delegate.getKeyString(); + } + + public ByteBuffer getValue() { + return delegate.getValue(); + } + + public String getValueString() { + return delegate.getValueString(); + } + + public boolean next() throws IOException { + boolean b = delegate.next(); + if (!b) { + return b; + } + if (!top) { + ByteBuffer bb = getKey(); + if (getComparator().compare(bb.array(), bb.arrayOffset(), bb.limit(), + splitkey, 0, splitkey.length) >= 0) { + return false; + } + } + return true; + } + + public boolean seekBefore(byte[] key) throws IOException { + if (top) { + if (getComparator().compare(key, splitkey) < 0) { + return false; + } + } else { + if (getComparator().compare(key, splitkey) >= 0) { + return seekBefore(splitkey); + } + } + return this.delegate.seekBefore(key); + } + + public boolean seekTo() throws IOException { + if (top) { + int r = this.delegate.seekTo(splitkey); + if (r < 0) { + // midkey is < first key in file + return this.delegate.seekTo(); + } + if (r > 0) { + return this.delegate.next(); + } + return true; + } + + boolean b = delegate.seekTo(); + if (!b) { + return b; + } + // Check key. + ByteBuffer k = this.delegate.getKey(); + return this.delegate.getReader().getComparator(). + compare(k.array(), k.arrayOffset(), k.limit(), + splitkey, 0, splitkey.length) < 0; + } + + public int seekTo(byte[] key) throws IOException { + if (top) { + if (getComparator().compare(key, splitkey) < 0) { + return -1; + } + } else { + if (getComparator().compare(key, splitkey) >= 0) { + // we would place the scanner in the second half. + // it might be an error to return false here ever... 
+ boolean res = delegate.seekBefore(splitkey); + if (!res) { + throw new IOException("Seeking for a key in bottom of file, but key exists in top of file, failed on seekBefore(midkey)"); + } + return 1; + } + } + return delegate.seekTo(key); + } + + public Reader getReader() { + return this.delegate.getReader(); + } + + public boolean isSeeked() { + return this.delegate.isSeeked(); + } + }; + } + + public byte[] getLastKey() { + if (top) { + return super.getLastKey(); + } else { + HFileScanner scanner = getScanner(); + try { + if (scanner.seekBefore(this.splitkey)) { + return Bytes.toBytes(scanner.getKey()); + } + } catch (IOException e) { + LOG.warn("Failed seekBefore " + Bytes.toString(this.splitkey), e); + } + return null; + } + } + + public byte[] midkey() throws IOException { + // Returns null to indicate file is not splitable. + return null; + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java b/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java new file mode 100644 index 00000000000..91cfaf6613f --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java @@ -0,0 +1,42 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.nio.ByteBuffer; + +/** + * Block cache interface. + * TODO: Add filename or hash of filename to block cache key. + */ +public interface BlockCache { + /** + * Add block to cache. + * @param blockNumber Zero-based file block number. + * @param buf The block contents wrapped in a ByteBuffer. + */ + public void cacheBlock(String blockName, ByteBuffer buf); + + /** + * Fetch block from cache. + * @param blockNumber Block number to fetch. + * @return Block or null if block is not in the cache. + */ + public ByteBuffer getBlock(String blockName); +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java b/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java new file mode 100644 index 00000000000..ae7734a941b --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.hadoop.fs.FSDataInputStream; + +/** + * BoundedRangeFIleInputStream abstracts a contiguous region of a Hadoop + * FSDataInputStream as a regular input stream. One can create multiple + * BoundedRangeFileInputStream on top of the same FSDataInputStream and they + * would not interfere with each other. + * Copied from hadoop-335 tfile. + */ +class BoundedRangeFileInputStream extends InputStream { + + private FSDataInputStream in; + private long pos; + private long end; + private long mark; + private final byte[] oneByte = new byte[1]; + + /** + * Constructor + * + * @param in + * The FSDataInputStream we connect to. + * @param offset + * Beginning offset of the region. + * @param length + * Length of the region. + * + * The actual length of the region may be smaller if (off_begin + + * length) goes beyond the end of FS input stream. + */ + public BoundedRangeFileInputStream(FSDataInputStream in, long offset, + long length) { + if (offset < 0 || length < 0) { + throw new IndexOutOfBoundsException("Invalid offset/length: " + offset + + "/" + length); + } + + this.in = in; + this.pos = offset; + this.end = offset + length; + this.mark = -1; + } + + @Override + public int available() throws IOException { + int avail = in.available(); + if (pos + avail > end) { + avail = (int) (end - pos); + } + + return avail; + } + + @Override + public int read() throws IOException { + int ret = read(oneByte); + if (ret == 1) return oneByte[0] & 0xff; + return -1; + } + + @Override + public int read(byte[] b) throws IOException { + return read(b, 0, b.length); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if ((off | len | (off + len) | (b.length - (off + len))) < 0) { + throw new IndexOutOfBoundsException(); + } + + int n = (int) Math.min(Integer.MAX_VALUE, Math.min(len, (end - pos))); + if (n == 0) return -1; + int ret = 0; + synchronized (in) { + in.seek(pos); + ret = in.read(b, off, n); + } + // / ret = in.read(pos, b, off, n); + if (ret < 0) { + end = pos; + return -1; + } + pos += ret; + return ret; + } + + @Override + /* + * We may skip beyond the end of the file. + */ + public long skip(long n) throws IOException { + long len = Math.min(n, end - pos); + pos += len; + return len; + } + + @Override + public void mark(int readlimit) { + mark = pos; + } + + @Override + public void reset() throws IOException { + if (mark < 0) throw new IOException("Resetting to invalid mark"); + pos = mark; + } + + @Override + public boolean markSupported() { + return true; + } + + @Override + public void close() { + // Invalidate the state of the stream. 
+ in = null; + pos = end; + mark = -1; + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java b/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java new file mode 100644 index 00000000000..249bc425913 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java @@ -0,0 +1,324 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.compress.CodecPool; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionInputStream; +import org.apache.hadoop.io.compress.CompressionOutputStream; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.Decompressor; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.LzoCodec; + +/** + * Compression related stuff. + * Copied from hadoop-3315 tfile. + */ +public final class Compression { + static final Log LOG = LogFactory.getLog(Compression.class); + + /** + * Prevent the instantiation of class. + */ + private Compression() { + super(); + } + + static class FinishOnFlushCompressionStream extends FilterOutputStream { + public FinishOnFlushCompressionStream(CompressionOutputStream cout) { + super(cout); + } + + @Override + public void write(byte b[], int off, int len) throws IOException { + out.write(b, off, len); + } + + @Override + public void flush() throws IOException { + CompressionOutputStream cout = (CompressionOutputStream) out; + cout.finish(); + cout.flush(); + cout.resetState(); + } + } + + /** + * Compression algorithms. 
+ */ + public static enum Algorithm { + LZO("lzo") { + private LzoCodec codec; + + @Override + CompressionCodec getCodec() { + if (codec == null) { + Configuration conf = new Configuration(); + conf.setBoolean("hadoop.native.lib", true); + codec = new LzoCodec(); + codec.setConf(conf); + } + + return codec; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + InputStream bis1 = null; + if (downStreamBufferSize > 0) { + bis1 = new BufferedInputStream(downStream, downStreamBufferSize); + } + else { + bis1 = downStream; + } + codec.getConf() + .setInt("io.compression.codec.lzo.buffersize", 64 * 1024); + CompressionInputStream cis = + codec.createInputStream(bis1, decompressor); + BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE); + return bis2; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + OutputStream bos1 = null; + if (downStreamBufferSize > 0) { + bos1 = new BufferedOutputStream(downStream, downStreamBufferSize); + } + else { + bos1 = downStream; + } + codec.getConf() + .setInt("io.compression.codec.lzo.buffersize", 64 * 1024); + CompressionOutputStream cos = + codec.createOutputStream(bos1, compressor); + BufferedOutputStream bos2 = + new BufferedOutputStream(new FinishOnFlushCompressionStream(cos), + DATA_OBUF_SIZE); + return bos2; + } + }, + + GZ("gz") { + private GzipCodec codec; + + @Override + CompressionCodec getCodec() { + if (codec == null) { + Configuration conf = new Configuration(); + conf.setBoolean("hadoop.native.lib", true); + codec = new GzipCodec(); + codec.setConf(conf); + } + + return codec; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + // Set the internal buffer size to read from down stream. + if (downStreamBufferSize > 0) { + codec.getConf().setInt("io.file.buffer.size", downStreamBufferSize); + } + CompressionInputStream cis = + codec.createInputStream(downStream, decompressor); + BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE); + return bis2; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + OutputStream bos1 = null; + if (downStreamBufferSize > 0) { + bos1 = new BufferedOutputStream(downStream, downStreamBufferSize); + } + else { + bos1 = downStream; + } + codec.getConf().setInt("io.file.buffer.size", 32 * 1024); + CompressionOutputStream cos = + codec.createOutputStream(bos1, compressor); + BufferedOutputStream bos2 = + new BufferedOutputStream(new FinishOnFlushCompressionStream(cos), + DATA_OBUF_SIZE); + return bos2; + } + }, + + NONE("none") { + @Override + CompressionCodec getCodec() { + return null; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + if (downStreamBufferSize > 0) { + return new BufferedInputStream(downStream, downStreamBufferSize); + } + // else { + // Make sure we bypass FSInputChecker buffer. 
+ // return new BufferedInputStream(downStream, 1024); + // } + // } + return downStream; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + if (downStreamBufferSize > 0) { + return new BufferedOutputStream(downStream, downStreamBufferSize); + } + + return downStream; + } + }; + + private final String compressName; + // data input buffer size to absorb small reads from application. + private static final int DATA_IBUF_SIZE = 1 * 1024; + // data output buffer size to absorb small writes from application. + private static final int DATA_OBUF_SIZE = 4 * 1024; + + Algorithm(String name) { + this.compressName = name; + } + + abstract CompressionCodec getCodec(); + + public abstract InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException; + + public abstract OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, int downStreamBufferSize) + throws IOException; + + public Compressor getCompressor() { + CompressionCodec codec = getCodec(); + if (codec != null) { + Compressor compressor = CodecPool.getCompressor(codec); + if (compressor != null) { + if (compressor.finished()) { + // Somebody returns the compressor to CodecPool but is still using + // it. + LOG + .warn("Compressor obtained from CodecPool is already finished()"); + // throw new AssertionError( + // "Compressor obtained from CodecPool is already finished()"); + } + else { + LOG.debug("Got a compressor: " + compressor.hashCode()); + } + compressor.reset(); + } + return compressor; + } + return null; + } + + public void returnCompressor(Compressor compressor) { + if (compressor != null) { + LOG.debug("Return a compressor: " + compressor.hashCode()); + CodecPool.returnCompressor(compressor); + } + } + + public Decompressor getDecompressor() { + CompressionCodec codec = getCodec(); + if (codec != null) { + Decompressor decompressor = CodecPool.getDecompressor(codec); + if (decompressor != null) { + if (decompressor.finished()) { + // Somebody returns the decompressor to CodecPool but is still using + // it. 
+ LOG + .warn("Deompressor obtained from CodecPool is already finished()"); + // throw new AssertionError( + // "Decompressor obtained from CodecPool is already finished()"); + } + else { + LOG.debug("Got a decompressor: " + decompressor.hashCode()); + } + decompressor.reset(); + } + return decompressor; + } + + return null; + } + + public void returnDecompressor(Decompressor decompressor) { + if (decompressor != null) { + LOG.debug("Returned a decompressor: " + decompressor.hashCode()); + CodecPool.returnDecompressor(decompressor); + } + } + + public String getName() { + return compressName; + } + } + + static Algorithm getCompressionAlgorithmByName(String compressName) { + Algorithm[] algos = Algorithm.class.getEnumConstants(); + + for (Algorithm a : algos) { + if (a.getName().equals(compressName)) { + return a; + } + } + + throw new IllegalArgumentException( + "Unsupported compression algorithm name: " + compressName); + } + + static String[] getSupportedAlgorithms() { + Algorithm[] algos = Algorithm.class.getEnumConstants(); + + String[] ret = new String[algos.length]; + int i = 0; + for (Algorithm a : algos) { + ret[i++] = a.getName(); + } + + return ret; + } +} diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java b/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java new file mode 100644 index 00000000000..3a2915d05d0 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java @@ -0,0 +1,1423 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.Closeable; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.HbaseMapWritable; +import org.apache.hadoop.hbase.io.HeapSize; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.Decompressor; + +/** + * File format for hbase. + * A file of sorted key/value pairs. Both keys and values are byte arrays. + *

+ * The memory footprint of a HFile includes the following (below is taken from + * Hadoop-3315 tfile + * but applies also to HFile): + *

+ * Suggestions on performance optimization. + * + * + * For more on the background behind HFile, see HBASE-61. + *

+ * File is made of data blocks followed by meta data blocks (if any), a fileinfo + * block, data block index, meta data block index, and a fixed size trailer + * which records the offsets at which file changes content type. + *

<data blocks><meta blocks><fileinfo><data index><meta index><trailer>
+ * Each block has a bit of magic at its start. Blocks are comprised of + * key/values. In data blocks, they are both byte arrays. Metadata blocks are + * a String key and a byte array value. An empty file looks like this: + *
<fileinfo><trailer>
. That is, there are no data or meta + * blocks present. + *

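To make the layout above concrete, here is a rough write/read round trip using the Writer and Reader defined later in this file (an editor's sketch, not part of the patch; the Configuration, the output path, and the key/value contents are made up):

  // Write a small HFile; keys must be appended in comparator order.
  FileSystem fs = FileSystem.get(conf);              // conf: assumed Hadoop Configuration
  Path p = new Path("/tmp/example.hfile");           // hypothetical output path
  HFile.Writer w = new HFile.Writer(fs, p, HFile.DEFAULT_BLOCKSIZE, "none", null);
  w.append(Bytes.toBytes("row1"), Bytes.toBytes("value1"));
  w.append(Bytes.toBytes("row2"), Bytes.toBytes("value2"));
  w.close();                                         // writes remaining data blocks, fileinfo, indices, trailer

  // Read it back; file info must be loaded before the scanner is used.
  HFile.Reader r = new HFile.Reader(fs, p, null);    // null = no block cache
  r.loadFileInfo();
  HFileScanner s = r.getScanner();
  if (s.seekTo(Bytes.toBytes("row2")) == 0) {        // 0 means an exact match was found
    ByteBuffer v = s.getValue();
    // ... use v ...
  }
  r.close();
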
+ * TODO: Bloomfilters. Need to add hadoop 0.20. first since it has bug fixes + * on the hadoop bf package. + * * TODO: USE memcmp by default? Write the keys out in an order that allows + * my using this -- reverse the timestamp. + * TODO: Add support for fast-gzip and for lzo. + * TODO: Do scanners need to be able to take a start and end row? + * TODO: Should BlockIndex know the name of its file? Should it have a Path + * that points at its file say for the case where an index lives apart from + * an HFile instance? + */ +public class HFile { + static final Log LOG = LogFactory.getLog(HFile.class); + + /* These values are more or less arbitrary, and they are used as a + * form of check to make sure the file isn't completely corrupt. + */ + final static byte [] DATABLOCKMAGIC = + {'D', 'A', 'T', 'A', 'B', 'L', 'K', 42 }; + final static byte [] INDEXBLOCKMAGIC = + { 'I', 'D', 'X', 'B', 'L', 'K', 41, 43 }; + final static byte [] METABLOCKMAGIC = + { 'M', 'E', 'T', 'A', 'B', 'L', 'K', 99 }; + final static byte [] TRAILERBLOCKMAGIC = + { 'T', 'R', 'A', 'B', 'L', 'K', 34, 36 }; + + /** + * Maximum length of key in HFile. + */ + public final static int MAXIMUM_KEY_LENGTH = 64 * 1024; + + /** + * Default blocksize for hfile. + */ + public final static int DEFAULT_BLOCKSIZE = 64 * 1024; + + /** + * Default compression: none. + */ + public final static String DEFAULT_COMPRESSION = + Compression.Algorithm.NONE.getName(); + + /** + * HFile Writer. + */ + public static class Writer implements Closeable { + // FileSystem stream to write on. + private FSDataOutputStream outputStream; + // True if we opened the outputStream (and so will close it). + private boolean closeOutputStream; + + // Name for this object used when logging or in toString. Is either + // the result of a toString on stream or else toString of passed file Path. + private String name; + + // Total uncompressed bytes, maybe calculate a compression ratio later. + private int totalBytes = 0; + + // Total # of key/value entries, ie: how many times add() was called. + private int entryCount = 0; + + // Used calculating average key and value lengths. + private long keylength = 0; + private long valuelength = 0; + + // Used to ensure we write in order. + private final RawComparator comparator; + + // A stream made per block written. + private DataOutputStream out; + + // Number of uncompressed bytes per block. Reinitialized when we start + // new block. + private int blocksize; + + // Offset where the current block began. + private long blockBegin; + + // First key in a block (Not first key in file). + private byte [] firstKey = null; + + // Key previously appended. Becomes the last key in the file. + private byte [] lastKey = null; + + // See {@link BlockIndex}. Below four fields are used to write the block + // index. + ArrayList blockKeys = new ArrayList(); + // Block offset in backing stream. + ArrayList blockOffsets = new ArrayList(); + // Raw (decompressed) data size. + ArrayList blockDataSizes = new ArrayList(); + + // Meta block system. + private ArrayList metaNames = new ArrayList(); + private ArrayList metaData = new ArrayList(); + + // Used compression. Used even if no compression -- 'none'. + private final Compression.Algorithm compressAlgo; + private Compressor compressor; + + // Special datastructure to hold fileinfo. + private FileInfo fileinfo = new FileInfo(); + + // May be null if we were passed a stream. + private Path path = null; + + /** + * Constructor that uses all defaults for compression and block size. 
+ * @param fs + * @param path + * @throws IOException + */ + public Writer(FileSystem fs, Path path) + throws IOException { + this(fs, path, DEFAULT_BLOCKSIZE, null, null); + } + + /** + * Constructor that takes a Path. + * @param fs + * @param path + * @param blocksize + * @param compress + * @param comparator + * @throws IOException + */ + public Writer(FileSystem fs, Path path, int blocksize, String compress, + final RawComparator comparator) + throws IOException { + this(fs.create(path), blocksize, compress, comparator); + this.closeOutputStream = true; + this.name = path.toString(); + this.path = path; + } + + /** + * Constructor that takes a stream. + * @param ostream Stream to use. + * @param blocksize + * @param compress + * @param c + * @throws IOException + */ + public Writer(final FSDataOutputStream ostream, final int blocksize, + final String compress, final RawComparator c) + throws IOException { + this.outputStream = ostream; + this.closeOutputStream = false; + this.blocksize = blocksize; + this.comparator = c == null? Bytes.BYTES_RAWCOMPARATOR: c; + this.name = this.outputStream.toString(); + this.compressAlgo = + Compression.getCompressionAlgorithmByName(compress == null? + Compression.Algorithm.NONE.getName(): compress); + } + + /* + * If at block boundary, opens new block. + * @throws IOException + */ + private void checkBlockBoundary() throws IOException { + if (this.out != null && this.out.size() < blocksize) return; + finishBlock(); + newBlock(); + } + + /* + * Do the cleanup if a current block. + * @throws IOException + */ + private void finishBlock() throws IOException { + if (this.out == null) return; + long size = releaseCompressingStream(this.out); + this.out = null; + + blockKeys.add(firstKey); + int written = longToInt(size); + blockOffsets.add(Long.valueOf(blockBegin)); + blockDataSizes.add(Integer.valueOf(written)); + this.totalBytes += written; + } + + /* + * Ready a new block for writing. + * @throws IOException + */ + private void newBlock() throws IOException { + // This is where the next block begins. + blockBegin = outputStream.getPos(); + this.out = getCompressingStream(); + this.out.write(DATABLOCKMAGIC); + firstKey = null; + } + + /* + * Sets up a compressor and creates a compression stream on top of + * this.outputStream. Get one per block written. + * @return A compressing stream; if 'none' compression, returned stream + * does not compress. + * @throws IOException + * @see {@link #releaseCompressingStream(DataOutputStream)} + */ + private DataOutputStream getCompressingStream() throws IOException { + this.compressor = compressAlgo.getCompressor(); + // Get new DOS compression stream. In tfile, the DOS, is not closed, + // just finished, and that seems to be fine over there. TODO: Check + // no memory retention of the DOS. Should I disable the 'flush' on the + // DOS as the BCFile over in tfile does? It wants to make it so flushes + // don't go through to the underlying compressed stream. Flush on the + // compressed downstream should be only when done. I was going to but + // looks like when we call flush in here, its legitimate flush that + // should go through to the compressor. + OutputStream os = + this.compressAlgo.createCompressionStream(this.outputStream, + this.compressor, 0); + return new DataOutputStream(os); + } + + /* + * Let go of block compressor and compressing stream gotten in call + * {@link #getCompressingStream}. + * @param dos + * @return How much was written on this stream since it was taken out. 
+ * @see #getCompressingStream() + * @throws IOException + */ + private int releaseCompressingStream(final DataOutputStream dos) + throws IOException { + dos.flush(); + this.compressAlgo.returnCompressor(this.compressor); + this.compressor = null; + return dos.size(); + } + + /** + * Add a meta block to the end of the file. Call before close(). + * Metadata blocks are expensive. Fill one with a bunch of serialized data + * rather than do a metadata block per metadata instance. If metadata is + * small, consider adding to file info using + * {@link #appendFileInfo(byte[], byte[])} + * @param metaBlockName name of the block + * @param bytes uninterpreted bytes of the block. + */ + public void appendMetaBlock(String metaBlockName, byte [] bytes) { + metaNames.add(Bytes.toBytes(metaBlockName)); + metaData.add(bytes); + } + + /** + * Add to the file info. Added key value can be gotten out of the return + * from {@link Reader#loadFileInfo()}. + * @param k Key + * @param v Value + * @throws IOException + */ + public void appendFileInfo(final byte [] k, final byte [] v) + throws IOException { + appendFileInfo(this.fileinfo, k, v, true); + } + + FileInfo appendFileInfo(FileInfo fi, final byte [] k, final byte [] v, + final boolean checkPrefix) + throws IOException { + if (k == null || v == null) { + throw new NullPointerException("Key nor value may be null"); + } + if (checkPrefix && + Bytes.toString(k).toLowerCase().startsWith(FileInfo.RESERVED_PREFIX)) { + throw new IOException("Keys with a " + FileInfo.RESERVED_PREFIX + + " are reserved"); + } + fi.put(k, v); + return fi; + } + + /** + * @return Path or null if we were passed a stream rather than a Path. + */ + public Path getPath() { + return this.path; + } + + public String toString() { + return "writer=" + this.name + ", compression=" + + this.compressAlgo.getName(); + } + + /** + * Add key/value to file. + * Keys must be added in an order that agrees with the RawComparator passed + * on construction. + * @param key Key to add. Cannot be empty nor null. + * @param value Value to add. Cannot be empty nor null. + * @throws IOException + */ + public void append(final byte [] key, final byte [] value) + throws IOException { + checkKey(key); + checkValue(value); + checkBlockBoundary(); + // Write length of key and value and then actual key and value bytes. + this.out.writeInt(key.length); + this.keylength += key.length; + this.out.writeInt(value.length); + this.valuelength += valuelength; + this.out.write(key); + this.out.write(value); + // Are we the first key in this block? + if (this.firstKey == null) this.firstKey = key; + this.lastKey = key; + this.entryCount ++; + } + + /* + * @param key Key to check. 
+ * @throws IOException + */ + private void checkKey(final byte [] key) throws IOException { + if (key == null || key.length <= 0) { + throw new IOException("Key cannot be null or empty"); + } + if (key.length > MAXIMUM_KEY_LENGTH) { + throw new IOException("Key length " + key.length + " > " + + MAXIMUM_KEY_LENGTH); + } + if (this.lastKey != null) { + if (this.comparator.compare(this.lastKey, key) > 0) { + throw new IOException("Added a key not lexically larger than" + + " previous: key=" + Bytes.toString(key) + ", lastkey=" + + Bytes.toString(lastKey)); + } + } + } + + private void checkValue(final byte [] value) throws IOException { + if (value == null || value.length <= 0) { + throw new IOException("Value cannot be null or empty"); + } + } + + public void close() throws IOException { + if (this.outputStream == null) { + return; + } + // Write out the end of the data blocks, then write meta data blocks. + // followed by fileinfo, data block index and meta block index. + + finishBlock(); + + FixedFileTrailer trailer = new FixedFileTrailer(); + + // Write out the metadata blocks if any. + ArrayList metaOffsets = null; + ArrayList metaDataSizes = null; + if (metaNames.size() > 0) { + metaOffsets = new ArrayList(metaNames.size()); + metaDataSizes = new ArrayList(metaNames.size()); + for (int i = 0 ; i < metaNames.size() ; ++ i ) { + metaOffsets.add(Long.valueOf(outputStream.getPos())); + metaDataSizes. + add(Integer.valueOf(METABLOCKMAGIC.length + metaData.get(i).length)); + writeMetaBlock(metaData.get(i)); + } + } + + // Write fileinfo. + trailer.fileinfoOffset = writeFileInfo(this.outputStream); + + // Write the data block index. + trailer.dataIndexOffset = BlockIndex.writeIndex(this.outputStream, + this.blockKeys, this.blockOffsets, this.blockDataSizes); + + // Meta block index. + if (metaNames.size() > 0) { + trailer.metaIndexOffset = BlockIndex.writeIndex(this.outputStream, + this.metaNames, metaOffsets, metaDataSizes); + } + + // Now finish off the trailer. + trailer.dataIndexCount = blockKeys.size(); + trailer.metaIndexCount = metaNames.size(); + + trailer.totalUncompressedBytes = totalBytes; + trailer.entryCount = entryCount; + + trailer.compressionCodec = this.compressAlgo.ordinal(); + + trailer.serialize(outputStream); + + if (this.closeOutputStream) { + this.outputStream.close(); + this.outputStream = null; + } + } + + /* Write a metadata block. + * @param metadata + * @throws IOException + */ + private void writeMetaBlock(final byte [] b) throws IOException { + DataOutputStream dos = getCompressingStream(); + dos.write(METABLOCKMAGIC); + dos.write(b); + releaseCompressingStream(dos); + } + + /* + * Add last bits of metadata to fileinfo and then write it out. + * Reader will be expecting to find all below. + * @param o Stream to write on. + * @return Position at which we started writing. + * @throws IOException + */ + private long writeFileInfo(FSDataOutputStream o) throws IOException { + if (this.lastKey != null) { + appendFileInfo(this.fileinfo, FileInfo.LASTKEY, this.lastKey, false); + } + int avgKeyLen = this.entryCount == 0? 0: + (int)(this.keylength/this.entryCount); + appendFileInfo(this.fileinfo, FileInfo.AVG_KEY_LEN, + Bytes.toBytes(avgKeyLen), false); + int avgValueLen = this.entryCount == 0? 
0: + (int)(this.keylength/this.entryCount); + appendFileInfo(this.fileinfo, FileInfo.AVG_VALUE_LEN, + Bytes.toBytes(avgValueLen), false); + appendFileInfo(this.fileinfo, FileInfo.COMPARATOR, + Bytes.toBytes(this.comparator.getClass().getName()), false); + long pos = o.getPos(); + this.fileinfo.write(o); + return pos; + } + } + + /** + * HFile Reader. + */ + public static class Reader implements Closeable { + // Stream to read from. + private FSDataInputStream istream; + // True if we should close istream when done. We don't close it if we + // didn't open it. + private boolean closeIStream; + + // These are read in when the file info is loaded. + HFile.BlockIndex blockIndex; + private BlockIndex metaIndex; + FixedFileTrailer trailer; + private volatile boolean fileInfoLoaded = false; + + // Filled when we read in the trailer. + private Compression.Algorithm compressAlgo; + + // Last key in the file. Filled in when we read in the file info + private byte [] lastkey = null; + // Stats read in when we load file info. + private int avgKeyLen = -1; + private int avgValueLen = -1; + + // Used to ensure we seek correctly. + RawComparator comparator; + + // Size of this file. + private final long fileSize; + + // Block cache to use. + private final BlockCache cache; + public int cacheHits = 0; + public int blockLoads = 0; + + // Name for this object used when logging or in toString. Is either + // the result of a toString on the stream or else is toString of passed + // file Path plus metadata key/value pairs. + private String name; + + /* + * Do not expose the default constructor. + */ + @SuppressWarnings("unused") + private Reader() throws IOException { + this(null, null, null); + } + + /** + * Opens a HFile. You must load the file info before you can + * use it by calling {@link #loadFileInfo()}. + * + * @param fs filesystem to load from + * @param path path within said filesystem + * @param cache block cache. Pass null if none. + * @throws IOException + */ + public Reader(FileSystem fs, Path path, BlockCache cache) + throws IOException { + this(fs.open(path), fs.getFileStatus(path).getLen(), cache); + this.closeIStream = true; + this.name = path.toString(); + } + + /** + * Opens a HFile. You must load the index before you can + * use it by calling {@link #loadFileInfo()}. + * + * @param fsdis input stream. Caller is responsible for closing the passed + * stream. + * @param size Length of the stream. + * @param cache block cache. Pass null if none. + * @throws IOException + */ + public Reader(final FSDataInputStream fsdis, final long size, + final BlockCache cache) + throws IOException { + this.cache = cache; + this.fileSize = size; + this.istream = fsdis; + this.closeIStream = false; + this.name = this.istream.toString(); + } + + public String toString() { + return "reader=" + this.name + + (!isFileInfoLoaded()? "": + ", compression=" + this.compressAlgo.getName() + + ", firstKey=" + Bytes.toString(getFirstKey()) + + ", lastKey=" + Bytes.toString(getLastKey()) + + ", avgKeyLen=" + this.avgKeyLen + + ", avgValueLen=" + this.avgValueLen + + ", entries=" + this.trailer.entryCount + + ", length=" + this.fileSize); + } + + public long length() { + return this.fileSize; + } + + /** + * Read in the index and file info. + * @return A map of fileinfo data. + * See {@link Writer#appendFileInfo(byte[], byte[])}. + * @throws IOException + */ + public Map loadFileInfo() throws IOException { + this.trailer = readTrailer(); + + // Read in the fileinfo and get what we need from it. 
+ this.istream.seek(this.trailer.fileinfoOffset); + FileInfo fi = new FileInfo(); + fi.readFields(this.istream); + this.lastkey = fi.get(FileInfo.LASTKEY); + this.avgKeyLen = Bytes.toInt(fi.get(FileInfo.AVG_KEY_LEN)); + this.avgValueLen = Bytes.toInt(fi.get(FileInfo.AVG_VALUE_LEN)); + String clazzName = Bytes.toString(fi.get(FileInfo.COMPARATOR)); + this.comparator = getComparator(clazzName); + + // Read in the data index. + this.blockIndex = BlockIndex.readIndex(this.comparator, this.istream, + this.trailer.dataIndexOffset, this.trailer.dataIndexCount); + + // Read in the metadata index. + if (trailer.metaIndexCount > 0) { + this.metaIndex = BlockIndex.readIndex(Bytes.BYTES_RAWCOMPARATOR, + this.istream, this.trailer.metaIndexOffset, trailer.metaIndexCount); + } + this.fileInfoLoaded = true; + return fi; + } + + boolean isFileInfoLoaded() { + return this.fileInfoLoaded; + } + + @SuppressWarnings("unchecked") + private RawComparator getComparator(final String clazzName) + throws IOException { + if (clazzName == null || clazzName.length() == 0) { + return null; + } + try { + return (RawComparator)Class.forName(clazzName).newInstance(); + } catch (InstantiationException e) { + throw new IOException(e); + } catch (IllegalAccessException e) { + throw new IOException(e); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /* Read the trailer off the input stream. As side effect, sets the + * compression algorithm. + * @return Populated FixedFileTrailer. + * @throws IOException + */ + private FixedFileTrailer readTrailer() throws IOException { + FixedFileTrailer fft = new FixedFileTrailer(); + long seekPoint = this.fileSize - FixedFileTrailer.trailerSize(); + this.istream.seek(seekPoint); + fft.deserialize(this.istream); + // Set up the codec. + this.compressAlgo = + Compression.Algorithm.values()[fft.compressionCodec]; + return fft; + } + + /** + * Create a Scanner on this file. No seeks or reads are done on creation. + * Call {@link HFileScanner#seekTo(byte[])} to position an start the read. + * There is nothing to clean up in a Scanner. Letting go of your references + * to the scanner is sufficient. + * @return Scanner on this file. + */ + public HFileScanner getScanner() { + return new Scanner(this); + } + /** + * @param key Key to search. + * @return Block number of the block containing the key or -1 if not in this + * file. + */ + protected int blockContainingKey(final byte [] key) { + if (blockIndex == null) { + throw new RuntimeException("Block index not loaded"); + } + return blockIndex.blockContainingKey(key); + } + /** + * @param metaBlockName + * @return Block wrapped in a ByteBuffer + * @throws IOException + */ + public ByteBuffer getMetaBlock(String metaBlockName) throws IOException { + if (metaIndex == null) { + throw new IOException("Meta index not loaded"); + } + int block = metaIndex.blockContainingKey(Bytes.toBytes(metaBlockName)); + if (block == -1) + return null; + long blockSize; + if (block == metaIndex.count - 1) { + blockSize = trailer.fileinfoOffset - metaIndex.blockOffsets[block]; + } else { + blockSize = metaIndex.blockOffsets[block+1] - metaIndex.blockOffsets[block]; + } + + ByteBuffer buf = decompress(metaIndex.blockOffsets[block], + longToInt(blockSize), metaIndex.blockDataSizes[block]); + byte [] magic = new byte[METABLOCKMAGIC.length]; + buf.get(magic, 0, magic.length); + + if (! Arrays.equals(magic, METABLOCKMAGIC)) { + throw new IOException("Meta magic is bad in block " + block); + } + // Toss the header. 
May have to remove later due to performance. + buf.compact(); + buf.limit(buf.limit() - METABLOCKMAGIC.length); + buf.rewind(); + return buf; + } + /** + * Read in a file block. + * @param block Index of block to read. + * @return Block wrapped in a ByteBuffer. + * @throws IOException + */ + ByteBuffer readBlock(int block) throws IOException { + if (blockIndex == null) { + throw new IOException("Block index not loaded"); + } + if (block < 0 || block > blockIndex.count) { + throw new IOException("Requested block is out of range: " + block + + ", max: " + blockIndex.count); + } + + // For any given block from any given file, synchronize reads for said + // block. + // Without a cache, this synchronizing is needless overhead, but really + // the other choice is to duplicate work (which the cache would prevent you from doing). + synchronized (blockIndex.blockKeys[block]) { + blockLoads++; + // Check cache for block. If found return. + if (cache != null) { + ByteBuffer cachedBuf = cache.getBlock(name + block); + if (cachedBuf != null) { + // LOG.debug("Reusing block for: " + block); + // Return a distinct 'copy' of the block, so pos doesnt get messed by + // the scanner + cacheHits++; + return cachedBuf.duplicate(); + } + // Carry on, please load. + } + + // Load block from filesystem. + long onDiskBlockSize; + if (block == blockIndex.count - 1) { + // last block! The end of data block is first meta block if there is + // one or if there isn't, the fileinfo offset. + long offset = this.metaIndex != null? + this.metaIndex.blockOffsets[0]: this.trailer.fileinfoOffset; + onDiskBlockSize = offset - blockIndex.blockOffsets[block]; + } else { + onDiskBlockSize = blockIndex.blockOffsets[block+1] - + blockIndex.blockOffsets[block]; + } + ByteBuffer buf = decompress(blockIndex.blockOffsets[block], + longToInt(onDiskBlockSize), this.blockIndex.blockDataSizes[block]); + + byte [] magic = new byte[DATABLOCKMAGIC.length]; + buf.get(magic, 0, magic.length); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + if (!Arrays.equals(magic, DATABLOCKMAGIC)) { + throw new IOException("Data magic is bad in block " + block); + } + // Toss the header. May have to remove later due to performance. + buf.compact(); + buf.limit(buf.limit() - DATABLOCKMAGIC.length); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + buf.rewind(); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + + // Cache a copy, not the one we are sending back, so the position doesnt + // get messed. + if (cache != null) { + cache.cacheBlock(name + block, buf.duplicate()); + } + + return buf; + } + } + + /* + * Decompress compressedSize bytes off the backing + * FSDataInputStream. + * @param offset + * @param compressedSize + * @param decompressedSize + * @return + * @throws IOException + */ + private ByteBuffer decompress(final long offset, final int compressedSize, + final int decompressedSize) + throws IOException { + Decompressor decompressor = this.compressAlgo.getDecompressor(); + // My guess is that the bounded range fis is needed to stop the + // decompressor reading into next block -- IIRC, it just grabs a + // bunch of data w/o regard to whether decompressor is coming to end of a + // decompression. 
+ InputStream is = this.compressAlgo.createDecompressionStream( + new BoundedRangeFileInputStream(this.istream, offset, compressedSize), + decompressor, 0); + ByteBuffer buf = ByteBuffer.allocate(decompressedSize); + IOUtils.readFully(is, buf.array(), 0, buf.capacity()); + return buf; + } + + /** + * @return First key in the file. + */ + public byte [] getFirstKey() { + if (blockIndex == null) { + throw new RuntimeException("Block index not loaded"); + } + return blockIndex.blockKeys[0]; + } + + public int getEntries() { + if (!this.isFileInfoLoaded()) { + throw new RuntimeException("File info not loaded"); + } + return this.trailer.entryCount; + } + + /** + * @return Last key in the file. + */ + public byte [] getLastKey() { + if (!isFileInfoLoaded()) { + throw new RuntimeException("Load file info first"); + } + return this.lastkey; + } + + /** + * @return Comparator. + */ + public RawComparator getComparator() { + return this.comparator; + } + + /** + * @return index size + */ + public long indexSize() { + return (this.blockIndex != null? this.blockIndex.heapSize(): 0) + + ((this.metaIndex != null)? this.metaIndex.heapSize(): 0); + } + + /** + * @return Midkey for this file. We work with block boundaries only so + * returned midkey is an approximation only. + * @throws IOException + */ + public byte [] midkey() throws IOException { + if (!isFileInfoLoaded() || this.blockIndex.isEmpty()) { + return null; + } + return this.blockIndex.midkey(); + } + + public void close() throws IOException { + if (this.closeIStream && this.istream != null) { + this.istream.close(); + this.istream = null; + } + } + + /* + * Implementation of {@link HFileScanner} interface. + */ + private static class Scanner implements HFileScanner { + private final Reader reader; + private ByteBuffer block; + private int currBlock; + + private int currKeyLen = 0; + private int currValueLen = 0; + + public int blockFetches = 0; + + public Scanner(Reader r) { + this.reader = r; + } + + public ByteBuffer getKey() { + if (this.block == null || this.currKeyLen == 0) { + throw new RuntimeException("you need to seekTo() before calling getKey()"); + } + ByteBuffer keyBuff = this.block.slice(); + keyBuff.limit(this.currKeyLen); + keyBuff.rewind(); + // Do keyBuff.asReadOnly()? + return keyBuff; + } + + public ByteBuffer getValue() { + if (block == null || currKeyLen == 0) { + throw new RuntimeException("you need to seekTo() before calling getValue()"); + } + // TODO: Could this be done with one ByteBuffer rather than create two? 
+ ByteBuffer valueBuff = this.block.slice(); + valueBuff.position(this.currKeyLen); + valueBuff = valueBuff.slice(); + valueBuff.limit(currValueLen); + valueBuff.rewind(); + return valueBuff; + } + + public boolean next() throws IOException { + // LOG.deug("rem:" + block.remaining() + " p:" + block.position() + + // " kl: " + currKeyLen + " kv: " + currValueLen); + if (block == null) { + throw new IOException("Next called on non-seeked scanner"); + } + block.position(block.position() + currKeyLen + currValueLen); + if (block.remaining() <= 0) { + // LOG.debug("Fetch next block"); + currBlock++; + if (currBlock >= reader.blockIndex.count) { + // damn we are at the end + currBlock = 0; + block = null; + return false; + } + block = reader.readBlock(currBlock); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + blockFetches++; + return true; + } + // LOG.debug("rem:" + block.remaining() + " p:" + block.position() + + // " kl: " + currKeyLen + " kv: " + currValueLen); + + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return true; + } + + public int seekTo(byte[] key) throws IOException { + int b = reader.blockContainingKey(key); + if (b < 0) return -1; // falls before the beginning of the file! :-( + // Avoid re-reading the same block (that'd be dumb). + loadBlock(b); + + return blockSeek(key, false); + } + + /** + * Within a loaded block, seek looking for the first key + * that is smaller than (or equal to?) the key we are interested in. + * + * A note on the seekBefore - if you have seekBefore = true, AND the + * first key in the block = key, then you'll get thrown exceptions. + * @param key to find + * @param seekBefore find the key before the exact match. + * @return + */ + private int blockSeek(byte[] key, boolean seekBefore) { + int klen, vlen; + int lastLen = 0; + do { + klen = block.getInt(); + vlen = block.getInt(); + int comp = this.reader.comparator.compare(key, 0, key.length, + block.array(), block.arrayOffset() + block.position(), klen); + if (comp == 0) { + if (seekBefore) { + block.position(block.position() - lastLen - 16); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; // non exact match. + } + currKeyLen = klen; + currValueLen = vlen; + return 0; // indicate exact match + } + if (comp < 0) { + // go back one key: + block.position(block.position() - lastLen - 16); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; + } + block.position(block.position() + klen + vlen); + lastLen = klen + vlen ; + } while( block.remaining() > 0 ); + // ok we are at the end, so go back a littleeeeee.... + block.position(block.position() - lastLen - 8); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; // didn't exactly find it. + } + + public boolean seekBefore(byte[] key) throws IOException { + int b = reader.blockContainingKey(key); + if (b < 0) + return false; // key is before the start of the file. + + // Question: does this block begin with 'key'? + if (this.reader.comparator.compare(reader.blockIndex.blockKeys[b], key) == 0) { + // Ok the key we're interested in is the first of the block, so go back one. + if (b == 0) { + // we have a 'problem', the key we want is the first of the file. + return false; + } + b--; + // TODO shortcut: seek forward in this block to the last key of the block. 
+ } + loadBlock(b); + blockSeek(key, true); + return true; + } + + public String getKeyString() { + return Bytes.toString(block.array(), block.arrayOffset() + + block.position(), currKeyLen); + } + + public String getValueString() { + return Bytes.toString(block.array(), block.arrayOffset() + + block.position() + currKeyLen, currValueLen); + } + + public Reader getReader() { + return this.reader; + } + + public boolean isSeeked(){ + return this.block != null; + } + + public boolean seekTo() throws IOException { + if (this.reader.blockIndex.isEmpty()) { + return false; + } + if (block != null && currBlock == 0) { + block.rewind(); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + } + currBlock = 0; + block = reader.readBlock(currBlock); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + blockFetches++; + return true; + } + + private void loadBlock(int bloc) throws IOException { + if (block == null) { + block = reader.readBlock(bloc); + currBlock = bloc; + blockFetches++; + } else { + if (bloc != currBlock) { + block = reader.readBlock(bloc); + currBlock = bloc; + blockFetches++; + } else { + // we are already in the same block, just rewind to seek again. + block.rewind(); + } + } + } + } + } + /* + * The RFile has a fixed trailer which contains offsets to other variable + * parts of the file. Also includes basic metadata on this file. + */ + private static class FixedFileTrailer { + // Offset to the data block index. + long dataIndexOffset; + // Offset to the fileinfo data, a small block of vitals.. + long fileinfoOffset; + // How many index counts are there (aka: block count) + int dataIndexCount; + // Offset to the meta block index. + long metaIndexOffset; + // How many meta block index entries (aka: meta block count) + int metaIndexCount; + long totalUncompressedBytes; + int entryCount; + int compressionCodec; + int version = 1; + + FixedFileTrailer() { + super(); + } + + static int trailerSize() { + // Keep this up to date... 
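+ // The fixed trailer is TRAILERBLOCKMAGIC plus four longs (fileinfoOffset, dataIndexOffset, metaIndexOffset, totalUncompressedBytes) and five ints (dataIndexCount, metaIndexCount, entryCount, compressionCodec, version); see serialize() below.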
+ final int intSize = 4; + final int longSize = 8; + return + ( intSize * 5 ) + + ( longSize * 4 ) + + TRAILERBLOCKMAGIC.length; + } + + void serialize(DataOutputStream outputStream) throws IOException { + outputStream.write(TRAILERBLOCKMAGIC); + outputStream.writeLong(fileinfoOffset); + outputStream.writeLong(dataIndexOffset); + outputStream.writeInt(dataIndexCount); + outputStream.writeLong(metaIndexOffset); + outputStream.writeInt(metaIndexCount); + outputStream.writeLong(totalUncompressedBytes); + outputStream.writeInt(entryCount); + outputStream.writeInt(compressionCodec); + outputStream.writeInt(version); + } + + void deserialize(DataInputStream inputStream) throws IOException { + byte [] header = new byte[TRAILERBLOCKMAGIC.length]; + inputStream.readFully(header); + if ( !Arrays.equals(header, TRAILERBLOCKMAGIC)) { + throw new IOException("Trailer 'header' is wrong; does the trailer " + + "size match content?"); + } + fileinfoOffset = inputStream.readLong(); + dataIndexOffset = inputStream.readLong(); + dataIndexCount = inputStream.readInt(); + + metaIndexOffset = inputStream.readLong(); + metaIndexCount = inputStream.readInt(); + + totalUncompressedBytes = inputStream.readLong(); + entryCount = inputStream.readInt(); + compressionCodec = inputStream.readInt(); + version = inputStream.readInt(); + + if (version != 1) { + throw new IOException("Wrong version: " + version); + } + } + + public String toString() { + return "fileinfoOffset=" + fileinfoOffset + + ", dataIndexOffset=" + dataIndexOffset + + ", dataIndexCount=" + dataIndexCount + + ", metaIndexOffset=" + metaIndexOffset + + ", metaIndexCount=" + metaIndexCount + + ", totalBytes=" + totalUncompressedBytes + + ", entryCount=" + entryCount + + ", version=" + version; + } + } + + /* + * The block index for a RFile. + * Used reading. + */ + static class BlockIndex implements HeapSize { + // How many actual items are there? The next insert location too. + int count = 0; + byte [][] blockKeys; + long [] blockOffsets; + int [] blockDataSizes; + int size = 0; + + /* Needed doing lookup on blocks. + */ + RawComparator comparator; + + /* + * Shutdown default constructor + */ + @SuppressWarnings("unused") + private BlockIndex() { + this(null); + } + + /** + * Constructor + * @param trailer File tail structure with index stats. + */ + BlockIndex(final RawComparator c) { + this.comparator = c; + // Guess that cost of three arrays + this object is 4 * 8 bytes. + this.size += (4 * 8); + } + + /** + * @return True if block index is empty. + */ + boolean isEmpty() { + return this.blockKeys.length <= 0; + } + + /** + * Adds a new entry in the block index. + * + * @param key Last key in the block + * @param offset file offset where the block is stored + * @param dataSize the uncompressed data size + */ + void add(final byte[] key, final long offset, final int dataSize) { + blockOffsets[count] = offset; + blockKeys[count] = key; + blockDataSizes[count] = dataSize; + count++; + this.size += (Bytes.SIZEOF_INT * 2 + key.length); + } + + /** + * @param key Key to find + * @return Offset of block containing key or -1 if this file + * does not contain the request. + */ + int blockContainingKey(final byte[] key) { + int pos = Arrays.binarySearch(blockKeys, key, this.comparator); + if (pos < 0) { + pos ++; + pos *= -1; + if (pos == 0) { + // falls before the beginning of the file. + return -1; + } + // When switched to "first key in block" index, binarySearch now returns + // the block with a firstKey < key. 
This means the value we want is potentially + // in the next block. + pos --; // in previous block. + + return pos; + } + // wow, a perfect hit, how unlikely? + return pos; + } + + /* + * @return File midkey. Inexact. Operates on block boundaries. Does + * not go into blocks. + */ + byte [] midkey() throws IOException { + int pos = ((this.count - 1)/2); // middle of the index + if (pos < 0) { + throw new IOException("HFile empty"); + } + return this.blockKeys[pos]; + } + + /* + * Write out index. Whatever we write here must jibe with what + * BlockIndex#readIndex is expecting. Make sure the two ends of the + * index serialization match. + * @param o + * @param keys + * @param offsets + * @param sizes + * @param c + * @return Position at which we entered the index. + * @throws IOException + */ + static long writeIndex(final FSDataOutputStream o, + final List keys, final List offsets, + final List sizes) + throws IOException { + long pos = o.getPos(); + // Don't write an index if nothing in the index. + if (keys.size() > 0) { + o.write(INDEXBLOCKMAGIC); + // Write the index. + for (int i = 0; i < keys.size(); ++i) { + o.writeLong(offsets.get(i).longValue()); + o.writeInt(sizes.get(i).intValue()); + byte [] key = keys.get(i); + Bytes.writeByteArray(o, key); + } + } + return pos; + } + + /* + * Read in the index that is at indexOffset + * Must match what was written by writeIndex in the Writer.close. + * @param in + * @param indexOffset + * @throws IOException + */ + static BlockIndex readIndex(final RawComparator c, + final FSDataInputStream in, final long indexOffset, final int indexSize) + throws IOException { + BlockIndex bi = new BlockIndex(c); + bi.blockOffsets = new long[indexSize]; + bi.blockKeys = new byte[indexSize][]; + bi.blockDataSizes = new int[indexSize]; + // If index size is zero, no index was written. + if (indexSize > 0) { + in.seek(indexOffset); + byte [] magic = new byte[INDEXBLOCKMAGIC.length]; + IOUtils.readFully(in, magic, 0, magic.length); + if (!Arrays.equals(magic, INDEXBLOCKMAGIC)) { + throw new IOException("Index block magic is wrong: " + + Arrays.toString(magic)); + } + for (int i = 0; i < indexSize; ++i ) { + long offset = in.readLong(); + int dataSize = in.readInt(); + byte [] key = Bytes.readByteArray(in); + bi.add(key, offset, dataSize); + } + } + return bi; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("size=" + count); + for (int i = 0; i < count ; i++) { + sb.append(", "); + sb.append("key=").append(Bytes.toString(blockKeys[i])). + append(", offset=").append(blockOffsets[i]). + append(", dataSize=" + blockDataSizes[i]); + } + return sb.toString(); + } + + public long heapSize() { + return this.size; + } + } + + /* + * Metadata for this file. Conjured by the writer. Read in by the reader. + */ + static class FileInfo extends HbaseMapWritable { + static final String RESERVED_PREFIX = "hfile."; + static final byte [] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY"); + static final byte [] AVG_KEY_LEN = + Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN"); + static final byte [] AVG_VALUE_LEN = + Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN"); + static final byte [] COMPARATOR = + Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR"); + + /* + * Constructor. + */ + FileInfo() { + super(); + } + } + + /** + * Get names of supported compression algorithms. The names are acceptable by + * HFile.Writer. + * + * @return Array of strings, each represents a supported compression + * algorithm. 
Currently, the supported names are exactly + *         those returned by Compression.getSupportedAlgorithms().
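+ *         For example (illustrative sketch only):
+ *           String [] algos = HFile.getSupportedCompressionAlgorithms();
+ *           // any returned name is acceptable to HFile.Writer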
+ */ + public static String[] getSupportedCompressionAlgorithms() { + return Compression.getSupportedAlgorithms(); + } + + // Utility methods. + /* + * @param l Long to convert to an int. + * @return l cast as an int. + */ + static int longToInt(final long l) { + // Expecting the size() of a block not exceeding 4GB. Assuming the + // size() will wrap to negative integer if it exceeds 2GB (From tfile). + return (int)(l & 0x00000000ffffffffL); + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java b/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java new file mode 100644 index 00000000000..b16ee765280 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java @@ -0,0 +1,112 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * A scanner allows you to position yourself within a HFile and + * scan through it. It allows you to reposition yourself as well. + * + *
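+ * A minimal usage sketch (illustrative only; it assumes an HFile.Reader named
+ * "reader" has already been opened elsewhere and that its getScanner() method
+ * hands back an HFileScanner over the file):
+ *
+ *   HFileScanner scanner = reader.getScanner();
+ *   if (scanner.seekTo()) {                    // position at the first key
+ *     do {
+ *       ByteBuffer key = scanner.getKey();     // view over the current key
+ *       ByteBuffer value = scanner.getValue(); // view over the current value
+ *       // ... consume key/value ...
+ *     } while (scanner.next());                // false once the last entry is passed
+ *   }
+ *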
A scanner doesn't always have a key/value that it is pointing to + * when it is first created and before + * {@link #seekTo()}/{@link #seekTo(byte[])} are called. + * In this case, {@link #getKey()}/{@link #getValue()} returns null. At most + * other times, a key and value will be available. The general pattern is that + * you position the Scanner using the seekTo variants and then getKey and + * getValue. + */ +public interface HFileScanner { + /** + * SeekTo or just before the passed key. Examine the return + * code to figure whether we found the key or not. + * Consider the key stream of all the keys in the file, + * k[0] .. k[n], where there are n keys in the file. + * @param key Key to find. + * @return -1, if key < k[0], no position; + * 0, such that k[i] = key and scanner is left in position i; and + * 1, such that k[i] < key, and scanner is left in position i. + * Furthermore, there may be a k[i+1], such that k[i] < key < k[i+1] + * but there may not be a k[i+1], and next() will return false (EOF). + * @throws IOException + */ + public int seekTo(byte[] key) throws IOException; + /** + * Consider the key stream of all the keys in the file, + * k[0] .. k[n], where there are n keys in the file. + * @param key Key to find + * @return false if key <= k[0] or true with scanner in position 'i' such + * that: k[i] < key. Furthermore: there may be a k[i+1], such that + * k[i] < key <= k[i+1] but there may also NOT be a k[i+1], and next() will + * return false (EOF). + */ + public boolean seekBefore(byte [] key) throws IOException; + /** + * Positions this scanner at the start of the file. + * @return False if empty file; i.e. a call to next would return false and + * the current key and value are undefined. + * @throws IOException + */ + public boolean seekTo() throws IOException; + /** + * Scans to the next entry in the file. + * @return Returns false if you are at the end otherwise true if more in file. + * @throws IOException + */ + public boolean next() throws IOException; + /** + * Gets a buffer view to the current key. You must call + * {@link #seekTo(byte[])} before this method. + * @return byte buffer for the key. The limit is set to the key size, and the + * position is 0, the start of the buffer view. + */ + public ByteBuffer getKey(); + /** + * Gets a buffer view to the current value. You must call + * {@link #seekTo(byte[])} before this method. + * + * @return byte buffer for the value. The limit is set to the value size, and + * the position is 0, the start of the buffer view. + */ + public ByteBuffer getValue(); + /** + * Convenience method to get a copy of the key as a string - interpreting the + * bytes as UTF8. You must call {@link #seekTo(byte[])} before this method. + * @return key as a string + */ + public String getKeyString(); + /** + * Convenience method to get a copy of the value as a string - interpreting + * the bytes as UTF8. You must call {@link #seekTo(byte[])} before this method. + * @return value as a string + */ + public String getValueString(); + /** + * @return Reader that underlies this Scanner instance. + */ + public HFile.Reader getReader(); + /** + * @return True is scanner has had one of the seek calls invoked; i.e. + * {@link #seekBefore(byte[])} or {@link #seekTo()} or {@link #seekTo(byte[])}. + * Otherwise returns false. 
+ */ + public boolean isSeeked(); +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java b/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java new file mode 100644 index 00000000000..7f934e19e74 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java @@ -0,0 +1,56 @@ +package org.apache.hadoop.hbase.io.hfile; + +import java.lang.ref.ReferenceQueue; +import java.lang.ref.SoftReference; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; + + +/** + * Simple one RFile soft reference cache. + */ +public class SimpleBlockCache implements BlockCache { + private static class Ref extends SoftReference { + public String blockId; + public Ref(String blockId, ByteBuffer buf, ReferenceQueue q) { + super(buf, q); + this.blockId = blockId; + } + } + private Map cache = + new HashMap(); + + private ReferenceQueue q = new ReferenceQueue(); + public int dumps = 0; + + public SimpleBlockCache() { + super(); + } + + void processQueue() { + Ref r; + while ( (r = (Ref)q.poll()) != null) { + cache.remove(r.blockId); + dumps++; + } + } + + public synchronized int size() { + processQueue(); + return cache.size(); + } + @Override + public synchronized ByteBuffer getBlock(String blockName) { + processQueue(); // clear out some crap. + Ref ref = cache.get(blockName); + if (ref == null) + return null; + return ref.get(); + } + + @Override + public synchronized void cacheBlock(String blockName, ByteBuffer buf) { + cache.put(blockName, new Ref(blockName, buf, q)); + } +} diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/package.html b/src/java/org/apache/hadoop/hbase/io/hfile/package.html new file mode 100644 index 00000000000..fa9244f735a --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/package.html @@ -0,0 +1,25 @@ + + + + + + + +Provides the hbase data+index+metadata file. + + diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStore.java b/src/java/org/apache/hadoop/hbase/regionserver/HStore.java deleted file mode 100644 index a9e86b32847..00000000000 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStore.java +++ /dev/null @@ -1,2207 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.EOFException; -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.CopyOnWriteArraySet; -import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HStoreKey; -import org.apache.hadoop.hbase.RemoteExceptionHandler; -import org.apache.hadoop.hbase.filter.RowFilterInterface; -import org.apache.hadoop.hbase.io.BloomFilterMapFile; -import org.apache.hadoop.hbase.io.Cell; -import org.apache.hadoop.hbase.io.HBaseMapFile; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; -import org.apache.hadoop.util.Progressable; -import org.apache.hadoop.util.StringUtils; - -/** - * HStore maintains a bunch of data files. It is responsible for maintaining - * the memory/file hierarchy and for periodic flushes to disk and compacting - * edits to the file. - * - * Locking and transactions are handled at a higher level. This API should not - * be called directly by any writer, but rather by an HRegion manager. - */ -public class HStore implements HConstants { - static final Log LOG = LogFactory.getLog(HStore.class); - - /* - * Regex that will work for straight filenames and for reference names. - * If reference, then the regex has more than just one group. Group 1 is - * this files id. Group 2 the referenced region name, etc. - */ - private static final Pattern REF_NAME_PARSER = - Pattern.compile("^(\\d+)(?:\\.(.+))?$"); - - protected final Memcache memcache; - private final Path basedir; - private final HRegionInfo info; - private final HColumnDescriptor family; - private final SequenceFile.CompressionType compression; - final FileSystem fs; - private final HBaseConfiguration conf; - // ttl in milliseconds. - protected long ttl; - private long majorCompactionTime; - private int maxFilesToCompact; - private final long desiredMaxFileSize; - private volatile long storeSize; - - private final Integer flushLock = new Integer(0); - - final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); - - final byte [] storeName; - private final String storeNameStr; - - /* - * Sorted Map of readers keyed by sequence id (Most recent should be last in - * in list). - */ - private final SortedMap storefiles = - Collections.synchronizedSortedMap(new TreeMap()); - - /* - * Sorted Map of readers keyed by sequence id (Most recent is last in list). - */ - private final SortedMap readers = - new TreeMap(); - - // The most-recent log-seq-ID that's present. 
The most-recent such ID means - // we can ignore all log messages up to and including that ID (because they're - // already reflected in the TreeMaps). - private volatile long maxSeqId; - - private final Path compactionDir; - private final Integer compactLock = new Integer(0); - private final int compactionThreshold; - - // All access must be synchronized. - private final CopyOnWriteArraySet changedReaderObservers = - new CopyOnWriteArraySet(); - - /** - * An HStore is a set of zero or more MapFiles, which stretch backwards over - * time. A given HStore is responsible for a certain set of columns for a - * row in the HRegion. - * - *
The HRegion starts writing to its set of HStores when the HRegion's - * memcache is flushed. This results in a round of new MapFiles, one for - * each HStore. - * - *
There's no reason to consider append-logging at this level; all logging - * and locking is handled at the HRegion level. HStore just provides - * services to manage sets of MapFiles. One of the most important of those - * services is MapFile-compaction services. - * - *
The only thing having to do with logs that HStore needs to deal with is - * the reconstructionLog. This is a segment of an HRegion's log that might - * NOT be present upon startup. If the param is NULL, there's nothing to do. - * If the param is non-NULL, we need to process the log to reconstruct - * a TreeMap that might not have been written to disk before the process - * died. - * - *
It's assumed that after this constructor returns, the reconstructionLog - * file will be deleted (by whoever has instantiated the HStore). - * - * @param basedir qualified path under which the region directory lives - * @param info HRegionInfo for this region - * @param family HColumnDescriptor for this column - * @param fs file system object - * @param reconstructionLog existing log file to apply if any - * @param conf configuration object - * @param reporter Call on a period so hosting server can report we're - * making progress to master -- otherwise master might think region deploy - * failed. Can be null. - * @throws IOException - */ - protected HStore(Path basedir, HRegionInfo info, HColumnDescriptor family, - FileSystem fs, Path reconstructionLog, HBaseConfiguration conf, - final Progressable reporter) - throws IOException { - this.basedir = basedir; - this.info = info; - this.family = family; - this.fs = fs; - this.conf = conf; - // getTimeToLive returns ttl in seconds. Convert to milliseconds. - this.ttl = family.getTimeToLive(); - if (ttl != HConstants.FOREVER) { - this.ttl *= 1000; - } - this.memcache = new Memcache(this.ttl, info); - this.compactionDir = HRegion.getCompactionDir(basedir); - this.storeName = Bytes.toBytes(this.info.getEncodedName() + "/" + - Bytes.toString(this.family.getName())); - this.storeNameStr = Bytes.toString(this.storeName); - - // By default, we compact if an HStore has more than - // MIN_COMMITS_FOR_COMPACTION map files - this.compactionThreshold = - conf.getInt("hbase.hstore.compactionThreshold", 3); - - // By default we split region if a file > DEFAULT_MAX_FILE_SIZE. - long maxFileSize = info.getTableDesc().getMaxFileSize(); - if (maxFileSize == HConstants.DEFAULT_MAX_FILE_SIZE) { - maxFileSize = conf.getLong("hbase.hregion.max.filesize", - HConstants.DEFAULT_MAX_FILE_SIZE); - } - this.desiredMaxFileSize = maxFileSize; - - this.majorCompactionTime = - conf.getLong(HConstants.MAJOR_COMPACTION_PERIOD, 86400000); - if (family.getValue(HConstants.MAJOR_COMPACTION_PERIOD) != null) { - String strCompactionTime = - family.getValue(HConstants.MAJOR_COMPACTION_PERIOD); - this.majorCompactionTime = (new Long(strCompactionTime)).longValue(); - } - - this.maxFilesToCompact = conf.getInt("hbase.hstore.compaction.max", 10); - this.storeSize = 0L; - - if (family.getCompression() == HColumnDescriptor.CompressionType.BLOCK) { - this.compression = SequenceFile.CompressionType.BLOCK; - } else if (family.getCompression() == - HColumnDescriptor.CompressionType.RECORD) { - this.compression = SequenceFile.CompressionType.RECORD; - } else { - this.compression = SequenceFile.CompressionType.NONE; - } - - Path mapdir = checkdir(HStoreFile.getMapDir(basedir, info.getEncodedName(), - family.getName())); - Path infodir = checkdir(HStoreFile.getInfoDir(basedir, info.getEncodedName(), - family.getName())); - - // Go through the 'mapdir' and 'infodir' together, make sure that all - // MapFiles are in a reliable state. Every entry in 'mapdir' must have a - // corresponding one in 'loginfodir'. Without a corresponding log info - // file, the entry in 'mapdir' must be deleted. - // loadHStoreFiles also computes the max sequence id internally. - this.maxSeqId = -1L; - this.storefiles.putAll(loadHStoreFiles(infodir, mapdir)); - if (LOG.isDebugEnabled() && this.storefiles.size() > 0) { - LOG.debug("Loaded " + this.storefiles.size() + " file(s) in hstore " + - Bytes.toString(this.storeName) + ", max sequence id " + this.maxSeqId); - } - - // Do reconstruction log. 
- runReconstructionLog(reconstructionLog, this.maxSeqId, reporter); - - // Finally, start up all the map readers! - setupReaders(); - } - - /* - * Setup the mapfile readers for this store. There could be more than one - * since we haven't compacted yet. - * @throws IOException - */ - private void setupReaders() throws IOException { - boolean first = true; - for(Map.Entry e: this.storefiles.entrySet()) { - MapFile.Reader r = null; - if (first) { - // Use a block cache (if configured) for the first reader only - // so as to control memory usage. - r = e.getValue().getReader(this.fs, this.family.isBloomfilter(), - family.isBlockCacheEnabled()); - first = false; - } else { - r = e.getValue().getReader(this.fs, this.family.isBloomfilter(), - false); - } - this.readers.put(e.getKey(), r); - } - } - - /* - * @param dir If doesn't exist, create it. - * @return Passed dir. - * @throws IOException - */ - private Path checkdir(final Path dir) throws IOException { - if (!fs.exists(dir)) { - fs.mkdirs(dir); - } - return dir; - } - - HColumnDescriptor getFamily() { - return this.family; - } - - long getMaxSequenceId() { - return this.maxSeqId; - } - - /* - * Run reconstuction log - * @param reconstructionLog - * @param msid - * @param reporter - * @throws IOException - */ - private void runReconstructionLog(final Path reconstructionLog, - final long msid, final Progressable reporter) - throws IOException { - try { - doReconstructionLog(reconstructionLog, msid, reporter); - } catch (EOFException e) { - // Presume we got here because of lack of HADOOP-1700; for now keep going - // but this is probably not what we want long term. If we got here there - // has been data-loss - LOG.warn("Exception processing reconstruction log " + reconstructionLog + - " opening " + this.storeName + - " -- continuing. Probably lack-of-HADOOP-1700 causing DATA LOSS!", e); - } catch (IOException e) { - // Presume we got here because of some HDFS issue. Don't just keep going. - // Fail to open the HStore. Probably means we'll fail over and over - // again until human intervention but alternative has us skipping logs - // and losing edits: HBASE-642. - LOG.warn("Exception processing reconstruction log " + reconstructionLog + - " opening " + this.storeName, e); - throw e; - } - } - - /* - * Read the reconstructionLog to see whether we need to build a brand-new - * MapFile out of non-flushed log entries. - * - * We can ignore any log message that has a sequence ID that's equal to or - * lower than maxSeqID. (Because we know such log messages are already - * reflected in the MapFiles.) - */ - @SuppressWarnings("unchecked") - private void doReconstructionLog(final Path reconstructionLog, - final long maxSeqID, final Progressable reporter) - throws UnsupportedEncodingException, IOException { - if (reconstructionLog == null || !fs.exists(reconstructionLog)) { - // Nothing to do. - return; - } - // Check its not empty. - FileStatus[] stats = fs.listStatus(reconstructionLog); - if (stats == null || stats.length == 0) { - LOG.warn("Passed reconstruction log " + reconstructionLog + " is zero-length"); - return; - } - long maxSeqIdInLog = -1; - TreeMap reconstructedCache = - new TreeMap(new HStoreKey.HStoreKeyWritableComparator(this.info)); - - SequenceFile.Reader logReader = new SequenceFile.Reader(this.fs, - reconstructionLog, this.conf); - - try { - HLogKey key = new HLogKey(); - HLogEdit val = new HLogEdit(); - long skippedEdits = 0; - long editsCount = 0; - // How many edits to apply before we send a progress report. 
- int reportInterval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); - while (logReader.next(key, val)) { - maxSeqIdInLog = Math.max(maxSeqIdInLog, key.getLogSeqNum()); - if (key.getLogSeqNum() <= maxSeqID) { - skippedEdits++; - continue; - } - // Check this edit is for me. Also, guard against writing - // METACOLUMN info such as HBASE::CACHEFLUSH entries - byte [] column = val.getColumn(); - if (val.isTransactionEntry() || Bytes.equals(column, HLog.METACOLUMN) - || !Bytes.equals(key.getRegionName(), info.getRegionName()) - || !HStoreKey.matchingFamily(family.getName(), column)) { - continue; - } - HStoreKey k = new HStoreKey(key.getRow(), column, val.getTimestamp(), - this.info); - reconstructedCache.put(k, val.getVal()); - editsCount++; - // Every 2k edits, tell the reporter we're making progress. - // Have seen 60k edits taking 3minutes to complete. - if (reporter != null && (editsCount % reportInterval) == 0) { - reporter.progress(); - } - } - if (LOG.isDebugEnabled()) { - LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits + - " because sequence id <= " + maxSeqID); - } - } finally { - logReader.close(); - } - - if (reconstructedCache.size() > 0) { - // We create a "virtual flush" at maxSeqIdInLog+1. - if (LOG.isDebugEnabled()) { - LOG.debug("flushing reconstructionCache"); - } - internalFlushCache(reconstructedCache, maxSeqIdInLog + 1); - } - } - - /* - * Creates a series of HStoreFiles loaded from the given directory. - * There must be a matching 'mapdir' and 'loginfo' pair of files. - * If only one exists, we'll delete it. Does other consistency tests - * checking files are not zero, etc. - * - * @param infodir qualified path for info file directory - * @param mapdir qualified path for map file directory - * @throws IOException - */ - private SortedMap loadHStoreFiles(Path infodir, Path mapdir) - throws IOException { - // Look first at info files. If a reference, these contain info we need - // to create the HStoreFile. - FileStatus infofiles[] = fs.listStatus(infodir); - SortedMap results = new TreeMap(); - ArrayList mapfiles = new ArrayList(infofiles.length); - for (int i = 0; i < infofiles.length; i++) { - Path p = infofiles[i].getPath(); - // Check for empty info file. Should never be the case but can happen - // after data loss in hdfs for whatever reason (upgrade, etc.): HBASE-646 - if (this.fs.getFileStatus(p).getLen() <= 0) { - LOG.warn("Skipping " + p + " because its empty. DATA LOSS? Can " + - "this scenario be repaired? HBASE-646"); - continue; - } - - Matcher m = REF_NAME_PARSER.matcher(p.getName()); - /* - * * * * * N O T E * * * * * - * - * We call isReference(Path, Matcher) here because it calls - * Matcher.matches() which must be called before Matcher.group(int) - * and we don't want to call Matcher.matches() twice. - * - * * * * * N O T E * * * * * - */ - boolean isReference = isReference(p, m); - long fid = Long.parseLong(m.group(1)); - - HStoreFile curfile = null; - Reference reference = null; - if (isReference) { - reference = HStoreFile.readSplitInfo(p, fs); - } - curfile = new HStoreFile(conf, fs, basedir, this.info, - family.getName(), fid, reference); - long storeSeqId = -1; - try { - storeSeqId = curfile.loadInfo(fs); - if (storeSeqId > this.maxSeqId) { - this.maxSeqId = storeSeqId; - } - } catch (IOException e) { - // If the HSTORE_LOGINFOFILE doesn't contain a number, just ignore it. - // That means it was built prior to the previous run of HStore, and so - // it cannot contain any updates also contained in the log. 
- LOG.info("HSTORE_LOGINFOFILE " + curfile + - " does not contain a sequence number - ignoring"); - } - Path mapfile = curfile.getMapFilePath(); - if (!fs.exists(mapfile)) { - fs.delete(curfile.getInfoFilePath(), false); - LOG.warn("Mapfile " + mapfile.toString() + " does not exist. " + - "Cleaned up info file. Continuing...Probable DATA LOSS!!!"); - continue; - } - // References don't have data or index components under mapfile. - if (!isReference && isEmptyDataFile(mapfile)) { - curfile.delete(); - // We can have empty data file if data loss in hdfs. - LOG.warn("Mapfile " + mapfile.toString() + " has empty data. " + - "Deleting. Continuing...Probable DATA LOSS!!! See HBASE-646."); - continue; - } - if (!isReference && isEmptyIndexFile(mapfile)) { - try { - // Try fixing this file.. if we can. Use the hbase version of fix. - // Need to remove the old index file first else fix won't go ahead. - this.fs.delete(new Path(mapfile, MapFile.INDEX_FILE_NAME), false); - // TODO: This is going to fail if we are to rebuild a file from - // meta because it won't have right comparator: HBASE-848. - long count = MapFile.fix(this.fs, mapfile, HStoreKey.class, - HBaseMapFile.VALUE_CLASS, false, this.conf); - if (LOG.isDebugEnabled()) { - LOG.debug("Fixed index on " + mapfile.toString() + "; had " + - count + " entries"); - } - } catch (Exception e) { - LOG.warn("Failed fix of " + mapfile.toString() + - "...continuing; Probable DATA LOSS!!!", e); - continue; - } - } - long length = curfile.length(); - storeSize += length; - - // TODO: Confirm referent exists. - - // Found map and sympathetic info file. Add this hstorefile to result. - if (LOG.isDebugEnabled()) { - LOG.debug("loaded " + FSUtils.getPath(p) + ", isReference=" + - isReference + ", sequence id=" + storeSeqId + - ", length=" + length + ", majorCompaction=" + - curfile.isMajorCompaction()); - } - results.put(Long.valueOf(storeSeqId), curfile); - // Keep list of sympathetic data mapfiles for cleaning info dir in next - // section. Make sure path is fully qualified for compare. - mapfiles.add(this.fs.makeQualified(mapfile)); - } - cleanDataFiles(mapfiles, mapdir); - return results; - } - - /* - * If no info file delete the sympathetic data file. - * @param mapfiles List of mapfiles. - * @param mapdir Directory to check. - * @throws IOException - */ - private void cleanDataFiles(final List mapfiles, final Path mapdir) - throws IOException { - // List paths by experience returns fully qualified names -- at least when - // running on a mini hdfs cluster. - FileStatus [] datfiles = fs.listStatus(mapdir); - for (int i = 0; i < datfiles.length; i++) { - Path p = datfiles[i].getPath(); - // If does not have sympathetic info file, delete. - Path qualifiedP = fs.makeQualified(p); - if (!mapfiles.contains(qualifiedP)) { - fs.delete(p, true); - } - } - } - - /* - * @param mapfile - * @return True if the passed mapfile has a zero-length data component (its - * broken). - * @throws IOException - */ - private boolean isEmptyDataFile(final Path mapfile) - throws IOException { - // Mapfiles are made of 'data' and 'index' files. Confirm 'data' is - // non-null if it exists (may not have been written to yet). - return isEmptyFile(new Path(mapfile, MapFile.DATA_FILE_NAME)); - } - - /* - * @param mapfile - * @return True if the passed mapfile has a zero-length index component (its - * broken). - * @throws IOException - */ - private boolean isEmptyIndexFile(final Path mapfile) - throws IOException { - // Mapfiles are made of 'data' and 'index' files. 
Confirm 'data' is - // non-null if it exists (may not have been written to yet). - return isEmptyFile(new Path(mapfile, MapFile.INDEX_FILE_NAME)); - } - - /* - * @param f - * @return True if the passed file does not exist or is zero-length (its - * broken). - * @throws IOException - */ - private boolean isEmptyFile(final Path f) - throws IOException { - return !this.fs.exists(f) || this.fs.getFileStatus(f).getLen() == 0; - } - - /** - * Adds a value to the memcache - * - * @param key - * @param value - * @return memcache size delta - */ - protected long add(HStoreKey key, byte[] value) { - lock.readLock().lock(); - try { - return this.memcache.add(key, value); - } finally { - lock.readLock().unlock(); - } - } - - /** - * Close all the MapFile readers - * - * We don't need to worry about subsequent requests because the HRegion holds - * a write lock that will prevent any more reads or writes. - * - * @throws IOException - */ - List close() throws IOException { - ArrayList result = null; - this.lock.writeLock().lock(); - try { - for (MapFile.Reader reader: this.readers.values()) { - reader.close(); - } - synchronized (this.storefiles) { - result = new ArrayList(storefiles.values()); - } - LOG.debug("closed " + this.storeNameStr); - return result; - } finally { - this.lock.writeLock().unlock(); - } - } - - ////////////////////////////////////////////////////////////////////////////// - // Flush changes to disk - ////////////////////////////////////////////////////////////////////////////// - - /** - * Snapshot this stores memcache. Call before running - * {@link #flushCache(long)} so it has some work to do. - */ - void snapshot() { - this.memcache.snapshot(); - } - - /** - * Write out current snapshot. Presumes {@link #snapshot()} has been called - * previously. - * @param logCacheFlushId flush sequence number - * @return true if a compaction is needed - * @throws IOException - */ - boolean flushCache(final long logCacheFlushId) throws IOException { - // Get the snapshot to flush. Presumes that a call to - // this.memcache.snapshot() has happened earlier up in the chain. - SortedMap cache = this.memcache.getSnapshot(); - boolean compactionNeeded = internalFlushCache(cache, logCacheFlushId); - // If an exception happens flushing, we let it out without clearing - // the memcache snapshot. The old snapshot will be returned when we say - // 'snapshot', the next time flush comes around. - this.memcache.clearSnapshot(cache); - return compactionNeeded; - } - - private boolean internalFlushCache(final SortedMap cache, - final long logCacheFlushId) - throws IOException { - long flushed = 0; - // Don't flush if there are no entries. - if (cache.size() == 0) { - return false; - } - - // TODO: We can fail in the below block before we complete adding this - // flush to list of store files. Add cleanup of anything put on filesystem - // if we fail. - synchronized(flushLock) { - long now = System.currentTimeMillis(); - // A. Write the Maps out to the disk - HStoreFile flushedFile = new HStoreFile(conf, fs, basedir, - this.info, family.getName(), -1L, null); - MapFile.Writer out = flushedFile.getWriter(this.fs, this.compression, - this.family.isBloomfilter(), cache.size()); - setIndexInterval(out); - - // Here we tried picking up an existing HStoreFile from disk and - // interlacing the memcache flush compacting as we go. The notion was - // that interlacing would take as long as a pure flush with the added - // benefit of having one less file in the store. 
Experiments showed that - // it takes two to three times the amount of time flushing -- more column - // families makes it so the two timings come closer together -- but it - // also complicates the flush. The code was removed. Needed work picking - // which file to interlace (favor references first, etc.) - // - // Related, looks like 'merging compactions' in BigTable paper interlaces - // a memcache flush. We don't. - int entries = 0; - try { - for (Map.Entry es: cache.entrySet()) { - HStoreKey curkey = es.getKey(); - byte[] bytes = es.getValue(); - if (HStoreKey.matchingFamily(this.family.getName(), curkey.getColumn())) { - if (!isExpired(curkey, ttl, now)) { - entries++; - out.append(curkey, new ImmutableBytesWritable(bytes)); - flushed += this.memcache.heapSize(curkey, bytes, null); - } - } - } - } finally { - out.close(); - } - long newStoreSize = flushedFile.length(); - storeSize += newStoreSize; - - // B. Write out the log sequence number that corresponds to this output - // MapFile. The MapFile is current up to and including the log seq num. - flushedFile.writeInfo(fs, logCacheFlushId); - - // C. Finally, make the new MapFile available. - updateReaders(logCacheFlushId, flushedFile); - if(LOG.isDebugEnabled()) { - LOG.debug("Added " + FSUtils.getPath(flushedFile.getMapFilePath()) + - " with " + entries + - " entries, sequence id " + logCacheFlushId + ", data size ~" + - StringUtils.humanReadableInt(flushed) + ", file size " + - StringUtils.humanReadableInt(newStoreSize) + " to " + - this.info.getRegionNameAsString()); - } - } - return storefiles.size() >= compactionThreshold; - } - - /* - * Change readers adding into place the Reader produced by this new flush. - * @param logCacheFlushId - * @param flushedFile - * @throws IOException - */ - private void updateReaders(final long logCacheFlushId, - final HStoreFile flushedFile) - throws IOException { - this.lock.writeLock().lock(); - try { - Long flushid = Long.valueOf(logCacheFlushId); - // Open the map file reader. - this.readers.put(flushid, - flushedFile.getReader(this.fs, this.family.isBloomfilter(), - this.family.isBlockCacheEnabled())); - this.storefiles.put(flushid, flushedFile); - // Tell listeners of the change in readers. - notifyChangedReadersObservers(); - } finally { - this.lock.writeLock().unlock(); - } - } - - /* - * Notify all observers that set of Readers has changed. - * @throws IOException - */ - private void notifyChangedReadersObservers() throws IOException { - for (ChangedReadersObserver o: this.changedReaderObservers) { - o.updateReaders(); - } - } - - /* - * @param o Observer who wants to know about changes in set of Readers - */ - void addChangedReaderObserver(ChangedReadersObserver o) { - this.changedReaderObservers.add(o); - } - - /* - * @param o Observer no longer interested in changes in set of Readers. - */ - void deleteChangedReaderObserver(ChangedReadersObserver o) { - if (!this.changedReaderObservers.remove(o)) { - LOG.warn("Not in set" + o); - } - } - - ////////////////////////////////////////////////////////////////////////////// - // Compaction - ////////////////////////////////////////////////////////////////////////////// - - /* - * @param files - * @return True if any of the files in files are References. 
- */ - private boolean hasReferences(Collection files) { - if (files != null && files.size() > 0) { - for (HStoreFile hsf: files) { - if (hsf.isReference()) { - return true; - } - } - } - return false; - } - - /* - * Gets lowest timestamp from files in a dir - * - * @param fs - * @param dir - * @throws IOException - */ - private static long getLowestTimestamp(FileSystem fs, Path dir) - throws IOException { - FileStatus[] stats = fs.listStatus(dir); - if (stats == null || stats.length == 0) { - return 0l; - } - long lowTimestamp = Long.MAX_VALUE; - for (int i = 0; i < stats.length; i++) { - long timestamp = stats[i].getModificationTime(); - if (timestamp < lowTimestamp){ - lowTimestamp = timestamp; - } - } - return lowTimestamp; - } - - /** - * Compact the back-HStores. This method may take some time, so the calling - * thread must be able to block for long periods. - * - *
During this time, the HStore can work as usual, getting values from - * MapFiles and writing new MapFiles from the Memcache. - * - * Existing MapFiles are not destroyed until the new compacted TreeMap is - * completely written-out to disk. - * - * The compactLock prevents multiple simultaneous compactions. - * The structureLock prevents us from interfering with other write operations. - * - * We don't want to hold the structureLock for the whole time, as a compact() - * can be lengthy and we want to allow cache-flushes during this period. - * - * @param majorCompaction True to force a major compaction regardless of - * thresholds - * @return mid key if a split is needed, null otherwise - * @throws IOException - */ - StoreSize compact(final boolean majorCompaction) throws IOException { - boolean forceSplit = this.info.shouldSplit(false); - boolean doMajorCompaction = majorCompaction; - synchronized (compactLock) { - long maxId = -1; - List filesToCompact = null; - synchronized (storefiles) { - if (this.storefiles.size() <= 0) { - LOG.debug(this.storeNameStr + ": no store files to compact"); - return null; - } - // filesToCompact are sorted oldest to newest. - filesToCompact = new ArrayList(this.storefiles.values()); - - // The max-sequenceID in any of the to-be-compacted TreeMaps is the - // last key of storefiles. - maxId = this.storefiles.lastKey().longValue(); - } - // Check to see if we need to do a major compaction on this region. - // If so, change doMajorCompaction to true to skip the incremental - // compacting below. Only check if doMajorCompaction is not true. - if (!doMajorCompaction) { - doMajorCompaction = isMajorCompaction(filesToCompact); - } - boolean references = hasReferences(filesToCompact); - if (!doMajorCompaction && !references && - (forceSplit || (filesToCompact.size() < compactionThreshold))) { - return checkSplit(forceSplit); - } - if (!fs.exists(compactionDir) && !fs.mkdirs(compactionDir)) { - LOG.warn("Mkdir on " + compactionDir.toString() + " failed"); - return checkSplit(forceSplit); - } - - // HBASE-745, preparing all store file sizes for incremental compacting - // selection. - int countOfFiles = filesToCompact.size(); - long totalSize = 0; - long[] fileSizes = new long[countOfFiles]; - long skipped = 0; - int point = 0; - for (int i = 0; i < countOfFiles; i++) { - HStoreFile file = filesToCompact.get(i); - Path path = file.getMapFilePath(); - if (path == null) { - LOG.warn("Path is null for " + file); - return null; - } - int len = 0; - // listStatus can come back null. - FileStatus [] fss = this.fs.listStatus(path); - for (int ii = 0; fss != null && ii < fss.length; ii++) { - len += fss[ii].getLen(); - } - fileSizes[i] = len; - totalSize += len; - } - if (!doMajorCompaction && !references) { - // Here we select files for incremental compaction. - // The rule is: if the largest(oldest) one is more than twice the - // size of the second, skip the largest, and continue to next..., - // until we meet the compactionThreshold limit. 
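// Illustration of the selection rule above: with store file sizes [500, 200, 90, 80] (oldest to newest) and the default hbase.hstore.compaction.max of 10, the loop below skips 500 and 200 (each at least twice the size of the next file) and selects only [90, 80] for compaction.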
- for (point = 0; point < countOfFiles - 1; point++) { - if ((fileSizes[point] < fileSizes[point + 1] * 2) && - (countOfFiles - point) <= maxFilesToCompact) { - break; - } - skipped += fileSizes[point]; - } - filesToCompact = new ArrayList(filesToCompact.subList(point, - countOfFiles)); - if (filesToCompact.size() <= 1) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipped compaction of 1 file; compaction size of " + - this.storeNameStr + ": " + - StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + - " files, size: " + skipped); - } - return checkSplit(forceSplit); - } - if (LOG.isDebugEnabled()) { - LOG.debug("Compaction size of " + this.storeNameStr + ": " + - StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + - " file(s), size: " + skipped); - } - } - - /* - * We create a new list of MapFile.Reader objects so we don't screw up - * the caching associated with the currently-loaded ones. Our iteration- - * based access pattern is practically designed to ruin the cache. - */ - List rdrs = new ArrayList(); - int nrows = createReaders(rdrs, filesToCompact); - - // Step through them, writing to the brand-new MapFile - HStoreFile compactedOutputFile = new HStoreFile(conf, fs, - this.compactionDir, this.info, family.getName(), -1L, null); - if (LOG.isDebugEnabled()) { - LOG.debug("Started compaction of " + rdrs.size() + " file(s)" + - (references? ", hasReferences=true,": " ") + " into " + - FSUtils.getPath(compactedOutputFile.getMapFilePath())); - } - MapFile.Writer writer = compactedOutputFile.getWriter(this.fs, - this.compression, this.family.isBloomfilter(), nrows); - setIndexInterval(writer); - try { - compact(writer, rdrs, doMajorCompaction); - } finally { - writer.close(); - } - - // Now, write out an HSTORE_LOGINFOFILE for the brand-new TreeMap. - compactedOutputFile.writeInfo(fs, maxId, doMajorCompaction); - - // Move the compaction into place. - completeCompaction(filesToCompact, compactedOutputFile); - if (LOG.isDebugEnabled()) { - LOG.debug("Completed " + (doMajorCompaction? "major": "") + - " compaction of " + this.storeNameStr + - " store size is " + StringUtils.humanReadableInt(storeSize)); - } - } - return checkSplit(forceSplit); - } - - /* - * Set the index interval for the mapfile. There are two sources for - * configuration information: the HCD, and the global HBase config. - * If a source returns the default value, it is ignored. Otherwise, - * the smallest non-default value is preferred. - */ - private void setIndexInterval(MapFile.Writer writer) { - int familyInterval = this.family.getMapFileIndexInterval(); - int interval = this.conf.getInt("hbase.io.index.interval", - HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL); - if (familyInterval != HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL) { - if (interval != HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL) { - if (familyInterval < interval) - interval = familyInterval; - } else { - interval = familyInterval; - } - } - writer.setIndexInterval(interval); - } - - /* - * @return True if we should run a major compaction. - */ - boolean isMajorCompaction() throws IOException { - List filesToCompact = null; - synchronized (storefiles) { - // filesToCompact are sorted oldest to newest. - filesToCompact = new ArrayList(this.storefiles.values()); - } - return isMajorCompaction(filesToCompact); - } - - /* - * @param filesToCompact Files to compact. Can be null. - * @return True if we should run a major compaction. 
- */ - private boolean isMajorCompaction(final List filesToCompact) - throws IOException { - boolean result = false; - Path mapdir = HStoreFile.getMapDir(this.basedir, this.info.getEncodedName(), - this.family.getName()); - long lowTimestamp = getLowestTimestamp(fs, mapdir); - long now = System.currentTimeMillis(); - if (lowTimestamp > 0l && lowTimestamp < (now - this.majorCompactionTime)) { - // Major compaction time has elapsed. - long elapsedTime = now - lowTimestamp; - if (filesToCompact != null && filesToCompact.size() == 1 && - filesToCompact.get(0).isMajorCompaction() && - (this.ttl == HConstants.FOREVER || elapsedTime < this.ttl)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping major compaction of " + this.storeNameStr + - " because one (major) compacted file only and elapsedTime " + - elapsedTime + "ms is < ttl=" + this.ttl); - } - } else { - if (LOG.isDebugEnabled()) { - LOG.debug("Major compaction triggered on store " + this.storeNameStr + - "; time since last major compaction " + (now - lowTimestamp) + "ms"); - } - result = true; - } - } - return result; - } - - /* - * Create readers for the passed in list of HStoreFiles and add them to - * readers list. Used compacting. - * @param readers Add Readers here. - * @param files List of HSFs to make Readers for. - * @return Count of rows for bloom filter sizing. Returns -1 if no bloom - * filter wanted. - */ - private int createReaders(final List rs, - final List files) - throws IOException { - /* We create a new list of MapFile.Reader objects so we don't screw up - * the caching associated with the currently-loaded ones. Our iteration- - * based access pattern is practically designed to ruin the cache. - */ - int nrows = -1; - for (HStoreFile file: files) { - try { - // TODO: Readers are opened without block-cache enabled. Means we don't - // get the prefetch that makes the read faster. But we don't want to - // enable block-cache for these readers that are about to be closed. - // The compaction of soon-to-be closed readers will probably force out - // blocks that may be needed servicing real-time requests whereas - // compaction runs in background. TODO: We know we're going to read - // this file straight through. Leverage this fact. Use a big buffer - // client side to speed things up or read it all up into memory one file - // at a time or pull local and memory-map the file but leave the writer - // up in hdfs? - BloomFilterMapFile.Reader reader = file.getReader(fs, false, false); - rs.add(reader); - // Compute the size of the new bloomfilter if needed - if (this.family.isBloomfilter()) { - nrows += reader.getBloomFilterSize(); - } - } catch (IOException e) { - // Add info about which file threw exception. It may not be in the - // exception message so output a message here where we know the - // culprit. 
- LOG.warn("Failed with " + e.toString() + ": " + file.toString()); - closeCompactionReaders(rs); - throw e; - } - } - return nrows; - } - - /* - * @param r List to reverse - * @return A reversed array of content of readers - */ - private MapFile.Reader [] reverse(final List r) { - List copy = new ArrayList(r); - Collections.reverse(copy); - return copy.toArray(new MapFile.Reader[0]); - } - - /* - * @param rdrs List of readers - * @param keys Current keys - * @param done Which readers are done - * @return The lowest current key in passed rdrs - */ - private int getLowestKey(final MapFile.Reader [] rdrs, - final HStoreKey [] keys, final boolean [] done) { - int lowestKey = -1; - for (int i = 0; i < rdrs.length; i++) { - if (done[i]) { - continue; - } - if (lowestKey < 0) { - lowestKey = i; - } else { - if (keys[i].compareTo(keys[lowestKey]) < 0) { - lowestKey = i; - } - } - } - return lowestKey; - } - - /* - * Compact a list of MapFile.Readers into MapFile.Writer. - * - * We work by iterating through the readers in parallel looking at newest - * store file first. We always increment the lowest-ranked one. Updates to a - * single row/column will appear ranked by timestamp. - * @param compactedOut Where to write compaction. - * @param pReaders List of readers sorted oldest to newest. - * @param majorCompaction True to force a major compaction regardless of - * thresholds - * @throws IOException - */ - private void compact(final MapFile.Writer compactedOut, - final List pReaders, final boolean majorCompaction) - throws IOException { - // Reverse order so newest store file is first. - MapFile.Reader[] rdrs = reverse(pReaders); - try { - HStoreKey [] keys = new HStoreKey[rdrs.length]; - ImmutableBytesWritable [] vals = new ImmutableBytesWritable[rdrs.length]; - boolean [] done = new boolean[rdrs.length]; - for(int i = 0; i < rdrs.length; i++) { - keys[i] = new HStoreKey(HConstants.EMPTY_BYTE_ARRAY, this.info); - vals[i] = new ImmutableBytesWritable(); - done[i] = false; - } - - // Now, advance through the readers in order. This will have the - // effect of a run-time sort of the entire dataset. - int numDone = 0; - for (int i = 0; i < rdrs.length; i++) { - rdrs[i].reset(); - done[i] = !rdrs[i].next(keys[i], vals[i]); - if (done[i]) { - numDone++; - } - } - - long now = System.currentTimeMillis(); - int timesSeen = 0; - HStoreKey lastSeen = new HStoreKey(); - HStoreKey lastDelete = null; - while (numDone < done.length) { - // Get lowest key in all store files. - int lowestKey = getLowestKey(rdrs, keys, done); - HStoreKey sk = keys[lowestKey]; - // If its same row and column as last key, increment times seen. - if (HStoreKey.equalsTwoRowKeys(info, lastSeen.getRow(), sk.getRow()) - && Bytes.equals(lastSeen.getColumn(), sk.getColumn())) { - timesSeen++; - // Reset last delete if not exact timestamp -- lastDelete only stops - // exactly the same key making it out to the compacted store file. - if (lastDelete != null && - lastDelete.getTimestamp() != sk.getTimestamp()) { - lastDelete = null; - } - } else { - timesSeen = 1; - lastDelete = null; - } - - // Don't write empty rows or columns. Only remove cells on major - // compaction. Remove if expired of > VERSIONS - if (sk.getRow().length != 0 && sk.getColumn().length != 0) { - ImmutableBytesWritable value = vals[lowestKey]; - if (!majorCompaction) { - // Write out all values if not a major compaction. 
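// In other words: a minor compaction copies every cell through unchanged, while the major-compaction branch below drops delete markers and any cell that exceeds the family's maximum versions, has outlived the TTL, or is masked by a delete.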
- compactedOut.append(sk, value); - } else { - boolean expired = false; - boolean deleted = false; - if (timesSeen <= family.getMaxVersions() && - !(expired = isExpired(sk, ttl, now))) { - // If this value key is same as a deleted key, skip - if (lastDelete != null && sk.equals(lastDelete)) { - deleted = true; - } else if (HLogEdit.isDeleted(value.get())) { - // If a deleted value, skip - deleted = true; - lastDelete = new HStoreKey(sk); - } else { - compactedOut.append(sk, vals[lowestKey]); - } - } - if (expired || deleted) { - // HBASE-855 remove one from timesSeen because it did not make it - // past expired check -- don't count against max versions. - timesSeen--; - } - } - } - - // Update last-seen items - lastSeen = new HStoreKey(sk); - - // Advance the smallest key. If that reader's all finished, then - // mark it as done. - if (!rdrs[lowestKey].next(keys[lowestKey], vals[lowestKey])) { - done[lowestKey] = true; - rdrs[lowestKey].close(); - rdrs[lowestKey] = null; - numDone++; - } - } - } finally { - closeCompactionReaders(Arrays.asList(rdrs)); - } - } - - private void closeCompactionReaders(final List rdrs) { - for (MapFile.Reader r: rdrs) { - try { - if (r != null) { - r.close(); - } - } catch (IOException e) { - LOG.warn("Exception closing reader for " + this.storeNameStr, e); - } - } - } - - /* - * It's assumed that the compactLock will be acquired prior to calling this - * method! Otherwise, it is not thread-safe! - * - * It works by processing a compaction that's been written to disk. - * - *

It is usually invoked at the end of a compaction, but might also be - * invoked at HStore startup, if the prior execution died midway through. - * - *

Moving the compacted TreeMap into place means: - *

-   * 1) Moving the new compacted MapFile into place
-   * 2) Unloading all replaced MapFiles, closing them and collecting the list of files to delete.
-   * 3) Loading the new TreeMap.
-   * 4) Computing the new store size (a condensed sketch of steps 2 and 3 follows this list)
-   * 
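A condensed, generic sketch of the ordering the list above relies on: the replaced readers are closed and unloaded, the compacted file is published, observers are notified, and only then are the old files deleted. Every name and type below is an illustrative stand-in, not an actual HStore member.

    import java.io.Closeable;
    import java.io.IOException;
    import java.util.List;
    import java.util.SortedMap;

    class CompactionSwapSketch<F, R extends Closeable> {
      // files and readers are keyed by sequence id, as in HStore.
      void swapCompactedFile(SortedMap<Long, F> files, SortedMap<Long, R> readers,
          List<Long> replacedKeys, long newKey, F newFile, R newReader,
          Runnable notifyObservers, Runnable deleteOldFiles) throws IOException {
        synchronized (files) {
          for (Long k : replacedKeys) {
            R r = readers.remove(k);
            if (r != null) {
              r.close();                  // step 2: unload and close replaced readers
            }
            files.remove(k);
          }
          readers.put(newKey, newReader); // step 3: publish the compacted file
          files.put(newKey, newFile);
          notifyObservers.run();          // tell scanners the reader set changed...
          deleteOldFiles.run();           // ...before the replaced files are removed
        }
      }
    }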
- * - * @param compactedFiles list of files that were compacted - * @param compactedFile HStoreFile that is the result of the compaction - * @throws IOException - */ - private void completeCompaction(final List compactedFiles, - final HStoreFile compactedFile) - throws IOException { - this.lock.writeLock().lock(); - try { - // 1. Moving the new MapFile into place. - HStoreFile finalCompactedFile = new HStoreFile(conf, fs, basedir, - this.info, family.getName(), -1, null, - compactedFile.isMajorCompaction()); - if (LOG.isDebugEnabled()) { - LOG.debug("moving " + FSUtils.getPath(compactedFile.getMapFilePath()) + - " to " + FSUtils.getPath(finalCompactedFile.getMapFilePath())); - } - if (!compactedFile.rename(this.fs, finalCompactedFile)) { - LOG.error("Failed move of compacted file " + - finalCompactedFile.getMapFilePath().toString()); - return; - } - - // 2. Unload all replaced MapFiles, close and collect list to delete. - synchronized (storefiles) { - Map toDelete = new HashMap(); - for (Map.Entry e : this.storefiles.entrySet()) { - if (!compactedFiles.contains(e.getValue())) { - continue; - } - Long key = e.getKey(); - MapFile.Reader reader = this.readers.remove(key); - if (reader != null) { - reader.close(); - } - toDelete.put(key, e.getValue()); - } - - try { - // 3. Loading the new TreeMap. - // Change this.storefiles so it reflects new state but do not - // delete old store files until we have sent out notification of - // change in case old files are still being accessed by outstanding - // scanners. - for (Long key : toDelete.keySet()) { - this.storefiles.remove(key); - } - // Add new compacted Reader and store file. - Long orderVal = Long.valueOf(finalCompactedFile.loadInfo(fs)); - this.readers.put(orderVal, - // Use a block cache (if configured) for this reader since - // it is the only one. - finalCompactedFile.getReader(this.fs, - this.family.isBloomfilter(), - this.family.isBlockCacheEnabled())); - this.storefiles.put(orderVal, finalCompactedFile); - // Tell observers that list of Readers has changed. - notifyChangedReadersObservers(); - // Finally, delete old store files. - for (HStoreFile hsf : toDelete.values()) { - hsf.delete(); - } - } catch (IOException e) { - e = RemoteExceptionHandler.checkIOException(e); - LOG.error("Failed replacing compacted files for " + - this.storeNameStr + - ". Compacted file is " + finalCompactedFile.toString() + - ". Files replaced are " + compactedFiles.toString() + - " some of which may have been already removed", e); - } - // 4. Compute new store size - storeSize = 0L; - for (HStoreFile hsf : storefiles.values()) { - storeSize += hsf.length(); - } - } - } finally { - this.lock.writeLock().unlock(); - } - } - - // //////////////////////////////////////////////////////////////////////////// - // Accessors. - // (This is the only section that is directly useful!) - ////////////////////////////////////////////////////////////////////////////// - - /** - * Return all the available columns for the given key. The key indicates a - * row and timestamp, but not a column name. - * - * The returned object should map column names to Cells. - */ - void getFull(HStoreKey key, final Set columns, - final int numVersions, Map results) - throws IOException { - int versions = versionsToReturn(numVersions); - - Map deletes = - new TreeMap(Bytes.BYTES_COMPARATOR); - - // if the key is null, we're not even looking for anything. return. - if (key == null) { - return; - } - - this.lock.readLock().lock(); - - // get from the memcache first. 
- memcache.getFull(key, columns, versions, deletes, results); - - try { - MapFile.Reader[] maparray = getReaders(); - - // examine each mapfile - for (int i = maparray.length - 1; i >= 0; i--) { - MapFile.Reader map = maparray[i]; - - // synchronize on the map so that no one else iterates it at the same - // time - getFullFromMapFile(map, key, columns, versions, deletes, results); - } - } finally { - this.lock.readLock().unlock(); - } - } - - private void getFullFromMapFile(MapFile.Reader map, HStoreKey key, - Set columns, int numVersions, Map deletes, - Map results) - throws IOException { - synchronized(map) { - long now = System.currentTimeMillis(); - - // seek back to the beginning - map.reset(); - - // seek to the closest key that should match the row we're looking for - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)map.getClosest(key, readval); - if (readkey == null) { - return; - } - - do { - byte [] readcol = readkey.getColumn(); - - // if we're looking for this column (or all of them), and there isn't - // already a value for this column in the results map or there is a value - // but we haven't collected enough versions yet, and the key we - // just read matches, then we'll consider it - if ((columns == null || columns.contains(readcol)) - && (!results.containsKey(readcol) - || results.get(readcol).getNumValues() < numVersions) - && key.matchesWithoutColumn(readkey)) { - // if the value of the cell we're looking at right now is a delete, - // we need to treat it differently - if(HLogEdit.isDeleted(readval.get())) { - // if it's not already recorded as a delete or recorded with a more - // recent delete timestamp, record it for later - if (!deletes.containsKey(readcol) - || deletes.get(readcol).longValue() < readkey.getTimestamp()) { - deletes.put(readcol, Long.valueOf(readkey.getTimestamp())); - } - } else if (!(deletes.containsKey(readcol) - && deletes.get(readcol).longValue() >= readkey.getTimestamp()) ) { - // So the cell itself isn't a delete, but there may be a delete - // pending from earlier in our search. Only record this result if - // there aren't any pending deletes. - if (!(deletes.containsKey(readcol) && - deletes.get(readcol).longValue() >= readkey.getTimestamp())) { - if (!isExpired(readkey, ttl, now)) { - if (!results.containsKey(readcol)) { - results.put(readcol, - new Cell(readval.get(), readkey.getTimestamp())); - } else { - results.get(readcol).add(readval.get(), - readkey.getTimestamp()); - } - // need to reinstantiate the readval so we can reuse it, - // otherwise next iteration will destroy our result - readval = new ImmutableBytesWritable(); - } - } - } - } else if (HStoreKey.compareTwoRowKeys(info,key.getRow(), readkey.getRow()) < 0) { - // if we've crossed into the next row, then we can just stop - // iterating - break; - } - - } while(map.next(readkey, readval)); - } - } - - /** - * @return Array of readers ordered oldest to newest. - */ - public MapFile.Reader [] getReaders() { - return this.readers.values(). - toArray(new MapFile.Reader[this.readers.size()]); - } - - /* - * @param wantedVersions How many versions were asked for. - * @return wantedVersions or this families' MAX_VERSIONS. - */ - private int versionsToReturn(final int wantedVersions) { - if (wantedVersions <= 0) { - throw new IllegalArgumentException("Number of versions must be > 0"); - } - // Make sure we do not return more than maximum versions for this store. - return wantedVersions > this.family.getMaxVersions()? 
- this.family.getMaxVersions(): wantedVersions; - } - - /** - * Get the value for the indicated HStoreKey. Grab the target value and the - * previous numVersions - 1 values, as well. - * - * Use {@link HConstants.ALL_VERSIONS} to retrieve all versions. - * @param key - * @param numVersions Number of versions to fetch. Must be > 0. - * @return values for the specified versions - * @throws IOException - */ - Cell[] get(final HStoreKey key, final int numVersions) throws IOException { - // This code below is very close to the body of the getKeys method. Any - // changes in the flow below should also probably be done in getKeys. - // TODO: Refactor so same code used. - long now = System.currentTimeMillis(); - int versions = versionsToReturn(numVersions); - // Keep a list of deleted cell keys. We need this because as we go through - // the memcache and store files, the cell with the delete marker may be - // in one store and the old non-delete cell value in a later store. - // If we don't keep around the fact that the cell was deleted in a newer - // record, we end up returning the old value if user is asking for more - // than one version. This List of deletes should not be large since we - // are only keeping rows and columns that match those set on the get and - // which have delete values. If memory usage becomes an issue, could - // redo as bloom filter. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // Check the memcache - List results = this.memcache.get(key, versions, deletes, now); - // If we got sufficient versions from memcache, return. - if (results.size() == versions) { - return results.toArray(new Cell[results.size()]); - } - MapFile.Reader[] maparray = getReaders(); - // Returned array is sorted with the most recent addition last. - for(int i = maparray.length - 1; - i >= 0 && !hasEnoughVersions(versions, results); i--) { - MapFile.Reader r = maparray[i]; - synchronized (r) { - // Do the priming read - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)r.getClosest(key, readval); - if (readkey == null) { - // map.getClosest returns null if the passed key is > than the - // last key in the map file. getClosest is a bit of a misnomer - // since it returns exact match or the next closest key AFTER not - // BEFORE. We use getClosest because we're usually passed a - // key that has a timestamp of maximum long to indicate we want - // most recent update. - continue; - } - if (!readkey.matchesRowCol(key)) { - continue; - } - if (get(readkey, readval.get(), versions, results, deletes, now)) { - break; - } - for (readval = new ImmutableBytesWritable(); - r.next(readkey, readval) && readkey.matchesRowCol(key); - readval = new ImmutableBytesWritable()) { - if (get(readkey, readval.get(), versions, results, deletes, now)) { - break; - } - } - } - } - return results.size() == 0 ? - null : results.toArray(new Cell[results.size()]); - } finally { - this.lock.readLock().unlock(); - } - } - - /* - * Look at one key/value. - * @param key - * @param value - * @param versions - * @param results - * @param deletes - * @param now - * @return True if we have enough versions. - */ - private boolean get(final HStoreKey key, final byte [] value, - final int versions, final List results, - final Set deletes, final long now) { - if (!HLogEdit.isDeleted(value)) { - if (notExpiredAndNotInDeletes(this.ttl, key, now, deletes)) { - results.add(new Cell(value, key.getTimestamp())); - } - // Perhaps only one version is wanted. 
I could let this - // test happen later in the for loop test but it would cost - // the allocation of an ImmutableBytesWritable. - if (hasEnoughVersions(versions, results)) { - return true; - } - } else { - // Is this copy necessary? - deletes.add(new HStoreKey(key)); - } - return false; - } - - /* - * Small method to check if we are over the max number of versions - * or we acheived this family max versions. - * The later happens when we have the situation described in HBASE-621. - * @param versions - * @param c - * @return - */ - private boolean hasEnoughVersions(final int versions, final List c) { - return c.size() >= versions; - } - - /** - * Get versions of keys matching the origin key's - * row/column/timestamp and those of an older vintage. - * @param origin Where to start searching. - * @param versions How many versions to return. Pass - * {@link HConstants#ALL_VERSIONS} to retrieve all. - * @param now - * @param columnPattern regex pattern for column matching. if columnPattern - * is not null, we use column pattern to match columns. And the columnPattern - * only works when origin's column is null or its length is zero. - * @return Matching keys. - * @throws IOException - */ - public List getKeys(final HStoreKey origin, final int versions, - final long now, final Pattern columnPattern) - throws IOException { - // This code below is very close to the body of the get method. Any - // changes in the flow below should also probably be done in get. TODO: - // Refactor so same code used. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // Check the memcache - List keys = - this.memcache.getKeys(origin, versions, deletes, now, columnPattern); - // If we got sufficient versions from memcache, return. - if (keys.size() >= versions) { - return keys; - } - MapFile.Reader[] maparray = getReaders(); - // Returned array is sorted with the most recent addition last. - for (int i = maparray.length - 1; - i >= 0 && keys.size() < versions; i--) { - MapFile.Reader map = maparray[i]; - synchronized(map) { - map.reset(); - // Do the priming read - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)map.getClosest(origin, readval); - if (readkey == null) { - // map.getClosest returns null if the passed key is > than the - // last key in the map file. getClosest is a bit of a misnomer - // since it returns exact match or the next closest key AFTER not - // BEFORE. - continue; - } - do { - // if the row matches, we might want this one. - if (rowMatches(origin, readkey)) { - // if the column pattern is not null, we use it for column matching. - // we will skip the keys whose column doesn't match the pattern. - if (columnPattern != null) { - if (!(columnPattern.matcher(Bytes.toString(readkey.getColumn())).matches())) { - continue; - } - } - // if the cell address matches, then we definitely want this key. - if (cellMatches(origin, readkey)) { - // Store key if isn't deleted or superceded by memcache - if (!HLogEdit.isDeleted(readval.get())) { - if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { - keys.add(new HStoreKey(readkey)); - } - if (keys.size() >= versions) { - break; - } - } else { - deletes.add(new HStoreKey(readkey)); - } - } else { - // the cell doesn't match, but there might be more with different - // timestamps, so move to the next key - continue; - } - } else { - // the row doesn't match, so we've gone too far. 
- break; - } - } while (map.next(readkey, readval)); // advance to the next key - } - } - return keys; - } finally { - this.lock.readLock().unlock(); - } - } - - /** - * Find the key that matches row exactly, or the one that immediately - * preceeds it. WARNING: Only use this method on a table where writes occur - * with stricly increasing timestamps. This method assumes this pattern of - * writes in order to make it reasonably performant. - * @param row - * @return Found row - * @throws IOException - */ - @SuppressWarnings("unchecked") - byte [] getRowKeyAtOrBefore(final byte [] row) - throws IOException{ - // Map of HStoreKeys that are candidates for holding the row key that - // most closely matches what we're looking for. We'll have to update it as - // deletes are found all over the place as we go along before finally - // reading the best key out of it at the end. - SortedMap candidateKeys = new TreeMap( - new HStoreKey.HStoreKeyWritableComparator(info)); - - // Keep a list of deleted cell keys. We need this because as we go through - // the store files, the cell with the delete marker may be in one file and - // the old non-delete cell value in a later store file. If we don't keep - // around the fact that the cell was deleted in a newer record, we end up - // returning the old value if user is asking for more than one version. - // This List of deletes should not be large since we are only keeping rows - // and columns that match those set on the scanner and which have delete - // values. If memory usage becomes an issue, could redo as bloom filter. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // First go to the memcache. Pick up deletes and candidates. - this.memcache.getRowKeyAtOrBefore(row, candidateKeys, deletes); - - // Process each store file. Run through from newest to oldest. - // This code below is very close to the body of the getKeys method. - MapFile.Reader[] maparray = getReaders(); - for (int i = maparray.length - 1; i >= 0; i--) { - // Update the candidate keys from the current map file - rowAtOrBeforeFromMapFile(maparray[i], row, candidateKeys, deletes); - } - // Return the best key from candidateKeys - byte [] result = - candidateKeys.isEmpty()? null: candidateKeys.lastKey().getRow(); - return result; - } finally { - this.lock.readLock().unlock(); - } - } - - /* - * Check an individual MapFile for the row at or before a given key - * and timestamp - * @param map - * @param row - * @param candidateKeys - * @throws IOException - */ - private void rowAtOrBeforeFromMapFile(final MapFile.Reader map, - final byte [] row, final SortedMap candidateKeys, - final Set deletes) - throws IOException { - HStoreKey startKey = new HStoreKey(); - ImmutableBytesWritable startValue = new ImmutableBytesWritable(); - synchronized(map) { - // Don't bother with the rest of this if the file is empty - map.reset(); - if (!map.next(startKey, startValue)) { - return; - } - startKey.setHRegionInfo(this.info); - // If start row for this file is beyond passed in row, return; nothing - // in here is of use to us. 
- if (HStoreKey.compareTwoRowKeys(this.info, startKey.getRow(), row) > 0) { - return; - } - long now = System.currentTimeMillis(); - // if there aren't any candidate keys yet, we'll do some things different - if (candidateKeys.isEmpty()) { - rowAtOrBeforeCandidate(startKey, map, row, candidateKeys, deletes, now); - } else { - rowAtOrBeforeWithCandidates(startKey, map, row, candidateKeys, deletes, - now); - } - } - } - - /* Find a candidate for row that is at or before passed row in passed - * mapfile. - * @param startKey First key in the mapfile. - * @param map - * @param row - * @param candidateKeys - * @param now - * @throws IOException - */ - private void rowAtOrBeforeCandidate(final HStoreKey startKey, - final MapFile.Reader map, final byte[] row, - final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - // if the row we're looking for is past the end of this mapfile, set the - // search key to be the last key. If its a deleted key, then we'll back - // up to the row before and return that. - HStoreKey finalKey = getFinalKey(map); - HStoreKey searchKey = null; - if (HStoreKey.compareTwoRowKeys(info,finalKey.getRow(), row) < 0) { - searchKey = finalKey; - } else { - searchKey = new HStoreKey(row, this.info); - if (searchKey.compareTo(startKey) < 0) { - searchKey = startKey; - } - } - rowAtOrBeforeCandidate(map, searchKey, candidateKeys, deletes, now); - } - - /* - * @param ttlSetting - * @param hsk - * @param now - * @param deletes - * @return True if key has not expired and is not in passed set of deletes. - */ - static boolean notExpiredAndNotInDeletes(final long ttl, - final HStoreKey hsk, final long now, final Set deletes) { - return !isExpired(hsk, ttl, now) && - (deletes == null || !deletes.contains(hsk)); - } - - static boolean isExpired(final HStoreKey hsk, final long ttl, - final long now) { - return ttl != HConstants.FOREVER && now > hsk.getTimestamp() + ttl; - } - - /* Find a candidate for row that is at or before passed key, sk, in mapfile. - * @param map - * @param sk Key to go search the mapfile with. - * @param candidateKeys - * @param now - * @throws IOException - * @see {@link #rowAtOrBeforeCandidate(HStoreKey, org.apache.hadoop.io.MapFile.Reader, byte[], SortedMap, long)} - */ - private void rowAtOrBeforeCandidate(final MapFile.Reader map, - final HStoreKey sk, final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - HStoreKey searchKey = sk; - if (searchKey.getHRegionInfo() == null) { - searchKey.setHRegionInfo(this.info); - } - HStoreKey readkey = null; - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey knownNoGoodKey = null; - for (boolean foundCandidate = false; !foundCandidate;) { - // Seek to the exact row, or the one that would be immediately before it - readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // If null, we are at the start or end of the file. - break; - } - HStoreKey deletedOrExpiredRow = null; - do { - // Set this region into the readkey. - readkey.setHRegionInfo(this.info); - // If we have an exact match on row, and it's not a delete, save this - // as a candidate key - if (HStoreKey.equalsTwoRowKeys(this.info, readkey.getRow(), - searchKey.getRow())) { - if (!HLogEdit.isDeleted(readval.get())) { - if (handleNonDelete(readkey, now, deletes, candidateKeys)) { - foundCandidate = true; - // NOTE! Continue. 
- continue; - } - } - HStoreKey copy = addCopyToDeletes(readkey, deletes); - if (deletedOrExpiredRow == null) { - deletedOrExpiredRow = copy; - } - } else if (HStoreKey.compareTwoRowKeys(this.info, readkey.getRow(), - searchKey.getRow()) > 0) { - // if the row key we just read is beyond the key we're searching for, - // then we're done. - break; - } else { - // So, the row key doesn't match, but we haven't gone past the row - // we're seeking yet, so this row is a candidate for closest - // (assuming that it isn't a delete). - if (!HLogEdit.isDeleted(readval.get())) { - if (handleNonDelete(readkey, now, deletes, candidateKeys)) { - foundCandidate = true; - // NOTE: Continue - continue; - } - } - HStoreKey copy = addCopyToDeletes(readkey, deletes); - if (deletedOrExpiredRow == null) { - deletedOrExpiredRow = copy; - } - } - } while(map.next(readkey, readval) && (knownNoGoodKey == null || - readkey.compareTo(knownNoGoodKey) < 0)); - - // If we get here and have no candidates but we did find a deleted or - // expired candidate, we need to look at the key before that - if (!foundCandidate && deletedOrExpiredRow != null) { - knownNoGoodKey = deletedOrExpiredRow; - searchKey = new HStoreKey.BeforeThisStoreKey(deletedOrExpiredRow); - } else { - // No candidates and no deleted or expired candidates. Give up. - break; - } - } - - // Arriving here just means that we consumed the whole rest of the map - // without going "past" the key we're searching for. we can just fall - // through here. - } - - /* - * @param key Key to copy and add to deletes - * @param deletes - * @return Instance of the copy added to deletes - */ - private HStoreKey addCopyToDeletes(final HStoreKey key, - final Set deletes) { - HStoreKey copy = new HStoreKey(key); - deletes.add(copy); - return copy; - } - - private void rowAtOrBeforeWithCandidates(final HStoreKey startKey, - final MapFile.Reader map, final byte[] row, - final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - HStoreKey readkey = null; - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - - // if there are already candidate keys, we need to start our search - // at the earliest possible key so that we can discover any possible - // deletes for keys between the start and the search key. Back up to start - // of the row in case there are deletes for this candidate in this mapfile - // BUT do not backup before the first key in the mapfile else getClosest - // will return null - HStoreKey searchKey = new HStoreKey(candidateKeys.firstKey().getRow(), this.info); - if (searchKey.compareTo(startKey) < 0) { - searchKey = startKey; - } - - // Seek to the exact row, or the one that would be immediately before it - readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // If null, we are at the start or end of the file. - // Didn't find anything that would match, so return - return; - } - - do { - // if we have an exact match on row, and it's not a delete, save this - // as a candidate key - if (Bytes.equals(readkey.getRow(), row)) { - handleKey(readkey, readval.get(), now, deletes, candidateKeys); - } else if (HStoreKey.compareTwoRowKeys(info, - readkey.getRow(), row) > 0 ) { - // if the row key we just read is beyond the key we're searching for, - // then we're done. - break; - } else { - // So, the row key doesn't match, but we haven't gone past the row - // we're seeking yet, so this row is a candidate for closest - // (assuming that it isn't a delete). 
- handleKey(readkey, readval.get(), now, deletes, candidateKeys); - } - } while(map.next(readkey, readval)); - } - - /* - * @param readkey - * @param now - * @param deletes - * @param candidateKeys - */ - private void handleKey(final HStoreKey readkey, final byte [] value, - final long now, final Set deletes, - final SortedMap candidateKeys) { - if (!HLogEdit.isDeleted(value)) { - handleNonDelete(readkey, now, deletes, candidateKeys); - } else { - // Pass copy because readkey will change next time next is called. - handleDeleted(new HStoreKey(readkey), candidateKeys, deletes); - } - } - - /* - * @param readkey - * @param now - * @param deletes - * @param candidateKeys - * @return True if we added a candidate. - */ - private boolean handleNonDelete(final HStoreKey readkey, final long now, - final Set deletes, final Map candidateKeys) { - if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { - candidateKeys.put(stripTimestamp(readkey), - Long.valueOf(readkey.getTimestamp())); - return true; - } - return false; - } - - /* Handle keys whose values hold deletes. - * Add to the set of deletes and then if the candidate keys contain any that - * might match by timestamp, then check for a match and remove it if it's too - * young to survive the delete - * @param k Be careful; if key was gotten from a Mapfile, pass in a copy. - * Values gotten by 'nexting' out of Mapfiles will change in each invocation. - * @param candidateKeys - * @param deletes - */ - static void handleDeleted(final HStoreKey k, - final SortedMap candidateKeys, - final Set deletes) { - deletes.add(k); - HStoreKey strippedKey = stripTimestamp(k); - if (candidateKeys.containsKey(strippedKey)) { - long bestCandidateTs = - candidateKeys.get(strippedKey).longValue(); - if (bestCandidateTs <= k.getTimestamp()) { - candidateKeys.remove(strippedKey); - } - } - } - - /* - * @param mf MapFile to dig in. - * @return Final key from passed mf - * @throws IOException - */ - private HStoreKey getFinalKey(final MapFile.Reader mf) throws IOException { - HStoreKey finalKey = new HStoreKey(); - mf.finalKey(finalKey); - finalKey.setHRegionInfo(this.info); - return finalKey; - } - - static HStoreKey stripTimestamp(HStoreKey key) { - return new HStoreKey(key.getRow(), key.getColumn(), key.getHRegionInfo()); - } - - /* - * Test that the target matches the origin cell address. If the - * origin has an empty column, then it's assumed to mean any column - * matches and only match on row and timestamp. Otherwise, it compares the - * keys with HStoreKey.matchesRowCol(). - * @param origin The key we're testing against - * @param target The key we're testing - */ - private boolean cellMatches(HStoreKey origin, HStoreKey target){ - // if the origin's column is empty, then we're matching any column - if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { - // if the row matches, then... - if (HStoreKey.equalsTwoRowKeys(info, target.getRow(), origin.getRow())) { - // check the timestamp - return target.getTimestamp() <= origin.getTimestamp(); - } - return false; - } - // otherwise, we want to match on row and column - return target.matchesRowCol(origin); - } - - /* - * Test that the target matches the origin. If the origin - * has an empty column, then it just tests row equivalence. Otherwise, it uses - * HStoreKey.matchesRowCol(). 
- * @param origin Key we're testing against - * @param target Key we're testing - */ - private boolean rowMatches(HStoreKey origin, HStoreKey target){ - // if the origin's column is empty, then we're matching any column - if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { - // if the row matches, then... - return HStoreKey.equalsTwoRowKeys(info, target.getRow(), origin.getRow()); - } - // otherwise, we want to match on row and column - return target.matchesRowCol(origin); - } - - /** - * Determines if HStore can be split - * @param force Whether to force a split or not. - * @return a StoreSize if store can be split, null otherwise - */ - StoreSize checkSplit(final boolean force) { - if (this.storefiles.size() <= 0) { - return null; - } - if (!force && (storeSize < this.desiredMaxFileSize)) { - return null; - } - this.lock.readLock().lock(); - try { - // Not splitable if we find a reference store file present in the store. - boolean splitable = true; - long maxSize = 0L; - Long mapIndex = Long.valueOf(0L); - // Iterate through all the MapFiles - synchronized (storefiles) { - for (Map.Entry e: storefiles.entrySet()) { - HStoreFile curHSF = e.getValue(); - if (splitable) { - splitable = !curHSF.isReference(); - if (!splitable) { - // RETURN IN MIDDLE OF FUNCTION!!! If not splitable, just return. - if (LOG.isDebugEnabled()) { - LOG.debug(curHSF + " is not splittable"); - } - return null; - } - } - long size = curHSF.length(); - if (size > maxSize) { - // This is the largest one so far - maxSize = size; - mapIndex = e.getKey(); - } - } - } - - // Cast to HbaseReader. - HBaseMapFile.HBaseReader r = - (HBaseMapFile.HBaseReader)this.readers.get(mapIndex); - // Get first, last, and mid keys. - r.reset(); - HStoreKey firstKey = new HStoreKey(); - HStoreKey lastKey = new HStoreKey(); - r.next(firstKey, new ImmutableBytesWritable()); - r.finalKey(lastKey); - HStoreKey mk = (HStoreKey)r.midKey(); - if (mk != null) { - // if the midkey is the same as the first and last keys, then we cannot - // (ever) split this region. - if (HStoreKey.equalsTwoRowKeys(info, mk.getRow(), firstKey.getRow()) && - HStoreKey.equalsTwoRowKeys(info, mk.getRow(), lastKey.getRow())) { - if (LOG.isDebugEnabled()) { - LOG.debug("cannot split because midkey is the same as first or last row"); - } - return null; - } - return new StoreSize(maxSize, mk.getRow()); - } - } catch(IOException e) { - LOG.warn("Failed getting store size for " + this.storeNameStr, e); - } finally { - this.lock.readLock().unlock(); - } - return null; - } - - /** @return aggregate size of HStore */ - public long getSize() { - return storeSize; - } - - ////////////////////////////////////////////////////////////////////////////// - // File administration - ////////////////////////////////////////////////////////////////////////////// - - /** - * Return a scanner for both the memcache and the HStore files - */ - protected InternalScanner getScanner(long timestamp, byte [][] targetCols, - byte [] firstRow, RowFilterInterface filter) - throws IOException { - lock.readLock().lock(); - try { - return new HStoreScanner(this, targetCols, firstRow, timestamp, filter); - } finally { - lock.readLock().unlock(); - } - } - - @Override - public String toString() { - return this.storeNameStr; - } - - /** - * @param p Path to check. - * @return True if the path has format of a HStoreFile reference. 
- */ - public static boolean isReference(final Path p) { - return isReference(p, REF_NAME_PARSER.matcher(p.getName())); - } - - private static boolean isReference(final Path p, final Matcher m) { - if (m == null || !m.matches()) { - LOG.warn("Failed match of store file name " + p.toString()); - throw new RuntimeException("Failed match of store file name " + - p.toString()); - } - return m.groupCount() > 1 && m.group(2) != null; - } - - /** - * @return Current list of store files. - */ - SortedMap getStorefiles() { - synchronized (this.storefiles) { - SortedMap copy = - new TreeMap(this.storefiles); - return copy; - } - } - - /** - * @return Count of store files - */ - int getStorefilesCount() { - return this.storefiles.size(); - } - - /** - * @return The size of the store file indexes, in bytes. - * @throws IOException if there was a problem getting file sizes from the - * filesystem - */ - long getStorefilesIndexSize() throws IOException { - long size = 0; - for (HStoreFile s: storefiles.values()) - size += s.indexLength(); - return size; - } - - /* - * Datastructure that holds size and key. - */ - class StoreSize { - private final long size; - private final byte[] key; - StoreSize(long size, byte[] key) { - this.size = size; - this.key = new byte[key.length]; - System.arraycopy(key, 0, this.key, 0, key.length); - } - /* @return the size */ - long getSize() { - return size; - } - /* @return the key */ - byte[] getKey() { - return key; - } - } - - HRegionInfo getHRegionInfo() { - return this.info; - } -} diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java b/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java deleted file mode 100644 index 2162276d05a..00000000000 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java +++ /dev/null @@ -1,558 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.DataInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Random; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.io.BloomFilterMapFile; -import org.apache.hadoop.hbase.io.HalfMapFileReader; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; - -/** - * A HStore data file. HStores usually have one or more of these files. They - * are produced by flushing the memcache to disk. - * - *

Each HStore maintains a bunch of different data files. The filename is a - * mix of the parent dir, the region name, the column name, and a file - * identifier. The name may also be a reference to a store file located - * elsewhere. This class handles all that path-building stuff for you. - * - *

An HStoreFile usually tracks 4 things: its parent dir, the region - * identifier, the column family, and the file identifier. If you know those - * four things, you know how to obtain the right HStoreFile. HStoreFiles may - * also reference store files in another region, serving either - * the top half or the bottom half of the remote file. Such references - * are created when a region is split; the daughter regions serve the parent's - * data through them until a compaction rewrites the files. - * - *
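The half that a reference serves is decided against the referenced file's midkey (described further below). A minimal sketch of that decision, assuming the top half includes the midkey; the enum and method names are made up, and the real handling lives in the HalfMapFileReader returned by getReader() below.

    import java.util.Comparator;

    class HalfFileSketch {
      enum Range { TOP, BOTTOM }  // which half of the referenced file is served

      // Bottom half: keys that sort before the midkey.
      // Top half (in this sketch): the midkey and everything after it.
      static boolean inServedHalf(byte[] key, byte[] midkey, Range half,
          Comparator<byte[]> comparator) {
        int cmp = comparator.compare(key, midkey);
        return half == Range.BOTTOM ? cmp < 0 : cmp >= 0;
      }
    }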

Plain HStoreFiles are named for a randomly generated id as in: - * 1278437856009925445. A file by this name is made in both the - * mapfiles and info subdirectories of a - * HStore columnfamily directory. E.g., if the column family is 'anchor:', then - * under the region directory there is a subdirectory named 'anchor' within - * which are 'mapfiles' and 'info' subdirectories. In each will be found a - * file named something like 1278437856009925445, one to hold the - * data in 'mapfiles' and one under 'info' that holds the sequence id for this - * store file. - * - *
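A plain-JDK sketch of the layout just described; it mirrors what getMapDir(), getInfoDir() and createHStoreFilename() at the bottom of this class compute, with simplified parameter types.

    class StoreFileLayoutSketch {
      // e.g. <basedir>/<encodedRegionName>/anchor/mapfiles/1278437856009925445
      static String mapFilePath(String basedir, int encodedRegionName,
          String family, long fileId) {
        return basedir + "/" + encodedRegionName + "/" + family + "/mapfiles/" + fileId;
      }

      // e.g. <basedir>/<encodedRegionName>/anchor/info/1278437856009925445
      static String infoFilePath(String basedir, int encodedRegionName,
          String family, long fileId) {
        return basedir + "/" + encodedRegionName + "/" + family + "/info/" + fileId;
      }
    }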

References to store files located over in some other region look like - * this: - * 1278437856009925445.hbaserepository,qAReLZD-OyQORZWq_vqR1k==,959247014679548184: - * i.e. an id followed by the name of the referenced region. The data - * ('mapfiles') of HStoreFile references are empty. The accompanying - * info file contains the - * midkey, the id of the remote store we're referencing and whether we're - * to serve the top or bottom region of the remote store file. Note that a region - * is not splittable if it has instances of store file references (references - * are cleaned up by compactions). - * - *
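A small sketch of telling such a reference name apart from a plain one. The exact pattern is an assumption modeled on the REF_NAME_PARSER/isReference() check in HStore above; the point is that a trailing '.<referenced region>' suffix marks a reference.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class ReferenceNameSketch {
      // group(1): file id; optional group(2): name of the referenced region.
      private static final Pattern NAME = Pattern.compile("^(\\d+)(?:\\.(.+))?$");

      static boolean isReference(String storeFileName) {
        Matcher m = NAME.matcher(storeFileName);
        return m.matches() && m.group(2) != null;
      }
    }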

When merging or splitting HRegions, we might want to modify one of the - * params for an HStoreFile (effectively moving it elsewhere). - */ -public class HStoreFile implements HConstants { - static final Log LOG = LogFactory.getLog(HStoreFile.class.getName()); - static final byte INFO_SEQ_NUM = 0; - static final byte MAJOR_COMPACTION = INFO_SEQ_NUM + 1; - static final String HSTORE_DATFILE_DIR = "mapfiles"; - static final String HSTORE_INFO_DIR = "info"; - static final String HSTORE_FILTER_DIR = "filter"; - - private final static Random rand = new Random(); - - private final Path basedir; - private final int encodedRegionName; - private final byte [] colFamily; - private final long fileId; - private final HBaseConfiguration conf; - private final FileSystem fs; - private final Reference reference; - private final HRegionInfo hri; - /* If true, this file was product of a major compaction. - */ - private boolean majorCompaction = false; - private long indexLength; - - /** - * Constructor that fully initializes the object - * @param conf Configuration object - * @param basedir qualified path that is parent of region directory - * @param colFamily name of the column family - * @param fileId file identifier - * @param ref Reference to another HStoreFile. - * @param hri The region info for this file (HACK HBASE-868). TODO: Fix. - * @throws IOException - */ - HStoreFile(HBaseConfiguration conf, FileSystem fs, Path basedir, - final HRegionInfo hri, byte [] colFamily, long fileId, - final Reference ref) - throws IOException { - this(conf, fs, basedir, hri, colFamily, fileId, ref, false); - } - - /** - * Constructor that fully initializes the object - * @param conf Configuration object - * @param basedir qualified path that is parent of region directory - * @param colFamily name of the column family - * @param fileId file identifier - * @param ref Reference to another HStoreFile. - * @param hri The region info for this file (HACK HBASE-868). TODO: Fix. - * @param mc Try if this file was result of a major compression. - * @throws IOException - */ - HStoreFile(HBaseConfiguration conf, FileSystem fs, Path basedir, - final HRegionInfo hri, byte [] colFamily, long fileId, - final Reference ref, final boolean mc) - throws IOException { - this.conf = conf; - this.fs = fs; - this.basedir = basedir; - this.encodedRegionName = hri.getEncodedName(); - this.colFamily = colFamily; - this.hri = hri; - - long id = fileId; - if (id == -1) { - Path mapdir = HStoreFile.getMapDir(basedir, encodedRegionName, colFamily); - Path testpath = null; - do { - id = Math.abs(rand.nextLong()); - testpath = new Path(mapdir, createHStoreFilename(id, -1)); - } while(fs.exists(testpath)); - } - this.fileId = id; - - // If a reference, construction does not write the pointer files. Thats - // done by invocations of writeReferenceFiles(hsf, fs). Happens at split. 
- this.reference = ref; - this.majorCompaction = mc; - } - - /** @return the region name */ - boolean isReference() { - return reference != null; - } - - Reference getReference() { - return reference; - } - - int getEncodedRegionName() { - return this.encodedRegionName; - } - - /** @return the column family */ - byte [] getColFamily() { - return colFamily; - } - - /** @return the file identifier */ - long getFileId() { - return fileId; - } - - // Build full filenames from those components - - /** @return path for MapFile */ - Path getMapFilePath() { - if (isReference()) { - return getMapFilePath(encodedRegionName, fileId, - reference.getEncodedRegionName()); - } - return getMapFilePath(this.encodedRegionName, fileId); - } - - private Path getMapFilePath(final Reference r) { - if (r == null) { - return getMapFilePath(); - } - return getMapFilePath(r.getEncodedRegionName(), r.getFileId()); - } - - private Path getMapFilePath(final int encodedName, final long fid) { - return getMapFilePath(encodedName, fid, HRegionInfo.NO_HASH); - } - - private Path getMapFilePath(final int encodedName, final long fid, - final int ern) { - return new Path(HStoreFile.getMapDir(basedir, encodedName, colFamily), - createHStoreFilename(fid, ern)); - } - - /** @return path for info file */ - Path getInfoFilePath() { - if (isReference()) { - return getInfoFilePath(encodedRegionName, fileId, - reference.getEncodedRegionName()); - - } - return getInfoFilePath(encodedRegionName, fileId); - } - - private Path getInfoFilePath(final int encodedName, final long fid) { - return getInfoFilePath(encodedName, fid, HRegionInfo.NO_HASH); - } - - private Path getInfoFilePath(final int encodedName, final long fid, - final int ern) { - return new Path(HStoreFile.getInfoDir(basedir, encodedName, colFamily), - createHStoreFilename(fid, ern)); - } - - // File handling - - /* - * Split by making two new store files that reference top and bottom regions - * of original store file. - * @param midKey - * @param dstA - * @param dstB - * @param fs - * @param c - * @throws IOException - * - * @param midKey the key which will be the starting key of the second region - * @param dstA the file which will contain keys from the start of the source - * @param dstB the file which will contain keys from midKey to end of source - * @param fs file system - * @param c configuration - * @throws IOException - */ - void splitStoreFile(final HStoreFile dstA, final HStoreFile dstB, - final FileSystem fs) - throws IOException { - dstA.writeReferenceFiles(fs); - dstB.writeReferenceFiles(fs); - } - - void writeReferenceFiles(final FileSystem fs) - throws IOException { - createOrFail(fs, getMapFilePath()); - writeSplitInfo(fs); - } - - /* - * If reference, create and write the remote store file id, the midkey and - * whether we're going against the top file region of the referent out to - * the info file. - * @param p Path to info file. 
- * @param hsf - * @param fs - * @throws IOException - */ - private void writeSplitInfo(final FileSystem fs) throws IOException { - Path p = getInfoFilePath(); - if (fs.exists(p)) { - throw new IOException("File already exists " + p.toString()); - } - FSDataOutputStream out = fs.create(p); - try { - reference.write(out); - } finally { - out.close(); - } - } - - /** - * @see #writeSplitInfo(FileSystem fs) - */ - static Reference readSplitInfo(final Path p, final FileSystem fs) - throws IOException { - FSDataInputStream in = fs.open(p); - try { - Reference r = new Reference(); - r.readFields(in); - return r; - } finally { - in.close(); - } - } - - private void createOrFail(final FileSystem fs, final Path p) - throws IOException { - if (fs.exists(p)) { - throw new IOException("File already exists " + p.toString()); - } - if (!fs.createNewFile(p)) { - throw new IOException("Failed create of " + p); - } - } - - /** - * Reads in an info file - * - * @param filesystem file system - * @return The sequence id contained in the info file - * @throws IOException - */ - long loadInfo(final FileSystem filesystem) throws IOException { - Path p = null; - if (isReference()) { - p = getInfoFilePath(reference.getEncodedRegionName(), - this.reference.getFileId()); - } else { - p = getInfoFilePath(); - } - long length = filesystem.getFileStatus(p).getLen(); - boolean hasMoreThanSeqNum = length > (Byte.SIZE + Bytes.SIZEOF_LONG); - DataInputStream in = new DataInputStream(filesystem.open(p)); - try { - byte flag = in.readByte(); - if (flag == INFO_SEQ_NUM) { - if (hasMoreThanSeqNum) { - flag = in.readByte(); - if (flag == MAJOR_COMPACTION) { - this.majorCompaction = in.readBoolean(); - } - } - return in.readLong(); - } - throw new IOException("Cannot process log file: " + p); - } finally { - in.close(); - } - } - - /** - * Writes the file-identifier to disk - * - * @param filesystem file system - * @param infonum file id - * @throws IOException - */ - void writeInfo(final FileSystem filesystem, final long infonum) - throws IOException { - writeInfo(filesystem, infonum, false); - } - - /** - * Writes the file-identifier to disk - * - * @param filesystem file system - * @param infonum file id - * @param mc True if this file is product of a major compaction - * @throws IOException - */ - void writeInfo(final FileSystem filesystem, final long infonum, - final boolean mc) - throws IOException { - Path p = getInfoFilePath(); - FSDataOutputStream out = filesystem.create(p); - try { - out.writeByte(INFO_SEQ_NUM); - out.writeLong(infonum); - if (mc) { - // Set whether major compaction flag on this file. - this.majorCompaction = mc; - out.writeByte(MAJOR_COMPACTION); - out.writeBoolean(mc); - } - } finally { - out.close(); - } - } - - /** - * Delete store map files. - * @throws IOException - */ - public void delete() throws IOException { - fs.delete(getMapFilePath(), true); - fs.delete(getInfoFilePath(), true); - } - - /** - * Renames the mapfiles and info directories under the passed - * hsf directory. - * @param fs - * @param hsf - * @return True if succeeded. 
- * @throws IOException - */ - public boolean rename(final FileSystem fs, final HStoreFile hsf) - throws IOException { - Path src = getMapFilePath(); - if (!fs.exists(src)) { - throw new FileNotFoundException(src.toString()); - } - boolean success = fs.rename(src, hsf.getMapFilePath()); - if (!success) { - LOG.warn("Failed rename of " + src + " to " + hsf.getMapFilePath()); - } else { - src = getInfoFilePath(); - if (!fs.exists(src)) { - throw new FileNotFoundException(src.toString()); - } - success = fs.rename(src, hsf.getInfoFilePath()); - if (!success) { - LOG.warn("Failed rename of " + src + " to " + hsf.getInfoFilePath()); - } - } - return success; - } - - /** - * Get reader for the store file map file. - * Client is responsible for closing file when done. - * @param fs - * @param bloomFilter If true, a bloom filter exists - * @param blockCacheEnabled If true, MapFile blocks should be cached. - * @return BloomFilterMapFile.Reader - * @throws IOException - */ - public synchronized BloomFilterMapFile.Reader getReader(final FileSystem fs, - final boolean bloomFilter, final boolean blockCacheEnabled) - throws IOException { - if (isReference()) { - return new HalfMapFileReader(fs, - getMapFilePath(reference).toString(), conf, - reference.getFileRegion(), reference.getMidkey(), bloomFilter, - blockCacheEnabled, this.hri); - } - return new BloomFilterMapFile.Reader(fs, getMapFilePath().toString(), - conf, bloomFilter, blockCacheEnabled, this.hri); - } - - /** - * Get a store file writer. - * Client is responsible for closing file when done. - * @param fs - * @param compression Pass SequenceFile.CompressionType.NONE - * for none. - * @param bloomFilter If true, create a bloom filter - * @param nrows number of rows expected. Required if bloomFilter is true. - * @return MapFile.Writer - * @throws IOException - */ - public MapFile.Writer getWriter(final FileSystem fs, - final SequenceFile.CompressionType compression, - final boolean bloomFilter, int nrows) - throws IOException { - if (isReference()) { - throw new IOException("Illegal Access: Cannot get a writer on a" + - "HStoreFile reference"); - } - return new BloomFilterMapFile.Writer(conf, fs, - getMapFilePath().toString(), compression, bloomFilter, nrows, this.hri); - } - - /** - * @return Length of the store map file. If a reference, size is - * approximation. - * @throws IOException - */ - public long length() throws IOException { - Path p = new Path(getMapFilePath(reference), MapFile.DATA_FILE_NAME); - long l = p.getFileSystem(conf).getFileStatus(p).getLen(); - return (isReference())? l / 2: l; - } - - /** - * @return Length of the store map file index. - * @throws IOException - */ - public synchronized long indexLength() throws IOException { - if (indexLength == 0) { - Path p = new Path(getMapFilePath(reference), MapFile.INDEX_FILE_NAME); - indexLength = p.getFileSystem(conf).getFileStatus(p).getLen(); - } - return indexLength; - } - - @Override - public String toString() { - return encodedRegionName + "/" + Bytes.toString(colFamily) + "/" + fileId + - (isReference()? "-" + reference.toString(): ""); - } - - /** - * @return True if this file was made by a major compaction. - */ - public boolean isMajorCompaction() { - return this.majorCompaction; - } - - private static String createHStoreFilename(final long fid, - final int encodedRegionName) { - return Long.toString(fid) + - ((encodedRegionName != HRegionInfo.NO_HASH)? - "." 
+ encodedRegionName : ""); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return path for map file directory - */ - public static Path getMapDir(Path dir, int encodedRegionName, - final byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_DATFILE_DIR); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return the info directory path - */ - public static Path getInfoDir(Path dir, int encodedRegionName, byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_INFO_DIR); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return the bloom filter directory path - */ - @Deprecated - public static Path getFilterDir(Path dir, int encodedRegionName, - final byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_FILTER_DIR); - } - - /* - * @param base Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @param subdir Subdirectory to create under column family/store directory. - * @return - */ - private static Path getFamilySubDir(final Path base, - final int encodedRegionName, final byte [] f, final String subdir) { - return new Path(base, new Path(Integer.toString(encodedRegionName), - new Path(Bytes.toString(f), subdir))); - } -} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/regionserver/Store.java b/src/java/org/apache/hadoop/hbase/regionserver/Store.java new file mode 100644 index 00000000000..075891dd210 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/regionserver/Store.java @@ -0,0 +1,1807 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.EOFException; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.CopyOnWriteArraySet; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.RemoteExceptionHandler; +import org.apache.hadoop.hbase.filter.RowFilterInterface; +import org.apache.hadoop.hbase.io.Cell; +import org.apache.hadoop.hbase.io.SequenceFile; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.StringUtils; + +/** + * A Store holds a column family in a Region. Its a memcache and a set of zero + * or more StoreFiles, which stretch backwards over time. + * + *

There's no reason to consider append-logging at this level; all logging + * and locking are handled at the HRegion level. Store just provides + * services to manage sets of StoreFiles. One of the most important of those + * services is compaction, in which store files are aggregated once their + * number passes a configurable threshold. + * + *
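The threshold in question is the store-file count. A minimal sketch of the check, reading the same configuration key the Store constructor below uses; the helper class is hypothetical, and whether the real trigger compares with > or >= is left to the compaction code.

    import org.apache.hadoop.conf.Configuration;

    class CompactionTriggerSketch {
      /** True once a store holds at least the configured number of files. */
      static boolean needsCompaction(Configuration conf, int storeFileCount) {
        int threshold = conf.getInt("hbase.hstore.compactionThreshold", 3);
        return storeFileCount >= threshold;   // illustrative comparison only
      }
    }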

The only log-related input Store has to deal with is + * the reconstructionLog. This is a segment of an HRegion's log that might + * NOT be present upon startup. If the param is NULL, there's nothing to do; + * if it is non-NULL, we need to process the log to reconstruct + * a TreeMap that might not have been written to disk before the process + * died. + * + *
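A stripped-down sketch of that replay rule, using simplified stand-in types (the real work, including skipping edits that belong to other regions or column families, is done in doReconstructionLog() further down): edits already covered by the store's maximum flushed sequence id are skipped, and the rest are collected into a sorted map that is then flushed as one 'virtual' flush.

    import java.util.Iterator;
    import java.util.TreeMap;

    class ReconstructionReplaySketch {
      static class Edit {
        final long seqId; final String key; final byte[] value;
        Edit(long seqId, String key, byte[] value) {
          this.seqId = seqId; this.key = key; this.value = value;
        }
      }

      static TreeMap<String, byte[]> replay(Iterator<Edit> logEdits,
          long maxFlushedSeqId) {
        TreeMap<String, byte[]> reconstructed = new TreeMap<String, byte[]>();
        while (logEdits.hasNext()) {
          Edit e = logEdits.next();
          if (e.seqId <= maxFlushedSeqId) {
            continue;                   // already present in a flushed store file
          }
          reconstructed.put(e.key, e.value);
        }
        return reconstructed;           // caller flushes this as a new store file
      }
    }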

It's assumed that after this constructor returns, the reconstructionLog + * file will be deleted (by whoever has instantiated the Store). + * + *
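A sketch of that contract from the caller's side (a hypothetical fragment, not self-contained; the real call sites are in HRegion, which shares the package and can reach the protected constructor):

    Store store = new Store(basedir, regionInfo, familyDescriptor, fs,
        reconstructionLog, conf, reporter);
    // The constructor has replayed any pending edits, so the caller may now
    // remove the reconstruction log.
    if (reconstructionLog != null && fs.exists(reconstructionLog)) {
      fs.delete(reconstructionLog, true);
    }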

Locking and transactions are handled at a higher level. This API should + * not be called directly but by an HRegion manager. + */ +public class Store implements HConstants { + static final Log LOG = LogFactory.getLog(Store.class); + protected final Memcache memcache; + // This stores directory in the filesystem. + private final Path homedir; + private final HRegionInfo regioninfo; + private final HColumnDescriptor family; + final FileSystem fs; + private final HBaseConfiguration conf; + // ttl in milliseconds. + protected long ttl; + private long majorCompactionTime; + private int maxFilesToCompact; + private final long desiredMaxFileSize; + private volatile long storeSize = 0L; + private final Integer flushLock = new Integer(0); + final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + final byte [] storeName; + private final String storeNameStr; + + /* + * Sorted Map of readers keyed by maximum edit sequence id (Most recent should + * be last in in list). ConcurrentSkipListMap is lazily consistent so no + * need to lock it down when iterating; iterator view is that of when the + * iterator was taken out. + */ + private final NavigableMap storefiles = + new ConcurrentSkipListMap(); + + // All access must be synchronized. + private final CopyOnWriteArraySet changedReaderObservers = + new CopyOnWriteArraySet(); + + // The most-recent log-seq-ID. The most-recent such ID means we can ignore + // all log messages up to and including that ID (because they're already + // reflected in the TreeMaps). + private volatile long maxSeqId = -1; + + private final Path compactionDir; + private final Integer compactLock = new Integer(0); + private final int compactionThreshold; + + /** + * Constructor + * @param basedir qualified path under which the region directory lives; + * generally the table subdirectory + * @param info HRegionInfo for this region + * @param family HColumnDescriptor for this column + * @param fs file system object + * @param reconstructionLog existing log file to apply if any + * @param conf configuration object + * @param reporter Call on a period so hosting server can report we're + * making progress to master -- otherwise master might think region deploy + * failed. Can be null. + * @throws IOException + */ + protected Store(Path basedir, HRegionInfo info, HColumnDescriptor family, + FileSystem fs, Path reconstructionLog, HBaseConfiguration conf, + final Progressable reporter) + throws IOException { + this.homedir = getStoreHomedir(basedir, info.getEncodedName(), + family.getName()); + this.regioninfo = info; + this.family = family; + this.fs = fs; + this.conf = conf; + // getTimeToLive returns ttl in seconds. Convert to milliseconds. + this.ttl = family.getTimeToLive(); + if (ttl != HConstants.FOREVER) { + this.ttl *= 1000; + } + this.memcache = new Memcache(this.ttl); + this.compactionDir = HRegion.getCompactionDir(basedir); + this.storeName = Bytes.toBytes(this.regioninfo.getEncodedName() + "/" + + Bytes.toString(this.family.getName())); + this.storeNameStr = Bytes.toString(this.storeName); + + // By default, we compact if an HStore has more than + // MIN_COMMITS_FOR_COMPACTION map files + this.compactionThreshold = + conf.getInt("hbase.hstore.compactionThreshold", 3); + + // By default we split region if a file > DEFAULT_MAX_FILE_SIZE. 
+ long maxFileSize = info.getTableDesc().getMaxFileSize(); + if (maxFileSize == HConstants.DEFAULT_MAX_FILE_SIZE) { + maxFileSize = conf.getLong("hbase.hregion.max.filesize", + HConstants.DEFAULT_MAX_FILE_SIZE); + } + this.desiredMaxFileSize = maxFileSize; + + this.majorCompactionTime = + conf.getLong(HConstants.MAJOR_COMPACTION_PERIOD, 86400000); + if (family.getValue(HConstants.MAJOR_COMPACTION_PERIOD) != null) { + String strCompactionTime = + family.getValue(HConstants.MAJOR_COMPACTION_PERIOD); + this.majorCompactionTime = (new Long(strCompactionTime)).longValue(); + } + + this.maxFilesToCompact = conf.getInt("hbase.hstore.compaction.max", 10); + + // loadStoreFiles calculates this.maxSeqId. as side-effect. + this.storefiles.putAll(loadStoreFiles()); + if (LOG.isDebugEnabled() && this.storefiles.size() > 0) { + LOG.debug("Loaded " + this.storefiles.size() + " file(s) in Store " + + Bytes.toString(this.storeName) + ", max sequence id " + this.maxSeqId); + } + + // Do reconstruction log. + runReconstructionLog(reconstructionLog, this.maxSeqId, reporter); + } + + HColumnDescriptor getFamily() { + return this.family; + } + + long getMaxSequenceId() { + return this.maxSeqId; + } + + /** + * @param tabledir + * @param encodedName Encoded region name. + * @param family + * @return Path to family/Store home directory. + */ + public static Path getStoreHomedir(final Path tabledir, + final int encodedName, final byte [] family) { + return new Path(tabledir, new Path(Integer.toString(encodedName), + new Path(Bytes.toString(family)))); + } + + /* + * Run reconstruction log + * @param reconstructionLog + * @param msid + * @param reporter + * @throws IOException + */ + private void runReconstructionLog(final Path reconstructionLog, + final long msid, final Progressable reporter) + throws IOException { + try { + doReconstructionLog(reconstructionLog, msid, reporter); + } catch (EOFException e) { + // Presume we got here because of lack of HADOOP-1700; for now keep going + // but this is probably not what we want long term. If we got here there + // has been data-loss + LOG.warn("Exception processing reconstruction log " + reconstructionLog + + " opening " + this.storeName + + " -- continuing. Probably lack-of-HADOOP-1700 causing DATA LOSS!", e); + } catch (IOException e) { + // Presume we got here because of some HDFS issue. Don't just keep going. + // Fail to open the HStore. Probably means we'll fail over and over + // again until human intervention but alternative has us skipping logs + // and losing edits: HBASE-642. + LOG.warn("Exception processing reconstruction log " + reconstructionLog + + " opening " + this.storeName, e); + throw e; + } + } + + /* + * Read the reconstructionLog to see whether we need to build a brand-new + * file out of non-flushed log entries. + * + * We can ignore any log message that has a sequence ID that's equal to or + * lower than maxSeqID. (Because we know such log messages are already + * reflected in the MapFiles.) + */ + @SuppressWarnings("unchecked") + private void doReconstructionLog(final Path reconstructionLog, + final long maxSeqID, final Progressable reporter) + throws UnsupportedEncodingException, IOException { + if (reconstructionLog == null || !this.fs.exists(reconstructionLog)) { + // Nothing to do. + return; + } + // Check its not empty. 
+ FileStatus [] stats = this.fs.listStatus(reconstructionLog); + if (stats == null || stats.length == 0) { + LOG.warn("Passed reconstruction log " + reconstructionLog + + " is zero-length"); + return; + } + // TODO: This could grow large and blow heap out. Need to get it into + // general memory usage accounting. + long maxSeqIdInLog = -1; + NavigableMap reconstructedCache = + new TreeMap(new HStoreKey.HStoreKeyWritableComparator()); + SequenceFile.Reader logReader = new SequenceFile.Reader(this.fs, + reconstructionLog, this.conf); + try { + HLogKey key = new HLogKey(); + HLogEdit val = new HLogEdit(); + long skippedEdits = 0; + long editsCount = 0; + // How many edits to apply before we send a progress report. + int reportInterval = + this.conf.getInt("hbase.hstore.report.interval.edits", 2000); + while (logReader.next(key, val)) { + maxSeqIdInLog = Math.max(maxSeqIdInLog, key.getLogSeqNum()); + if (key.getLogSeqNum() <= maxSeqID) { + skippedEdits++; + continue; + } + // Check this edit is for me. Also, guard against writing + // METACOLUMN info such as HBASE::CACHEFLUSH entries + byte [] column = val.getColumn(); + if (val.isTransactionEntry() || Bytes.equals(column, HLog.METACOLUMN) + || !Bytes.equals(key.getRegionName(), regioninfo.getRegionName()) + || !HStoreKey.matchingFamily(family.getName(), column)) { + continue; + } + HStoreKey k = new HStoreKey(key.getRow(), column, val.getTimestamp()); + reconstructedCache.put(k, val.getVal()); + editsCount++; + // Every 2k edits, tell the reporter we're making progress. + // Have seen 60k edits taking 3minutes to complete. + if (reporter != null && (editsCount % reportInterval) == 0) { + reporter.progress(); + } + } + if (LOG.isDebugEnabled()) { + LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits + + " because sequence id <= " + maxSeqID); + } + } finally { + logReader.close(); + } + + if (reconstructedCache.size() > 0) { + // We create a "virtual flush" at maxSeqIdInLog+1. + if (LOG.isDebugEnabled()) { + LOG.debug("flushing reconstructionCache"); + } + internalFlushCache(reconstructedCache, maxSeqIdInLog + 1); + } + } + + /* + * Creates a series of StoreFile loaded from the given directory. + * @throws IOException + */ + private Map loadStoreFiles() + throws IOException { + Map results = new HashMap(); + FileStatus files[] = this.fs.listStatus(this.homedir); + for (int i = 0; files != null && i < files.length; i++) { + // Skip directories. + if (files[i].isDir()) { + continue; + } + Path p = files[i].getPath(); + // Check for empty file. Should never be the case but can happen + // after data loss in hdfs for whatever reason (upgrade, etc.): HBASE-646 + if (this.fs.getFileStatus(p).getLen() <= 0) { + LOG.warn("Skipping " + p + " because its empty. 
HBASE-646 DATA LOSS?"); + continue; + } + StoreFile curfile = new StoreFile(fs, p); + long storeSeqId = curfile.getMaxSequenceId(); + if (storeSeqId > this.maxSeqId) { + this.maxSeqId = storeSeqId; + } + long length = curfile.getReader().length(); + this.storeSize += length; + if (LOG.isDebugEnabled()) { + LOG.debug("loaded " + FSUtils.getPath(p) + ", isReference=" + + curfile.isReference() + ", sequence id=" + storeSeqId + + ", length=" + length + ", majorCompaction=" + + curfile.isMajorCompaction()); + } + results.put(Long.valueOf(storeSeqId), curfile); + } + return results; + } + + /** + * Adds a value to the memcache + * + * @param key + * @param value + * @return memcache size delta + */ + protected long add(HStoreKey key, byte[] value) { + lock.readLock().lock(); + try { + return this.memcache.add(key, value); + } finally { + lock.readLock().unlock(); + } + } + + /** + * @return All store files. + */ + NavigableMap getStorefiles() { + return this.storefiles; + } + + /** + * Close all the readers + * + * We don't need to worry about subsequent requests because the HRegion holds + * a write lock that will prevent any more reads or writes. + * + * @throws IOException + */ + List close() throws IOException { + this.lock.writeLock().lock(); + try { + ArrayList result = + new ArrayList(storefiles.values()); + // Clear so metrics doesn't find them. + this.storefiles.clear(); + for (StoreFile f: result) { + f.close(); + } + LOG.debug("closed " + this.storeNameStr); + return result; + } finally { + this.lock.writeLock().unlock(); + } + } + + /** + * Snapshot this stores memcache. Call before running + * {@link #flushCache(long)} so it has some work to do. + */ + void snapshot() { + this.memcache.snapshot(); + } + + /** + * Write out current snapshot. Presumes {@link #snapshot()} has been called + * previously. + * @param logCacheFlushId flush sequence number + * @return true if a compaction is needed + * @throws IOException + */ + boolean flushCache(final long logCacheFlushId) throws IOException { + // Get the snapshot to flush. Presumes that a call to + // this.memcache.snapshot() has happened earlier up in the chain. + SortedMap cache = this.memcache.getSnapshot(); + // If an exception happens flushing, we let it out without clearing + // the memcache snapshot. The old snapshot will be returned when we say + // 'snapshot', the next time flush comes around. + StoreFile sf = internalFlushCache(cache, logCacheFlushId); + if (sf == null) { + return false; + } + // Add new file to store files. Clear snapshot too while we have the + // Store write lock. + int size = updateStorefiles(logCacheFlushId, sf, cache); + return size >= this.compactionThreshold; + } + + /* + * @param cache + * @param logCacheFlushId + * @return StoreFile created. + * @throws IOException + */ + private StoreFile internalFlushCache(final SortedMap cache, + final long logCacheFlushId) + throws IOException { + HFile.Writer writer = null; + long flushed = 0; + // Don't flush if there are no entries. + if (cache.size() == 0) { + return null; + } + long now = System.currentTimeMillis(); + // TODO: We can fail in the below block before we complete adding this + // flush to list of store files. Add cleanup of anything put on filesystem + // if we fail. + synchronized (flushLock) { + // A. 
Write the map out to the disk + writer = StoreFile.getWriter(this.fs, this.homedir); + int entries = 0; + try { + for (Map.Entry es: cache.entrySet()) { + HStoreKey curkey = es.getKey(); + byte[] bytes = es.getValue(); + if (!isExpired(curkey, ttl, now)) { + writer.append(curkey.getBytes(), bytes); + entries++; + flushed += this.memcache.heapSize(curkey, bytes, null); + } + } + // B. Write out the log sequence number that corresponds to this output + // MapFile. The MapFile is current up to and including logCacheFlushId. + StoreFile.appendMetadata(writer, logCacheFlushId); + } finally { + writer.close(); + } + } + StoreFile sf = new StoreFile(this.fs, writer.getPath()); + this.storeSize += sf.getReader().length(); + if(LOG.isDebugEnabled()) { + LOG.debug("Added " + sf + ", entries=" + sf.getReader().getEntries() + + ", sequenceid=" + logCacheFlushId + + ", memsize=" + StringUtils.humanReadableInt(flushed) + + ", filesize=" + StringUtils.humanReadableInt(sf.getReader().length()) + + " to " + this.regioninfo.getRegionNameAsString()); + } + return sf; + } + + /* + * Change storefiles adding into place the Reader produced by this new flush. + * @param logCacheFlushId + * @param sf + * @param cache That was used to make the passed file p. + * @throws IOException + * @return Count of store files. + */ + private int updateStorefiles(final long logCacheFlushId, + final StoreFile sf, final SortedMap cache) + throws IOException { + int count = 0; + this.lock.writeLock().lock(); + try { + this.storefiles.put(Long.valueOf(logCacheFlushId), sf); + count = this.storefiles.size(); + // Tell listeners of the change in readers. + notifyChangedReadersObservers(); + this.memcache.clearSnapshot(cache); + return count; + } finally { + this.lock.writeLock().unlock(); + } + } + + /* + * Notify all observers that set of Readers has changed. + * @throws IOException + */ + private void notifyChangedReadersObservers() throws IOException { + for (ChangedReadersObserver o: this.changedReaderObservers) { + o.updateReaders(); + } + } + + /* + * @param o Observer who wants to know about changes in set of Readers + */ + void addChangedReaderObserver(ChangedReadersObserver o) { + this.changedReaderObservers.add(o); + } + + /* + * @param o Observer no longer interested in changes in set of Readers. + */ + void deleteChangedReaderObserver(ChangedReadersObserver o) { + if (!this.changedReaderObservers.remove(o)) { + LOG.warn("Not in set" + o); + } + } + + ////////////////////////////////////////////////////////////////////////////// + // Compaction + ////////////////////////////////////////////////////////////////////////////// + + /** + * Compact the StoreFiles. This method may take some time, so the calling + * thread must be able to block for long periods. + * + *

During this time, the Store can work as usual, getting values from + * MapFiles and writing new MapFiles from the Memcache. + * + * Existing MapFiles are not destroyed until the new compacted TreeMap is + * completely written-out to disk. + * + * The compactLock prevents multiple simultaneous compactions. + * The structureLock prevents us from interfering with other write operations. + * + * We don't want to hold the structureLock for the whole time, as a compact() + * can be lengthy and we want to allow cache-flushes during this period. + * + * @param mc True to force a major compaction regardless of + * thresholds + * @return row to split around if a split is needed, null otherwise + * @throws IOException + */ + StoreSize compact(final boolean mc) throws IOException { + boolean forceSplit = this.regioninfo.shouldSplit(false); + boolean majorcompaction = mc; + synchronized (compactLock) { + long maxId = -1; + // filesToCompact are sorted oldest to newest. + List filesToCompact = null; + filesToCompact = new ArrayList(this.storefiles.values()); + if (filesToCompact.size() <= 0) { + LOG.debug(this.storeNameStr + ": no store files to compact"); + return null; + } + // The max-sequenceID in any of the to-be-compacted TreeMaps is the + // last key of storefiles. + maxId = this.storefiles.lastKey().longValue(); + // Check to see if we need to do a major compaction on this region. + // If so, change doMajorCompaction to true to skip the incremental + // compacting below. Only check if doMajorCompaction is not true. + if (!majorcompaction) { + majorcompaction = isMajorCompaction(filesToCompact); + } + boolean references = hasReferences(filesToCompact); + if (!majorcompaction && !references && + (forceSplit || (filesToCompact.size() < compactionThreshold))) { + return checkSplit(forceSplit); + } + if (!fs.exists(compactionDir) && !fs.mkdirs(compactionDir)) { + LOG.warn("Mkdir on " + compactionDir.toString() + " failed"); + return checkSplit(forceSplit); + } + + // HBASE-745, preparing all store file sizes for incremental compacting + // selection. + int countOfFiles = filesToCompact.size(); + long totalSize = 0; + long[] fileSizes = new long[countOfFiles]; + long skipped = 0; + int point = 0; + for (int i = 0; i < countOfFiles; i++) { + StoreFile file = filesToCompact.get(i); + Path path = file.getPath(); + if (path == null) { + LOG.warn("Path is null for " + file); + return null; + } + long len = file.getReader().length(); + fileSizes[i] = len; + totalSize += len; + } + if (!majorcompaction && !references) { + // Here we select files for incremental compaction. + // The rule is: if the largest(oldest) one is more than twice the + // size of the second, skip the largest, and continue to next..., + // until we meet the compactionThreshold limit. 
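+        // Illustrative example (numbers are made up, not from a real run):
+        // with fileSizes = [300, 100, 60, 50] and maxFilesToCompact = 10,
+        // 300 >= 2 * 100 so point advances to 1 and the 300-size file is
+        // skipped; only the remaining [100, 60, 50] are compacted together.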
+ for (point = 0; point < countOfFiles - 1; point++) { + if ((fileSizes[point] < fileSizes[point + 1] * 2) && + (countOfFiles - point) <= maxFilesToCompact) { + break; + } + skipped += fileSizes[point]; + } + filesToCompact = new ArrayList(filesToCompact.subList(point, + countOfFiles)); + if (filesToCompact.size() <= 1) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipped compaction of 1 file; compaction size of " + + this.storeNameStr + ": " + + StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + + " files, size: " + skipped); + } + return checkSplit(forceSplit); + } + if (LOG.isDebugEnabled()) { + LOG.debug("Compaction size of " + this.storeNameStr + ": " + + StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + + " file(s), size: " + skipped); + } + } + + // Step through them, writing to the brand-new file + HFile.Writer writer = StoreFile.getWriter(this.fs, this.homedir); + if (LOG.isDebugEnabled()) { + LOG.debug("Started compaction of " + filesToCompact.size() + " file(s)" + + (references? ", hasReferences=true,": " ") + " into " + + FSUtils.getPath(writer.getPath())); + } + try { + compact(writer, filesToCompact, majorcompaction); + } finally { + // Now, write out an HSTORE_LOGINFOFILE for the brand-new TreeMap. + StoreFile.appendMetadata(writer, maxId, majorcompaction); + writer.close(); + } + + // Move the compaction into place. + completeCompaction(filesToCompact, writer); + if (LOG.isDebugEnabled()) { + LOG.debug("Completed " + (majorcompaction? "major": "") + + " compaction of " + this.storeNameStr + + " store size is " + StringUtils.humanReadableInt(storeSize)); + } + } + return checkSplit(forceSplit); + } + + /* + * @param files + * @return True if any of the files in files are References. + */ + private boolean hasReferences(Collection files) { + if (files != null && files.size() > 0) { + for (StoreFile hsf: files) { + if (hsf.isReference()) { + return true; + } + } + } + return false; + } + + /* + * Gets lowest timestamp from files in a dir + * + * @param fs + * @param dir + * @throws IOException + */ + private static long getLowestTimestamp(FileSystem fs, Path dir) + throws IOException { + FileStatus[] stats = fs.listStatus(dir); + if (stats == null || stats.length == 0) { + return 0l; + } + long lowTimestamp = Long.MAX_VALUE; + for (int i = 0; i < stats.length; i++) { + long timestamp = stats[i].getModificationTime(); + if (timestamp < lowTimestamp){ + lowTimestamp = timestamp; + } + } + return lowTimestamp; + } + + /* + * @return True if we should run a major compaction. + */ + boolean isMajorCompaction() throws IOException { + List filesToCompact = null; + // filesToCompact are sorted oldest to newest. + filesToCompact = new ArrayList(this.storefiles.values()); + return isMajorCompaction(filesToCompact); + } + + /* + * @param filesToCompact Files to compact. Can be null. + * @return True if we should run a major compaction. + */ + private boolean isMajorCompaction(final List filesToCompact) + throws IOException { + boolean result = false; + if (filesToCompact == null || filesToCompact.size() <= 0) { + return result; + } + long lowTimestamp = getLowestTimestamp(fs, + filesToCompact.get(0).getPath().getParent()); + long now = System.currentTimeMillis(); + if (lowTimestamp > 0l && lowTimestamp < (now - this.majorCompactionTime)) { + // Major compaction time has elapsed. 
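+      // With the default MAJOR_COMPACTION_PERIOD fallback of 86400000 ms (one
+      // day), this store becomes a candidate once its oldest file is over a
+      // day old, unless it is already a single major-compacted file still
+      // within the family TTL, in which case rewriting it gains nothing.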
+ long elapsedTime = now - lowTimestamp; + if (filesToCompact.size() == 1 && + filesToCompact.get(0).isMajorCompaction() && + (this.ttl == HConstants.FOREVER || elapsedTime < this.ttl)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping major compaction of " + this.storeNameStr + + " because one (major) compacted file only and elapsedTime " + + elapsedTime + "ms is < ttl=" + this.ttl); + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("Major compaction triggered on store " + this.storeNameStr + + "; time since last major compaction " + (now - lowTimestamp) + "ms"); + } + result = true; + } + } + return result; + } + + /* + * @param r StoreFile list to reverse + * @return A new array of content of readers, reversed. + */ + private StoreFile [] reverse(final List r) { + List copy = new ArrayList(r); + Collections.reverse(copy); + // MapFile.Reader is instance of StoreFileReader so this should be ok. + return copy.toArray(new StoreFile[0]); + } + + /* + * @param rdrs List of StoreFiles + * @param keys Current keys + * @param done Which readers are done + * @return The lowest current key in passed rdrs + */ + private int getLowestKey(final HFileScanner [] rdrs, final ByteBuffer [] keys, + final boolean [] done) { + int lowestKey = -1; + for (int i = 0; i < rdrs.length; i++) { + if (done[i]) { + continue; + } + if (lowestKey < 0) { + lowestKey = i; + } else { + RawComparator c = rdrs[i].getReader().getComparator(); + if (c.compare(keys[i].array(), keys[i].arrayOffset(), keys[i].limit(), + keys[lowestKey].array(), keys[lowestKey].arrayOffset(), + keys[lowestKey].limit()) < 0) { + lowestKey = i; + } + } + } + return lowestKey; + } + + /* + * Compact a list of StoreFiles. + * + * We work by iterating through the readers in parallel looking at newest + * store file first. We always increment the lowest-ranked one. Updates to a + * single row/column will appear ranked by timestamp. + * @param compactedOut Where to write compaction. + * @param pReaders List of readers sorted oldest to newest. + * @param majorCompaction True to force a major compaction regardless of + * thresholds + * @throws IOException + */ + private void compact(final HFile.Writer compactedOut, + final List pReaders, final boolean majorCompaction) + throws IOException { + // Reverse order so newest store file is first. + StoreFile[] files = reverse(pReaders); + HFileScanner[] rdrs = new HFileScanner[files.length]; + ByteBuffer[] keys = new ByteBuffer[rdrs.length]; + ByteBuffer[] vals = new ByteBuffer[rdrs.length]; + boolean[] done = new boolean[rdrs.length]; + // Now, advance through the readers in order. This will have the + // effect of a run-time sort of the entire dataset. + int numDone = 0; + for (int i = 0; i < rdrs.length; i++) { + rdrs[i] = files[i].getReader().getScanner(); + done[i] = !rdrs[i].seekTo(); + if (done[i]) { + numDone++; + } else { + keys[i] = rdrs[i].getKey(); + vals[i] = rdrs[i].getValue(); + } + } + + long now = System.currentTimeMillis(); + int timesSeen = 0; + HStoreKey lastSeen = new HStoreKey(); + HStoreKey lastDelete = null; + while (numDone < done.length) { + // Get lowest key in all store files. + int lowestKey = getLowestKey(rdrs, keys, done); + // TODO: Suboptimal. And below where we are going from ByteBuffer to + // byte array. FIX!! Can we get rid of HSK instantiations? + HStoreKey hsk = HStoreKey.create(keys[lowestKey]); + // If its same row and column as last key, increment times seen. 
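+        // timesSeen counts cells seen so far for this row/column; on a major
+        // compaction only the newest family.getMaxVersions() of them are kept,
+        // and expired or deleted cells hand back their slot via the
+        // timesSeen-- below (HBASE-855) so they don't count against the limit.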
+ if (HStoreKey.equalsTwoRowKeys(lastSeen.getRow(), hsk.getRow()) + && Bytes.equals(lastSeen.getColumn(), hsk.getColumn())) { + timesSeen++; + // Reset last delete if not exact timestamp -- lastDelete only stops + // exactly the same key making it out to the compacted store file. + if (lastDelete != null + && lastDelete.getTimestamp() != hsk.getTimestamp()) { + lastDelete = null; + } + } else { + timesSeen = 1; + lastDelete = null; + } + + // Don't write empty rows or columns. Only remove cells on major + // compaction. Remove if expired of > VERSIONS + if (hsk.getRow().length != 0 && hsk.getColumn().length != 0) { + ByteBuffer value = vals[lowestKey]; + if (!majorCompaction) { + // Write out all values if not a major compaction. + compactedOut.append(hsk.getBytes(), Bytes.toBytes(value)); + } else { + boolean expired = false; + boolean deleted = false; + if (timesSeen <= family.getMaxVersions() + && !(expired = isExpired(hsk, ttl, now))) { + // If this value key is same as a deleted key, skip + if (lastDelete != null && hsk.equals(lastDelete)) { + deleted = true; + } else if (HLogEdit.isDeleted(value)) { + // If a deleted value, skip + deleted = true; + lastDelete = hsk; + } else { + compactedOut.append(hsk.getBytes(), Bytes.toBytes(value)); + } + } + if (expired || deleted) { + // HBASE-855 remove one from timesSeen because it did not make it + // past expired check -- don't count against max versions. + timesSeen--; + } + } + } + + // Update last-seen items + lastSeen = hsk; + + // Advance the smallest key. If that reader's all finished, then + // mark it as done. + if (!rdrs[lowestKey].next()) { + done[lowestKey] = true; + rdrs[lowestKey] = null; + numDone++; + } else { + keys[lowestKey] = rdrs[lowestKey].getKey(); + vals[lowestKey] = rdrs[lowestKey].getValue(); + } + } + } + + /* + * It's assumed that the compactLock will be acquired prior to calling this + * method! Otherwise, it is not thread-safe! + * + * It works by processing a compaction that's been written to disk. + * + *

It is usually invoked at the end of a compaction, but might also be + * invoked at Store startup, if the prior execution died midway through. + * + *

Moving the compacted StoreFile into place means: + *

+   * 1) Moving the new compacted StoreFile into place.
+   * 2) Unloading all replaced StoreFiles, closing them and collecting the list to delete.
+   * 3) Loading the new StoreFile into the storefiles map.
+   * 4) Computing the new store size.
+   * 
+ * + * @param compactedFiles list of files that were compacted + * @param compactedFile HStoreFile that is the result of the compaction + * @throws IOException + */ + private void completeCompaction(final List compactedFiles, + final HFile.Writer compactedFile) + throws IOException { + // 1. Moving the new files into place. + Path p = null; + try { + p = StoreFile.rename(this.fs, compactedFile.getPath(), + StoreFile.getRandomFilename(fs, this.homedir)); + } catch (IOException e) { + LOG.error("Failed move of compacted file " + compactedFile.getPath(), e); + return; + } + StoreFile finalCompactedFile = new StoreFile(this.fs, p); + this.lock.writeLock().lock(); + try { + try { + // 3. Loading the new TreeMap. + // Change this.storefiles so it reflects new state but do not + // delete old store files until we have sent out notification of + // change in case old files are still being accessed by outstanding + // scanners. + for (Map.Entry e: this.storefiles.entrySet()) { + if (compactedFiles.contains(e.getValue())) { + this.storefiles.remove(e.getKey()); + } + } + // Add new compacted Reader and store file. + Long orderVal = Long.valueOf(finalCompactedFile.getMaxSequenceId()); + this.storefiles.put(orderVal, finalCompactedFile); + // Tell observers that list of Readers has changed. + notifyChangedReadersObservers(); + // Finally, delete old store files. + for (StoreFile hsf: compactedFiles) { + hsf.delete(); + } + } catch (IOException e) { + e = RemoteExceptionHandler.checkIOException(e); + LOG.error("Failed replacing compacted files for " + + this.storeNameStr + + ". Compacted file is " + finalCompactedFile.toString() + + ". Files replaced are " + compactedFiles.toString() + + " some of which may have been already removed", e); + } + // 4. Compute new store size + this.storeSize = 0L; + for (StoreFile hsf : this.storefiles.values()) { + this.storeSize += hsf.getReader().length(); + } + } finally { + this.lock.writeLock().unlock(); + } + } + + // //////////////////////////////////////////////////////////////////////////// + // Accessors. + // (This is the only section that is directly useful!) + ////////////////////////////////////////////////////////////////////////////// + + /** + * Return all the available columns for the given key. The key indicates a + * row and timestamp, but not a column name. + * + * The returned object should map column names to Cells. + */ + void getFull(HStoreKey key, final Set columns, + final int numVersions, Map results) + throws IOException { + int versions = versionsToReturn(numVersions); + Map deletes = + new TreeMap(Bytes.BYTES_COMPARATOR); + // if the key is null, we're not even looking for anything. return. + if (key == null) { + return; + } + + this.lock.readLock().lock(); + // get from the memcache first. 
+ this.memcache.getFull(key, columns, versions, deletes, results); + try { + Map m = this.storefiles.descendingMap(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext();) { + getFullFromStoreFile(i.next().getValue(), key, columns, versions, deletes, results); + } + } finally { + this.lock.readLock().unlock(); + } + } + + private void getFullFromStoreFile(StoreFile f, HStoreKey key, + Set columns, int numVersions, Map deletes, + Map results) + throws IOException { + long now = System.currentTimeMillis(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, key.getBytes())) { + return; + } + do { + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + byte[] readcol = readkey.getColumn(); + + // if we're looking for this column (or all of them), and there isn't + // already a value for this column in the results map or there is a value + // but we haven't collected enough versions yet, and the key we + // just read matches, then we'll consider it + if ((columns == null || columns.contains(readcol)) && + (!results.containsKey(readcol) || + results.get(readcol).getNumValues() < numVersions) && + key.matchesWithoutColumn(readkey)) { + // if the value of the cell we're looking at right now is a delete, + // we need to treat it differently + ByteBuffer value = scanner.getValue(); + if (HLogEdit.isDeleted(value)) { + // if it's not already recorded as a delete or recorded with a more + // recent delete timestamp, record it for later + if (!deletes.containsKey(readcol) + || deletes.get(readcol).longValue() < readkey.getTimestamp()) { + deletes.put(readcol, Long.valueOf(readkey.getTimestamp())); + } + } else if (!(deletes.containsKey(readcol) && deletes.get(readcol) + .longValue() >= readkey.getTimestamp())) { + // So the cell itself isn't a delete, but there may be a delete + // pending from earlier in our search. Only record this result if + // there aren't any pending deletes. + if (!(deletes.containsKey(readcol) && deletes.get(readcol) + .longValue() >= readkey.getTimestamp())) { + if (!isExpired(readkey, ttl, now)) { + if (!results.containsKey(readcol)) { + results.put(readcol, new Cell(value, readkey.getTimestamp())); + } else { + results.get(readcol).add(Bytes.toBytes(value), + readkey.getTimestamp()); + } + } + } + } + } else if (HStoreKey.compareTwoRowKeys(key.getRow(), readkey.getRow()) < 0) { + // if we've crossed into the next row, then we can just stop + // iterating + break; + } + } while (scanner.next()); + } + + /* + * @param wantedVersions How many versions were asked for. + * @return wantedVersions or this families' MAX_VERSIONS. + */ + private int versionsToReturn(final int wantedVersions) { + if (wantedVersions <= 0) { + throw new IllegalArgumentException("Number of versions must be > 0"); + } + // Make sure we do not return more than maximum versions for this store. + return wantedVersions > this.family.getMaxVersions()? + this.family.getMaxVersions(): wantedVersions; + } + + /** + * Get the value for the indicated HStoreKey. Grab the target value and the + * previous numVersions - 1 values, as well. + * + * Use {@link HConstants.ALL_VERSIONS} to retrieve all versions. + * @param key + * @param numVersions Number of versions to fetch. Must be > 0. + * @return values for the specified versions + * @throws IOException + */ + Cell[] get(final HStoreKey key, final int numVersions) throws IOException { + // This code below is very close to the body of the getKeys method. 
Any + // changes in the flow below should also probably be done in getKeys. + // TODO: Refactor so same code used. + long now = System.currentTimeMillis(); + int versions = versionsToReturn(numVersions); + // Keep a list of deleted cell keys. We need this because as we go through + // the memcache and store files, the cell with the delete marker may be + // in one store and the old non-delete cell value in a later store. + // If we don't keep around the fact that the cell was deleted in a newer + // record, we end up returning the old value if user is asking for more + // than one version. This List of deletes should not be large since we + // are only keeping rows and columns that match those set on the get and + // which have delete values. If memory usage becomes an issue, could + // redo as bloom filter. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // Check the memcache + List results = this.memcache.get(key, versions, deletes, now); + // If we got sufficient versions from memcache, return. + if (results.size() == versions) { + return results.toArray(new Cell[results.size()]); + } + Map m = this.storefiles.descendingMap(); + byte [] keyBytes = key.getBytes(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext() && !hasEnoughVersions(versions, results);) { + StoreFile f = i.next().getValue(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, keyBytes)) { + continue; + } + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + if (!readkey.matchesRowCol(key)) { + continue; + } + if (get(readkey, scanner.getValue(), versions, results, deletes, now)) { + break; + } + while (scanner.next()) { + readkey = HStoreKey.create(scanner.getKey()); + if (!readkey.matchesRowCol(key)) { + break; + } + if (get(readkey, scanner.getValue(), versions, results, deletes, now)) { + break; + } + } + } + return results.size() == 0 ? + null : results.toArray(new Cell[results.size()]); + } finally { + this.lock.readLock().unlock(); + } + } + + /* + * Look at one key/value. + * @param key + * @param value + * @param versions + * @param results + * @param deletes + * @param now + * @return True if we have enough versions. + */ + private boolean get(final HStoreKey key, ByteBuffer value, + final int versions, final List results, + final Set deletes, final long now) { + if (!HLogEdit.isDeleted(value)) { + if (notExpiredAndNotInDeletes(this.ttl, key, now, deletes)) { + results.add(new Cell(value, key.getTimestamp())); + } + // Perhaps only one version is wanted. I could let this + // test happen later in the for loop test but it would cost + // the allocation of an ImmutableBytesWritable. + if (hasEnoughVersions(versions, results)) { + return true; + } + } else { + // Is this copy necessary? + deletes.add(new HStoreKey(key)); + } + return false; + } + + /* + * Small method to check if we are over the max number of versions + * or we acheived this family max versions. + * The later happens when we have the situation described in HBASE-621. + * @param versions + * @param c + * @return + */ + private boolean hasEnoughVersions(final int versions, final List c) { + return c.size() >= versions; + } + + /** + * Get versions of keys matching the origin key's + * row/column/timestamp and those of an older vintage. + * @param origin Where to start searching. + * @param versions How many versions to return. Pass + * {@link HConstants#ALL_VERSIONS} to retrieve all. + * @param now + * @param columnPattern regex pattern for column matching. 
if columnPattern + * is not null, we use column pattern to match columns. And the columnPattern + * only works when origin's column is null or its length is zero. + * @return Matching keys. + * @throws IOException + */ + public List getKeys(final HStoreKey origin, final int versions, + final long now, final Pattern columnPattern) + throws IOException { + // This code below is very close to the body of the get method. Any + // changes in the flow below should also probably be done in get. + // TODO: Refactor so same code used. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // Check the memcache + List keys = + this.memcache.getKeys(origin, versions, deletes, now, columnPattern); + // If we got sufficient versions from memcache, return. + if (keys.size() >= versions) { + return keys; + } + Map m = this.storefiles.descendingMap(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext() && keys.size() < versions;) { + StoreFile f = i.next().getValue(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, origin.getBytes())) { + continue; + } + do { + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + // if the row and column matches, we might want this one. + if (rowMatches(origin, readkey)) { + // if the column pattern is not null, we use it for column matching. + // we will skip the keys whose column doesn't match the pattern. + if (columnPattern != null) { + if (!(columnPattern. + matcher(Bytes.toString(readkey.getColumn())).matches())) { + continue; + } + } + // if the cell address matches, then we definitely want this key. + if (cellMatches(origin, readkey)) { + ByteBuffer readval = scanner.getValue(); + // Store key if isn't deleted or superceded by memcache + if (!HLogEdit.isDeleted(readval)) { + if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { + keys.add(readkey); + } + if (keys.size() >= versions) { + break; + } + } else { + deletes.add(readkey); + } + } else { + // the cell doesn't match, but there might be more with different + // timestamps, so move to the next key + continue; + } + } else { + // the row doesn't match, so we've gone too far. + break; + } + } while (scanner.next()); // advance to the next key + } + return keys; + } finally { + this.lock.readLock().unlock(); + } + } + + /** + * Find the key that matches row exactly, or the one that immediately + * preceeds it. WARNING: Only use this method on a table where writes occur + * with stricly increasing timestamps. This method assumes this pattern of + * writes in order to make it reasonably performant. + * @param row + * @return Found row + * @throws IOException + */ + @SuppressWarnings("unchecked") + byte [] getRowKeyAtOrBefore(final byte [] row) + throws IOException{ + // Map of HStoreKeys that are candidates for holding the row key that + // most closely matches what we're looking for. We'll have to update it as + // deletes are found all over the place as we go along before finally + // reading the best key out of it at the end. + NavigableMap candidateKeys = new TreeMap( + new HStoreKey.HStoreKeyWritableComparator()); + + // Keep a list of deleted cell keys. We need this because as we go through + // the store files, the cell with the delete marker may be in one file and + // the old non-delete cell value in a later store file. If we don't keep + // around the fact that the cell was deleted in a newer record, we end up + // returning the old value if user is asking for more than one version. 
+ // This List of deletes should not be large since we are only keeping rows + // and columns that match those set on the scanner and which have delete + // values. If memory usage becomes an issue, could redo as bloom filter. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // First go to the memcache. Pick up deletes and candidates. + this.memcache.getRowKeyAtOrBefore(row, candidateKeys, deletes); + // Process each store file. Run through from newest to oldest. + // This code below is very close to the body of the getKeys method. + Map m = this.storefiles.descendingMap(); + for (Map.Entry e: m.entrySet()) { + // Update the candidate keys from the current map file + rowAtOrBeforeFromStoreFile(e.getValue(), row, candidateKeys, deletes); + } + // Return the best key from candidateKeys + byte [] result = + candidateKeys.isEmpty()? null: candidateKeys.lastKey().getRow(); + return result; + } finally { + this.lock.readLock().unlock(); + } + } + + /* + * Check an individual MapFile for the row at or before a given key + * and timestamp + * @param f + * @param row + * @param candidateKeys + * @throws IOException + */ + private void rowAtOrBeforeFromStoreFile(final StoreFile f, + final byte [] row, final SortedMap candidateKeys, + final Set deletes) + throws IOException { + HFileScanner scanner = f.getReader().getScanner(); + // TODO: FIX THIS PROFLIGACY!!! + if (!scanner.seekBefore(new HStoreKey(row).getBytes())) { + return; + } + long now = System.currentTimeMillis(); + HStoreKey startKey = HStoreKey.create(scanner.getKey()); + // if there aren't any candidate keys yet, we'll do some things different + if (candidateKeys.isEmpty()) { + rowAtOrBeforeCandidate(startKey, f, row, candidateKeys, deletes, now); + } else { + rowAtOrBeforeWithCandidates(startKey, f, row, candidateKeys, deletes, now); + } + } + + /* Find a candidate for row that is at or before passed row in passed + * mapfile. + * @param startKey First key in the mapfile. + * @param map + * @param row + * @param candidateKeys + * @param now + * @throws IOException + */ + private void rowAtOrBeforeCandidate(final HStoreKey startKey, + final StoreFile f, final byte[] row, + final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + // if the row we're looking for is past the end of this mapfile, set the + // search key to be the last key. If its a deleted key, then we'll back + // up to the row before and return that. + HStoreKey finalKey = HStoreKey.create(f.getReader().getLastKey()); + HStoreKey searchKey = null; + if (HStoreKey.compareTwoRowKeys(finalKey.getRow(), row) < 0) { + searchKey = finalKey; + } else { + searchKey = new HStoreKey(row); + if (searchKey.compareTo(startKey) < 0) { + searchKey = startKey; + } + } + rowAtOrBeforeCandidate(f, searchKey, candidateKeys, deletes, now); + } + + /* + * @param ttlSetting + * @param hsk + * @param now + * @param deletes + * @return True if key has not expired and is not in passed set of deletes. + */ + static boolean notExpiredAndNotInDeletes(final long ttl, + final HStoreKey hsk, final long now, final Set deletes) { + return !isExpired(hsk, ttl, now) && + (deletes == null || !deletes.contains(hsk)); + } + + static boolean isExpired(final HStoreKey hsk, final long ttl, + final long now) { + return ttl != HConstants.FOREVER && now > hsk.getTimestamp() + ttl; + } + + /* Find a candidate for row that is at or before passed key, sk, in mapfile. + * @param f + * @param sk Key to go search the mapfile with. 
+ * @param candidateKeys + * @param now + * @throws IOException + * @see {@link #rowAtOrBeforeCandidate(HStoreKey, org.apache.hadoop.io.MapFile.Reader, byte[], SortedMap, long)} + */ + private void rowAtOrBeforeCandidate(final StoreFile f, + final HStoreKey sk, final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + HStoreKey searchKey = sk; + HStoreKey readkey = null; + HStoreKey knownNoGoodKey = null; + HFileScanner scanner = f.getReader().getScanner(); + for (boolean foundCandidate = false; !foundCandidate;) { + // Seek to the exact row, or the one that would be immediately before it + int result = scanner.seekTo(searchKey.getBytes()); + if (result < 0) { + // Not in file. + continue; + } + HStoreKey deletedOrExpiredRow = null; + do { + readkey = HStoreKey.create(scanner.getKey()); + ByteBuffer value = scanner.getValue(); + // If we have an exact match on row, and it's not a delete, save this + // as a candidate key + if (HStoreKey.equalsTwoRowKeys(readkey.getRow(), searchKey.getRow())) { + if (!HLogEdit.isDeleted(value)) { + if (handleNonDelete(readkey, now, deletes, candidateKeys)) { + foundCandidate = true; + // NOTE! Continue. + continue; + } + } + HStoreKey copy = addCopyToDeletes(readkey, deletes); + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = copy; + } + } else if (HStoreKey.compareTwoRowKeys(readkey.getRow(), + searchKey.getRow()) > 0) { + // if the row key we just read is beyond the key we're searching for, + // then we're done. + break; + } else { + // So, the row key doesn't match, but we haven't gone past the row + // we're seeking yet, so this row is a candidate for closest + // (assuming that it isn't a delete). + if (!HLogEdit.isDeleted(value)) { + if (handleNonDelete(readkey, now, deletes, candidateKeys)) { + foundCandidate = true; + // NOTE: Continue + continue; + } + } + HStoreKey copy = addCopyToDeletes(readkey, deletes); + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = copy; + } + } + } while(scanner.next() && (knownNoGoodKey == null || + readkey.compareTo(knownNoGoodKey) < 0)); + + // If we get here and have no candidates but we did find a deleted or + // expired candidate, we need to look at the key before that + if (!foundCandidate && deletedOrExpiredRow != null) { + knownNoGoodKey = deletedOrExpiredRow; + if (!scanner.seekBefore(deletedOrExpiredRow.getBytes())) { + // Is this right? + break; + } + searchKey = HStoreKey.create(scanner.getKey()); + } else { + // No candidates and no deleted or expired candidates. Give up. + break; + } + } + + // Arriving here just means that we consumed the whole rest of the map + // without going "past" the key we're searching for. we can just fall + // through here. + } + + /* + * @param key Key to copy and add to deletes + * @param deletes + * @return Instance of the copy added to deletes + */ + private HStoreKey addCopyToDeletes(final HStoreKey key, + final Set deletes) { + HStoreKey copy = new HStoreKey(key); + deletes.add(copy); + return copy; + } + + private void rowAtOrBeforeWithCandidates(final HStoreKey startKey, + final StoreFile f, final byte[] row, + final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + // if there are already candidate keys, we need to start our search + // at the earliest possible key so that we can discover any possible + // deletes for keys between the start and the search key. 
Back up to start + // of the row in case there are deletes for this candidate in this mapfile + // BUT do not backup before the first key in the store file. + // TODO: FIX THIS PROFLIGATE OBJECT MAKING!!! + byte [] searchKey = + new HStoreKey(candidateKeys.firstKey().getRow()).getBytes(); + if (f.getReader().getComparator().compare(searchKey, 0, searchKey.length, + startKey.getRow(), 0, startKey.getRow().length) < 0) { + searchKey = startKey.getBytes(); + } + + // Seek to the exact row, or the one that would be immediately before it + HFileScanner scanner = f.getReader().getScanner(); + int result = scanner.seekTo(searchKey); + if (result < 0) { + // Key is before start of this file. Return. + return; + } + do { + HStoreKey k = HStoreKey.create(scanner.getKey()); + ByteBuffer v = scanner.getValue(); + // if we have an exact match on row, and it's not a delete, save this + // as a candidate key + if (HStoreKey.equalsTwoRowKeys(k.getRow(), row)) { + handleKey(k, v, now, deletes, candidateKeys); + } else if (HStoreKey.compareTwoRowKeys(k.getRow(), row) > 0 ) { + // if the row key we just read is beyond the key we're searching for, + // then we're done. + break; + } else { + // So, the row key doesn't match, but we haven't gone past the row + // we're seeking yet, so this row is a candidate for closest + // (assuming that it isn't a delete). + handleKey(k, v, now, deletes, candidateKeys); + } + } while(scanner.next()); + } + + /* + * @param readkey + * @param now + * @param deletes + * @param candidateKeys + */ + private void handleKey(final HStoreKey readkey, ByteBuffer value, + final long now, final Set deletes, + final SortedMap candidateKeys) { + if (!HLogEdit.isDeleted(value)) { + handleNonDelete(readkey, now, deletes, candidateKeys); + } else { + // Pass copy because readkey will change next time next is called. + handleDeleted(new HStoreKey(readkey), candidateKeys, deletes); + } + } + + /* + * @param readkey + * @param now + * @param deletes + * @param candidateKeys + * @return True if we added a candidate. + */ + private boolean handleNonDelete(final HStoreKey readkey, final long now, + final Set deletes, final Map candidateKeys) { + if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { + candidateKeys.put(stripTimestamp(readkey), + Long.valueOf(readkey.getTimestamp())); + return true; + } + return false; + } + + /* Handle keys whose values hold deletes. + * Add to the set of deletes and then if the candidate keys contain any that + * might match by timestamp, then check for a match and remove it if it's too + * young to survive the delete + * @param k Be careful; if key was gotten from a Mapfile, pass in a copy. + * Values gotten by 'nexting' out of Mapfiles will change in each invocation. + * @param candidateKeys + * @param deletes + */ + static void handleDeleted(final HStoreKey k, + final SortedMap candidateKeys, + final Set deletes) { + deletes.add(k); + HStoreKey strippedKey = stripTimestamp(k); + if (candidateKeys.containsKey(strippedKey)) { + long bestCandidateTs = + candidateKeys.get(strippedKey).longValue(); + if (bestCandidateTs <= k.getTimestamp()) { + candidateKeys.remove(strippedKey); + } + } + } + + static HStoreKey stripTimestamp(HStoreKey key) { + return new HStoreKey(key.getRow(), key.getColumn()); + } + + /* + * Test that the target matches the origin cell address. If the + * origin has an empty column, then it's assumed to mean any column + * matches and only match on row and timestamp. Otherwise, it compares the + * keys with HStoreKey.matchesRowCol(). 
+ * @param origin The key we're testing against + * @param target The key we're testing + */ + private boolean cellMatches(HStoreKey origin, HStoreKey target){ + // if the origin's column is empty, then we're matching any column + if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { + // if the row matches, then... + if (HStoreKey.equalsTwoRowKeys(target.getRow(), origin.getRow())) { + // check the timestamp + return target.getTimestamp() <= origin.getTimestamp(); + } + return false; + } + // otherwise, we want to match on row and column + return target.matchesRowCol(origin); + } + + /* + * Test that the target matches the origin. If the origin + * has an empty column, then it just tests row equivalence. Otherwise, it uses + * HStoreKey.matchesRowCol(). + * @param origin Key we're testing against + * @param target Key we're testing + */ + private boolean rowMatches(final HStoreKey origin, final HStoreKey target){ + // if the origin's column is empty, then we're matching any column + if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { + // if the row matches, then... + return HStoreKey.equalsTwoRowKeys(target.getRow(), origin.getRow()); + } + // otherwise, we want to match on row and column + return target.matchesRowCol(origin); + } + + /** + * Determines if HStore can be split + * @param force Whether to force a split or not. + * @return a StoreSize if store can be split, null otherwise. + */ + StoreSize checkSplit(final boolean force) { + this.lock.readLock().lock(); + try { + // Iterate through all store files + if (this.storefiles.size() <= 0) { + return null; + } + if (!force && (storeSize < this.desiredMaxFileSize)) { + return null; + } + // Not splitable if we find a reference store file present in the store. + boolean splitable = true; + long maxSize = 0L; + Long mapIndex = Long.valueOf(0L); + for (Map.Entry e: storefiles.entrySet()) { + StoreFile curHSF = e.getValue(); + if (splitable) { + splitable = !curHSF.isReference(); + if (!splitable) { + // RETURN IN MIDDLE OF FUNCTION!!! If not splitable, just return. + if (LOG.isDebugEnabled()) { + LOG.debug(curHSF + " is not splittable"); + } + return null; + } + } + long size = curHSF.getReader().length(); + if (size > maxSize) { + // This is the largest one so far + maxSize = size; + mapIndex = e.getKey(); + } + } + + HFile.Reader r = this.storefiles.get(mapIndex).getReader(); + // Get first, last, and mid keys. Midkey is the key that starts block + // in middle of hfile. Has column and timestamp. Need to return just + // the row we want to split on as midkey. + byte [] midkey = r.midkey(); + if (midkey != null) { + HStoreKey mk = HStoreKey.create(midkey); + HStoreKey firstKey = HStoreKey.create(r.getFirstKey()); + HStoreKey lastKey = HStoreKey.create(r.getLastKey()); + // if the midkey is the same as the first and last keys, then we cannot + // (ever) split this region. 
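+        // e.g. a store file whose every key shares one row yields identical
+        // first, mid and last rows, so there is no row boundary to split at.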
+ if (HStoreKey.equalsTwoRowKeys(mk.getRow(), firstKey.getRow()) && + HStoreKey.equalsTwoRowKeys( mk.getRow(), lastKey.getRow())) { + if (LOG.isDebugEnabled()) { + LOG.debug("cannot split because midkey is the same as first or " + + "last row"); + } + return null; + } + return new StoreSize(maxSize, mk.getRow()); + } + } catch(IOException e) { + LOG.warn("Failed getting store size for " + this.storeNameStr, e); + } finally { + this.lock.readLock().unlock(); + } + return null; + } + + /** @return aggregate size of HStore */ + public long getSize() { + return storeSize; + } + + ////////////////////////////////////////////////////////////////////////////// + // File administration + ////////////////////////////////////////////////////////////////////////////// + + /** + * Return a scanner for both the memcache and the HStore files + */ + protected InternalScanner getScanner(long timestamp, byte [][] targetCols, + byte [] firstRow, RowFilterInterface filter) + throws IOException { + lock.readLock().lock(); + try { + return new StoreScanner(this, targetCols, firstRow, timestamp, filter); + } finally { + lock.readLock().unlock(); + } + } + + @Override + public String toString() { + return this.storeNameStr; + } + + /** + * @return Count of store files + */ + int getStorefilesCount() { + return this.storefiles.size(); + } + + /** + * @return The size of the store file indexes, in bytes. + * @throws IOException if there was a problem getting file sizes from the + * filesystem + */ + long getStorefilesIndexSize() throws IOException { + long size = 0; + for (StoreFile s: storefiles.values()) + size += s.getReader().indexSize(); + return size; + } + + /* + * Datastructure that holds size and row to split a file around. + */ + class StoreSize { + private final long size; + private final byte[] key; + StoreSize(long size, byte[] key) { + this.size = size; + this.key = new byte[key.length]; + System.arraycopy(key, 0, this.key, 0, key.length); + } + /* @return the size */ + long getSize() { + return size; + } + /* @return the key */ + byte[] getSplitRow() { + return key; + } + } + + HRegionInfo getHRegionInfo() { + return this.regioninfo; + } + + /** + * Convenience method that implements the old MapFile.getClosest on top of + * HFile Scanners. getClosest used seek to the asked-for key or just after + * (HFile seeks to the key or just before). + * @param s + * @param b + * @return True if we were able to seek the scanner to b or to + * the key just after. + * @throws IOException + */ + static boolean getClosest(final HFileScanner s, final byte [] b) + throws IOException { + int result = s.seekTo(b); + if (result < 0) { + // Not in file. Will the first key do? + if (!s.seekTo()) { + return false; + } + } else if (result > 0) { + // Less than what was asked for but maybe < because we're asking for + // r/c/LATEST_TIMESTAMP -- what was returned was r/c-1/SOME_TS... + // A next will get us a r/c/SOME_TS. + if (!s.next()) { + return false; + } + } + return true; + } +} diff --git a/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java b/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java new file mode 100644 index 00000000000..133ef2cef8c --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java @@ -0,0 +1,443 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.io.HalfHFileReader; +import org.apache.hadoop.hbase.io.Reference; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.RawComparator; + +/** + * A Store data file. Stores usually have one or more of these files. They + * are produced by flushing the memcache to disk. To + * create, call {@link #getWriter(FileSystem, Path)} and append data. Be + * sure to add any metadata before calling close on the Writer + * (Use the appendMetadata convenience methods). On close, a StoreFile is + * sitting in the Filesystem. To refer to it, create a StoreFile instance + * passing filesystem and path. To read, call {@link #getReader()}. + *
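+ * <p>A minimal write/read sketch (illustrative only; fs, familyDir, key,
+ * value and maxSeqId are placeholders, and error handling is omitted):
+ * <pre>
+ *   HFile.Writer w = StoreFile.getWriter(fs, familyDir);
+ *   try {
+ *     w.append(key.getBytes(), value);
+ *     StoreFile.appendMetadata(w, maxSeqId);
+ *   } finally {
+ *     w.close();
+ *   }
+ *   StoreFile sf = new StoreFile(fs, w.getPath());
+ *   HFile.Reader r = sf.getReader();
+ * </pre>
+ *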

StoreFiles may also reference store files in another Store. + */ +public class StoreFile implements HConstants { + static final Log LOG = LogFactory.getLog(StoreFile.class.getName()); + + // Make default block size for StoreFiles 8k while testing. TODO: FIX! + // Need to make it 8k for testing. + private static final int DEFAULT_BLOCKSIZE_SMALL = 8 * 1024; + + private final FileSystem fs; + // This file's path. + private final Path path; + // If this storefile references another, this is the reference instance. + private Reference reference; + // If this StoreFile references another, this is the other files path. + private Path referencePath; + + // Keys for metadata stored in backing HFile. + private static final byte [] MAX_SEQ_ID_KEY = Bytes.toBytes("MAX_SEQ_ID_KEY"); + // Set when we obtain a Reader. + private long sequenceid = -1; + + private static final byte [] MAJOR_COMPACTION_KEY = + Bytes.toBytes("MAJOR_COMPACTION_KEY"); + // If true, this file was product of a major compaction. Its then set + // whenever you get a Reader. + private AtomicBoolean majorCompaction = null; + + /* + * Regex that will work for straight filenames and for reference names. + * If reference, then the regex has more than just one group. Group 1 is + * this files id. Group 2 the referenced region name, etc. + */ + private static final Pattern REF_NAME_PARSER = + Pattern.compile("^(\\d+)(?:\\.(.+))?$"); + + private volatile HFile.Reader reader; + + // Used making file ids. + private final static Random rand = new Random(); + + /** + * Constructor. + * Loads up a Reader (and its indices, etc.). + * @param fs Filesystem. + * @param p qualified path + * @throws IOException + */ + StoreFile(final FileSystem fs, final Path p) + throws IOException { + this.fs = fs; + this.path = p; + if (isReference(p)) { + this.reference = Reference.read(fs, p); + this.referencePath = getReferredToFile(this.path); + } + this.reader = open(); + } + + /** + * @return Path or null if this StoreFile was made with a Stream. + */ + Path getPath() { + return this.path; + } + + /** + * @return The Store/ColumnFamily this file belongs to. + */ + byte [] getFamily() { + return Bytes.toBytes(this.path.getParent().getName()); + } + + /** + * @return True if this is a StoreFile Reference; call after {@link #open()} + * else may get wrong answer. + */ + boolean isReference() { + return this.reference != null; + } + + /** + * @param p Path to check. + * @return True if the path has format of a HStoreFile reference. + */ + public static boolean isReference(final Path p) { + return isReference(p, REF_NAME_PARSER.matcher(p.getName())); + } + + /** + * @param p Path to check. + * @param m Matcher to use. + * @return True if the path has format of a HStoreFile reference. + */ + public static boolean isReference(final Path p, final Matcher m) { + if (m == null || !m.matches()) { + LOG.warn("Failed match of store file name " + p.toString()); + throw new RuntimeException("Failed match of store file name " + + p.toString()); + } + return m.groupCount() > 1 && m.group(2) != null; + } + + /* + * Return path to the file referred to by a Reference. Presumes a directory + * hierarchy of ${hbase.rootdir}/tablename/regionname/familyname. + * @param p Path to a Reference file. + * @return Calculated path to parent region file. 
+ * @throws IOException + */ + static Path getReferredToFile(final Path p) throws IOException { + Matcher m = REF_NAME_PARSER.matcher(p.getName()); + if (m == null || !m.matches()) { + LOG.warn("Failed match of store file name " + p.toString()); + throw new RuntimeException("Failed match of store file name " + + p.toString()); + } + // The other region name is the suffix on the passed Reference file name. + String otherRegion = m.group(2); + // Tabledir is up two directories from where the Reference was written. + Path tableDir = p.getParent().getParent().getParent(); + String nameStrippedOfSuffix = m.group(1); + // Build up new path with the referenced region in place of our current + // region in the reference path. Also strip regionname suffix from name. + return new Path(new Path(new Path(tableDir, otherRegion), + p.getParent().getName()), nameStrippedOfSuffix); + } + + /** + * @return True if this file was made by a major compaction. + */ + boolean isMajorCompaction() { + if (this.majorCompaction == null) { + throw new NullPointerException("This has not been set yet"); + } + return this.majorCompaction.get(); + } + + /** + * @return This file's maximum edit sequence id. + */ + public long getMaxSequenceId() { + if (this.sequenceid == -1) { + throw new IllegalAccessError("Has not been initialized"); + } + return this.sequenceid; + } + + /** + * Opens the reader on this store file. Called by the constructor. + * @return Reader for the store file. + * @throws IOException + * @see #close() + */ + protected HFile.Reader open() + throws IOException { + if (this.reader != null) { + throw new IllegalAccessError("Already open"); + } + if (isReference()) { + this.reader = new HalfHFileReader(this.fs, this.referencePath, null, + this.reference); + } else { + this.reader = new HFile.Reader(this.fs, this.path, null); + } + // Load up indices and fileinfo. + Map map = this.reader.loadFileInfo(); + // Read in our metadata. + byte [] b = map.get(MAX_SEQ_ID_KEY); + if (b != null) { + // By convention, if this is a half HFile, the top half has a sequence + // number greater than the bottom half. That's why we add one below. It's + // done in case the two halves are ever merged back together (rare). + // Without it, on open of the store, since store files are distinguished + // by sequence id, the one half would subsume the other. + this.sequenceid = Bytes.toLong(b); + if (isReference()) { + if (Reference.isTopFileRegion(this.reference.getFileRegion())) { + this.sequenceid += 1; + } + } + + } + b = map.get(MAJOR_COMPACTION_KEY); + if (b != null) { + boolean mc = Bytes.toBoolean(b); + if (this.majorCompaction == null) { + this.majorCompaction = new AtomicBoolean(mc); + } else { + this.majorCompaction.set(mc); + } + } + return this.reader; + } + + /** + * @return Current reader. Must call open first. + */ + public HFile.Reader getReader() { + if (this.reader == null) { + throw new IllegalAccessError("Call open first"); + } + return this.reader; + } + + /** + * @throws IOException + * @see #open() + */ + public synchronized void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + this.reader = null; + } + } + + public String toString() { + return this.path.toString() + + (isReference()? "-" + this.referencePath + "-" + reference.toString(): ""); + } + + /** + * Delete this file. + * @throws IOException + */ + public void delete() throws IOException { + close(); + this.fs.delete(getPath(), true); + } + + /** + * Utility to help with rename. + * @param fs + * @param src + * @param tgt + * @return True if succeeded.
+ * @throws IOException + */ + public static Path rename(final FileSystem fs, final Path src, + final Path tgt) + throws IOException { + if (!fs.exists(src)) { + throw new FileNotFoundException(src.toString()); + } + if (!fs.rename(src, tgt)) { + throw new IOException("Failed rename of " + src + " to " + tgt); + } + return tgt; + } + + /** + * Get a store file writer. Client is responsible for closing file when done. + * If metadata, add BEFORE closing using + * {@link #appendMetadata(org.apache.hadoop.hbase.io.hfile.HFile.Writer, long)}. + * @param fs + * @param dir Path to family directory. Makes the directory if doesn't exist. + * Creates a file with a unique name in this directory. + * @return HFile.Writer + * @throws IOException + */ + public static HFile.Writer getWriter(final FileSystem fs, final Path dir) + throws IOException { + return getWriter(fs, dir, DEFAULT_BLOCKSIZE_SMALL, null, null); + } + + /** + * Get a store file writer. Client is responsible for closing file when done. + * If metadata, add BEFORE closing using + * {@link #appendMetadata(org.apache.hadoop.hbase.io.hfile.HFile.Writer, long)}. + * @param fs + * @param dir Path to family directory. Makes the directory if doesn't exist. + * Creates a file with a unique name in this directory. + * @param blocksize + * @param algorithm Pass null to get default. + * @param c Pass null to get default. + * @return HFile.Writer + * @throws IOException + */ + public static HFile.Writer getWriter(final FileSystem fs, final Path dir, + final int blocksize, final String algorithm, final RawComparator c) + throws IOException { + if (!fs.exists(dir)) { + fs.mkdirs(dir); + } + Path path = getUniqueFile(fs, dir); + return new HFile.Writer(fs, path, blocksize, + algorithm == null? HFile.DEFAULT_COMPRESSION: algorithm, + c == null? HStoreKey.BYTECOMPARATOR: c); + } + + /** + * @param fs + * @param p + * @return random filename inside passed dir + */ + static Path getUniqueFile(final FileSystem fs, final Path p) + throws IOException { + if (!fs.getFileStatus(p).isDir()) { + throw new IOException("Expecting a directory"); + } + return fs.getFileStatus(p).isDir()? getRandomFilename(fs, p): p; + } + + /** + * @param fs + * @param dir + * @param encodedRegionName + * @param family + * @return Path to a file that doesn't exist at time of this invocation. + * @throws IOException + */ + static Path getRandomFilename(final FileSystem fs, final Path dir) + throws IOException { + return getRandomFilename(fs, dir, null); + } + + /** + * @param fs + * @param dir + * @param encodedRegionName + * @param family + * @param suffix + * @return Path to a file that doesn't exist at time of this invocation. + * @throws IOException + */ + static Path getRandomFilename(final FileSystem fs, final Path dir, + final String suffix) + throws IOException { + long id = -1; + Path p = null; + do { + id = Math.abs(rand.nextLong()); + p = new Path(dir, Long.toString(id) + + ((suffix == null || suffix.length() <= 0)? "": suffix)); + } while(fs.exists(p)); + return p; + } + + /** + * Write file metadata. + * Call before you call close on the passed w since its written + * as metadata to that file. + * + * @param filesystem file system + * @param maxSequenceId Maximum sequence id. + * @throws IOException + */ + static void appendMetadata(final HFile.Writer w, final long maxSequenceId) + throws IOException { + appendMetadata(w, maxSequenceId, false); + } + + /** + * Writes metadata. + * Call before you call close on the passed w since its written + * as metadata to that file. 
+ * @param maxSequenceId Maximum sequence id. + * @param mc True if this file is product of a major compaction + * @throws IOException + */ + static void appendMetadata(final HFile.Writer w, final long maxSequenceId, + final boolean mc) + throws IOException { + w.appendFileInfo(MAX_SEQ_ID_KEY, Bytes.toBytes(maxSequenceId)); + w.appendFileInfo(MAJOR_COMPACTION_KEY, Bytes.toBytes(mc)); + } + + /* + * Write out a split reference. + * @param fs + * @param splitDir Presumes path format is actually + * SOME_DIRECTORY/REGIONNAME/FAMILY. + * @param f File to split. + * @param splitRow + * @param range + * @return Path to created reference. + * @throws IOException + */ + static Path split(final FileSystem fs, final Path splitDir, + final StoreFile f, final byte [] splitRow, final Reference.Range range) + throws IOException { + // A reference to the bottom half of the hsf store file. + Reference r = new Reference(new HStoreKey(splitRow).getBytes(), range); + // Add the referred-to regions name as a dot separated suffix. + // See REF_NAME_PARSER regex above. The referred-to regions name is + // up in the path of the passed in f -- parentdir is family, + // then the directory above is the region name. + String parentRegionName = f.getPath().getParent().getParent().getName(); + // Write reference with same file id only with the other region name as + // suffix and into the new region location (under same family). + Path p = new Path(splitDir, f.getPath().getName() + "." + parentRegionName); + return r.write(fs, p); + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java b/src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java similarity index 91% rename from src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java rename to src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java index 33f5a3b9a8c..ccd5d605048 100644 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java +++ b/src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java @@ -36,13 +36,12 @@ import org.apache.hadoop.hbase.HStoreKey; import org.apache.hadoop.hbase.filter.RowFilterInterface; import org.apache.hadoop.hbase.io.Cell; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.io.MapFile; /** * Scanner scans both the memcache and the HStore */ -class HStoreScanner implements InternalScanner, ChangedReadersObserver { - static final Log LOG = LogFactory.getLog(HStoreScanner.class); +class StoreScanner implements InternalScanner, ChangedReadersObserver { + static final Log LOG = LogFactory.getLog(StoreScanner.class); private InternalScanner[] scanners; private TreeMap[] resultSets; @@ -50,7 +49,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { private boolean wildcardMatch = false; private boolean multipleMatchers = false; private RowFilterInterface dataFilter; - private HStore store; + private Store store; private final long timestamp; private final byte [][] targetCols; @@ -66,7 +65,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { /** Create an Scanner with a handle on the memcache and HStore files. 
*/ @SuppressWarnings("unchecked") - HStoreScanner(HStore store, byte [][] targetCols, byte [] firstRow, + StoreScanner(Store store, byte [][] targetCols, byte [] firstRow, long timestamp, RowFilterInterface filter) throws IOException { this.store = store; @@ -81,7 +80,6 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { // See updateReaders below. this.timestamp = timestamp; this.targetCols = targetCols; - try { scanners[MEMS_INDEX] = store.memcache.getScanner(timestamp, targetCols, firstRow); @@ -124,7 +122,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { private void setupScanner(final int i) throws IOException { this.keys[i] = new HStoreKey(); this.resultSets[i] = new TreeMap(Bytes.BYTES_COMPARATOR); - if (this.scanners[i] != null && !this.scanners[i].next(this.keys[i], this.resultSets[i])) { + if (this.scanners[i] != null && !this.scanners[i].next(this.keys[i], + this.resultSets[i])) { closeScanner(i); } } @@ -154,10 +153,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { for (int i = 0; i < this.keys.length; i++) { if (scanners[i] != null && (chosenRow == null || - (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) < 0) || - ((HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) == 0) && + (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) < 0) || + ((HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) == 0) && (keys[i].getTimestamp() > chosenTimestamp)))) { chosenRow = keys[i].getRow(); chosenTimestamp = keys[i].getTimestamp(); @@ -187,8 +184,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { while ((scanners[i] != null && !filtered && moreToFollow) - && (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) == 0)) { + && (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) == 0)) { // If we are doing a wild card match or there are multiple // matchers per column, we need to scan all the older versions of // this row to pick up the rest of the family members @@ -204,7 +200,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { // a result if the map does not contain the key. HStoreKey hsk = new HStoreKey(key.getRow(), HConstants.EMPTY_BYTE_ARRAY, - key.getTimestamp(), this.store.getHRegionInfo()); + key.getTimestamp()); for (Map.Entry e : resultSets[i].entrySet()) { hsk.setColumn(e.getKey()); if (HLogEdit.isDeleted(e.getValue().getValue())) { @@ -231,15 +227,13 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { closeScanner(i); } } - } + } } - for (int i = 0; i < scanners.length; i++) { // If the current scanner is non-null AND has a lower-or-equal // row label, then its timestamp is bad. We need to advance it. 
while ((scanners[i] != null) && - (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) <= 0)) { + (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) <= 0)) { resultSets[i].clear(); if (!scanners[i].next(keys[i], resultSets[i])) { closeScanner(i); @@ -248,7 +242,6 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { } moreToFollow = chosenTimestamp >= 0; - if (dataFilter != null) { if (dataFilter.filterAllRemaining()) { moreToFollow = false; @@ -319,9 +312,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { } this.lock.writeLock().lock(); try { - MapFile.Reader [] readers = this.store.getReaders(); - if (this.scanners[HSFS_INDEX] == null && readers != null && - readers.length > 0) { + Map map = this.store.getStorefiles(); + if (this.scanners[HSFS_INDEX] == null && map != null && map.size() > 0) { // Presume that we went from no readers to at least one -- need to put // a HStoreScanner in place. try { diff --git a/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java b/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java new file mode 100644 index 00000000000..a6d2ea08bfc --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java @@ -0,0 +1,365 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.math.random.RandomData; +import org.apache.commons.math.random.RandomDataImpl; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; + +/** + *

+ * This class runs performance benchmarks for {@link HFile}. + *
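+ * <p>It writes {@code ROW_COUNT} (one million) rows of {@code ROW_LENGTH}
+ * (ten byte) values with {@code SequentialWriteBenchmark}, then runs
+ * uniform-random small scans, uniform-random reads, Gaussian-random reads and
+ * a sequential read over the same file, each wrapped in
+ * {@code PerformanceEvaluationCommons.concurrentReads(Runnable)}.
+ * The read benchmarks all follow the same basic HFile pattern; a rough sketch
+ * ({@code fs} and {@code path} are placeholders):
+ * <pre>
+ *   HFile.Reader reader = new HFile.Reader(fs, path, null);
+ *   reader.loadFileInfo();
+ *   HFileScanner scanner = reader.getScanner();
+ *   scanner.seekTo();                 // position at the first key
+ *   while (scanner.next()) {
+ *     // process scanner.getKey() and scanner.getValue()
+ *   }
+ *   reader.close();
+ * </pre>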

+ */ +public class HFilePerformanceEvaluation { + + private static final int ROW_LENGTH = 10; + private static final int ROW_COUNT = 1000000; + private static final int RFILE_BLOCKSIZE = 8 * 1024; + + static final Log LOG = + LogFactory.getLog(HFilePerformanceEvaluation.class.getName()); + + static byte [] format(final int i) { + String v = Integer.toString(i); + return Bytes.toBytes("0000000000".substring(v.length()) + v); + } + + static ImmutableBytesWritable format(final int i, ImmutableBytesWritable w) { + w.set(format(i)); + return w; + } + + private void runBenchmarks() throws Exception { + final Configuration conf = new Configuration(); + final FileSystem fs = FileSystem.get(conf); + final Path mf = fs.makeQualified(new Path("performanceevaluation.mapfile")); + if (fs.exists(mf)) { + fs.delete(mf, true); + } + + runBenchmark(new SequentialWriteBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new UniformRandomSmallScan(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new UniformRandomReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new GaussianRandomReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new SequentialReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + + } + + private void runBenchmark(RowOrientedBenchmark benchmark, int rowCount) + throws Exception { + LOG.info("Running " + benchmark.getClass().getSimpleName() + " for " + + rowCount + " rows."); + long elapsedTime = benchmark.run(); + LOG.info("Running " + benchmark.getClass().getSimpleName() + " for " + + rowCount + " rows took " + elapsedTime + "ms."); + } + + static abstract class RowOrientedBenchmark { + + protected final Configuration conf; + protected final FileSystem fs; + protected final Path mf; + protected final int totalRows; + + public RowOrientedBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + this.conf = conf; + this.fs = fs; + this.mf = mf; + this.totalRows = totalRows; + } + + void setUp() throws Exception { + // do nothing + } + + abstract void doRow(int i) throws Exception; + + protected int getReportingPeriod() { + return this.totalRows / 10; + } + + void tearDown() throws Exception { + // do nothing + } + + /** + * Run benchmark + * @return elapsed time. 
+ * @throws Exception + */ + long run() throws Exception { + long elapsedTime; + setUp(); + long startTime = System.currentTimeMillis(); + try { + for (int i = 0; i < totalRows; i++) { + if (i > 0 && i % getReportingPeriod() == 0) { + LOG.info("Processed " + i + " rows."); + } + doRow(i); + } + elapsedTime = System.currentTimeMillis() - startTime; + } finally { + tearDown(); + } + return elapsedTime; + } + + } + + static class SequentialWriteBenchmark extends RowOrientedBenchmark { + protected HFile.Writer writer; + private Random random = new Random(); + private byte[] bytes = new byte[ROW_LENGTH]; + + public SequentialWriteBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + writer = new HFile.Writer(this.fs, this.mf, RFILE_BLOCKSIZE, null, null); + } + + @Override + void doRow(int i) throws Exception { + writer.append(format(i), generateValue()); + } + + private byte[] generateValue() { + random.nextBytes(bytes); + return bytes; + } + + @Override + protected int getReportingPeriod() { + return this.totalRows; // don't report progress + } + + @Override + void tearDown() throws Exception { + writer.close(); + } + + } + + static abstract class ReadBenchmark extends RowOrientedBenchmark { + ImmutableBytesWritable key = new ImmutableBytesWritable(); + ImmutableBytesWritable value = new ImmutableBytesWritable(); + + protected HFile.Reader reader; + + public ReadBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + reader = new HFile.Reader(this.fs, this.mf, null); + this.reader.loadFileInfo(); + } + + @Override + void tearDown() throws Exception { + reader.close(); + } + + } + + static class SequentialReadBenchmark extends ReadBenchmark { + private HFileScanner scanner; + + public SequentialReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) + throws IOException { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + super.setUp(); + this.scanner = this.reader.getScanner(); + this.scanner.seekTo(); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + if (this.scanner.next()) { + ByteBuffer k = this.scanner.getKey(); + PerformanceEvaluationCommons.assertKey(format(i + 1), k); + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + } + + @Override + protected int getReportingPeriod() { + return this.totalRows; // don't report progress + } + + } + + static class UniformRandomReadBenchmark extends ReadBenchmark { + + private Random random = new Random(); + + public UniformRandomReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + byte [] b = getRandomRow(); + scanner.seekTo(b); + ByteBuffer k = scanner.getKey(); + PerformanceEvaluationCommons.assertKey(b, k); + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + + private byte [] getRandomRow() { + return format(random.nextInt(totalRows)); + } + } + + static class UniformRandomSmallScan extends ReadBenchmark { + private Random random = new Random(); + + public UniformRandomSmallScan(Configuration conf, FileSystem fs, + Path mf, int 
totalRows) { + super(conf, fs, mf, totalRows/10); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + byte [] b = getRandomRow(); + if (scanner.seekTo(b) != 0) { + System.out.println("Nonexistent row: " + new String(b)); + return; + } + ByteBuffer k = scanner.getKey(); + PerformanceEvaluationCommons.assertKey(b, k); + // System.out.println("Found row: " + new String(b)); + for (int ii = 0; ii < 30; ii++) { + if (!scanner.next()) { + System.out.println("NOTHING FOLLOWS"); + } + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + } + + private byte [] getRandomRow() { + return format(random.nextInt(totalRows)); + } + } + + static class GaussianRandomReadBenchmark extends ReadBenchmark { + + private RandomData randomData = new RandomDataImpl(); + + public GaussianRandomReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + scanner.seekTo(getGaussianRandomRowBytes()); + for (int ii = 0; ii < 30; ii++) { + if (!scanner.next()) { + System.out.println("NOTHING FOLLOWS"); + } + scanner.getKey(); + scanner.getValue(); + } + } + + private byte [] getGaussianRandomRowBytes() { + int r = (int) randomData.nextGaussian(totalRows / 2, totalRows / 10); + return format(r); + } + } + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws Exception { + new HFilePerformanceEvaluation().runBenchmarks(); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java b/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java new file mode 100644 index 00000000000..78d984cee54 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java @@ -0,0 +1,78 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + + +/** + * Code shared by PE tests. 
+ */ +public class PerformanceEvaluationCommons { + static final Log LOG = + LogFactory.getLog(PerformanceEvaluationCommons.class.getName()); + + public static void assertValueSize(final int expectedSize, final int got) { + if (got != expectedSize) { + throw new AssertionError("Expected " + expectedSize + " but got " + got); + } + } + + public static void assertKey(final byte [] expected, final ByteBuffer got) { + byte [] b = new byte[got.limit()]; + got.get(b, 0, got.limit()); + assertKey(expected, b); + } + + public static void assertKey(final byte [] expected, final byte [] got) { + if (!org.apache.hadoop.hbase.util.Bytes.equals(expected, got)) { + throw new AssertionError("Expected " + + org.apache.hadoop.hbase.util.Bytes.toString(expected) + + " but got " + org.apache.hadoop.hbase.util.Bytes.toString(got)); + } + } + + public static void concurrentReads(final Runnable r) { + final int count = 1; + long now = System.currentTimeMillis(); + List threads = new ArrayList(count); + for (int i = 0; i < count; i++) { + Thread t = new Thread(r); + t.setName("" + i); + threads.add(t); + } + for (Thread t: threads) { + t.start(); + } + for (Thread t: threads) { + try { + t.join(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + LOG.info("Test took " + (System.currentTimeMillis() - now)); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/TestHStoreKey.java b/src/test/org/apache/hadoop/hbase/TestHStoreKey.java new file mode 100644 index 00000000000..f03f01cf1b0 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/TestHStoreKey.java @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import junit.framework.TestCase; + +import org.apache.hadoop.hbase.HStoreKey.StoreKeyByteComparator; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Writables; + +/** + * Tests for the HStoreKey Plain and Meta RawComparators. + */ +public class TestHStoreKey extends TestCase { + protected void setUp() throws Exception { + super.setUp(); + } + + protected void tearDown() throws Exception { + super.tearDown(); + } + + public void testByteBuffer() throws Exception { + final long ts = 123; + final byte [] row = Bytes.toBytes("row"); + final byte [] column = Bytes.toBytes("column"); + HStoreKey hsk = new HStoreKey(row, column, ts); + ByteBuffer bb = ByteBuffer.wrap(hsk.getBytes()); + assertTrue(Bytes.equals(row, HStoreKey.getRow(bb))); + assertTrue(Bytes.equals(column, HStoreKey.getColumn(bb))); + assertEquals(ts, HStoreKey.getTimestamp(bb)); + } + + /** + * Test the byte comparator works same as the object comparator. 
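+ * <p>HStoreKeys sort newest-first: for the same row and column, the key with
+ * the larger timestamp compares as the lesser, which is why the assertions
+ * below expect {@code past.compareTo(now) > 0} and
+ * {@code future.compareTo(now) < 0}.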
+ */ + public void testRawComparator() throws IOException { + long timestamp = System.currentTimeMillis(); + byte [] a = Bytes.toBytes("a"); + HStoreKey past = new HStoreKey(a, a, timestamp - 10); + byte [] pastBytes = Writables.getBytes(past); + HStoreKey now = new HStoreKey(a, a, timestamp); + byte [] nowBytes = Writables.getBytes(now); + HStoreKey future = new HStoreKey(a, a, timestamp + 10); + byte [] futureBytes = Writables.getBytes(future); + StoreKeyByteComparator comparator = new HStoreKey.StoreKeyByteComparator(); + assertTrue(past.compareTo(now) > 0); + assertTrue(comparator.compare(pastBytes, nowBytes) > 0); + assertTrue(now.compareTo(now) == 0); + assertTrue(comparator.compare(nowBytes, nowBytes) == 0); + assertTrue(future.compareTo(now) < 0); + assertTrue(comparator.compare(futureBytes, nowBytes) < 0); + // Check that empty column comes before one with a column + HStoreKey nocolumn = new HStoreKey(a, timestamp); + byte [] nocolumnBytes = Writables.getBytes(nocolumn); + HStoreKey withcolumn = new HStoreKey(a, a, timestamp); + byte [] withcolumnBytes = Writables.getBytes(withcolumn); + assertTrue(nocolumn.compareTo(withcolumn) < 0); + assertTrue(comparator.compare(nocolumnBytes, withcolumnBytes) < 0); + // Check that empty column comes and LATEST comes before one with a column + // and old timestamp. + nocolumn = new HStoreKey(a, HConstants.LATEST_TIMESTAMP); + nocolumnBytes = Writables.getBytes(nocolumn); + withcolumn = new HStoreKey(a, a, timestamp); + withcolumnBytes = Writables.getBytes(withcolumn); + assertTrue(nocolumn.compareTo(withcolumn) < 0); + assertTrue(comparator.compare(nocolumnBytes, withcolumnBytes) < 0); + } + +// /** +// * Tests cases where rows keys have characters below the ','. +// * See HBASE-832 +// * @throws IOException +// */ +// public void testHStoreKeyBorderCases() throws IOException { +// HRegionInfo info = new HRegionInfo(new HTableDescriptor("testtable"), +// HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY); +// +// HStoreKey rowA = new HStoreKey("testtable,www.hbase.org/,1234", +// "", Long.MAX_VALUE, info); +// byte [] rowABytes = Writables.getBytes(rowA); +// HStoreKey rowB = new HStoreKey("testtable,www.hbase.org/%20,99999", +// "", Long.MAX_VALUE, info); +// byte [] rowBBytes = Writables.getBytes(rowB); +// assertTrue(rowA.compareTo(rowB) > 0); +// HStoreKey.Comparator comparator = new HStoreKey.PlainStoreKeyComparator(); +// assertTrue(comparator.compare(rowABytes, rowBBytes) > 0); +// +// rowA = new HStoreKey("testtable,www.hbase.org/,1234", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// rowB = new HStoreKey("testtable,www.hbase.org/%20,99999", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) < 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) < 0); +// +// rowA = new HStoreKey("testtable,,1234", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// rowB = new HStoreKey("testtable,$www.hbase.org/,99999", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) < 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) < 0); +// +// rowA = new HStoreKey(".META.,testtable,www.hbase.org/,1234,4321", +// "", Long.MAX_VALUE, HRegionInfo.ROOT_REGIONINFO); +// rowB = new HStoreKey(".META.,testtable,www.hbase.org/%20,99999,99999", +// "", Long.MAX_VALUE, HRegionInfo.ROOT_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) > 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) > 0); +// } +} \ No newline at end of 
file diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java b/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java new file mode 100644 index 00000000000..ca8b80ad0e1 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.util.Random; + +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.WritableComparator; + +/** + * Generate random pairs. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
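+ * <p>Keys and values are assembled from a fixed dictionary of random byte
+ * words. Every key carries a four byte prefix; when sorted output is
+ * requested and a new key would otherwise sort before the previous one, the
+ * prefix is incremented so the overall key order stays ascending.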

+ */ +class KVGenerator { + private final Random random; + private final byte[][] dict; + private final boolean sorted; + private final RandomDistribution.DiscreteRNG keyLenRNG, valLenRNG; + private BytesWritable lastKey; + private static final int MIN_KEY_LEN = 4; + private final byte prefix[] = new byte[MIN_KEY_LEN]; + + public KVGenerator(Random random, boolean sorted, + RandomDistribution.DiscreteRNG keyLenRNG, + RandomDistribution.DiscreteRNG valLenRNG, + RandomDistribution.DiscreteRNG wordLenRNG, int dictSize) { + this.random = random; + dict = new byte[dictSize][]; + this.sorted = sorted; + this.keyLenRNG = keyLenRNG; + this.valLenRNG = valLenRNG; + for (int i = 0; i < dictSize; ++i) { + int wordLen = wordLenRNG.nextInt(); + dict[i] = new byte[wordLen]; + random.nextBytes(dict[i]); + } + lastKey = new BytesWritable(); + fillKey(lastKey); + } + + private void fillKey(BytesWritable o) { + int len = keyLenRNG.nextInt(); + if (len < MIN_KEY_LEN) len = MIN_KEY_LEN; + o.setSize(len); + int n = MIN_KEY_LEN; + while (n < len) { + byte[] word = dict[random.nextInt(dict.length)]; + int l = Math.min(word.length, len - n); + System.arraycopy(word, 0, o.get(), n, l); + n += l; + } + if (sorted + && WritableComparator.compareBytes(lastKey.get(), MIN_KEY_LEN, lastKey + .getSize() + - MIN_KEY_LEN, o.get(), MIN_KEY_LEN, o.getSize() - MIN_KEY_LEN) > 0) { + incrementPrefix(); + } + + System.arraycopy(prefix, 0, o.get(), 0, MIN_KEY_LEN); + lastKey.set(o); + } + + private void fillValue(BytesWritable o) { + int len = valLenRNG.nextInt(); + o.setSize(len); + int n = 0; + while (n < len) { + byte[] word = dict[random.nextInt(dict.length)]; + int l = Math.min(word.length, len - n); + System.arraycopy(word, 0, o.get(), n, l); + n += l; + } + } + + private void incrementPrefix() { + for (int i = MIN_KEY_LEN - 1; i >= 0; --i) { + ++prefix[i]; + if (prefix[i] != 0) return; + } + + throw new RuntimeException("Prefix overflown"); + } + + public void next(BytesWritable key, BytesWritable value, boolean dupKey) { + if (dupKey) { + key.set(lastKey); + } + else { + fillKey(key); + } + fillValue(value); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java b/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java new file mode 100644 index 00000000000..b90a3ea7529 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hbase.io.hfile.RandomDistribution.DiscreteRNG; + +/* +*

+* Copied from +* hadoop-3315 tfile. +* Remove after tfile is committed and use the tfile version of this class +* instead.
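+* Samples keys between a first and last key: both bounding keys are reduced to
+* their four byte integer prefixes, a uniform value is drawn in that range,
+* and it is written big-endian into the first four bytes of an otherwise
+* random key whose length comes from the supplied DiscreteRNG.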

+*/ +class KeySampler { + Random random; + int min, max; + DiscreteRNG keyLenRNG; + private static final int MIN_KEY_LEN = 4; + + public KeySampler(Random random, byte [] first, byte [] last, + DiscreteRNG keyLenRNG) throws IOException { + this.random = random; + min = keyPrefixToInt(first); + max = keyPrefixToInt(last); + this.keyLenRNG = keyLenRNG; + } + + private int keyPrefixToInt(byte [] key) throws IOException { + byte[] b = key; + int o = 0; + return (b[o] & 0xff) << 24 | (b[o + 1] & 0xff) << 16 + | (b[o + 2] & 0xff) << 8 | (b[o + 3] & 0xff); + } + + public void next(BytesWritable key) { + key.setSize(Math.max(MIN_KEY_LEN, keyLenRNG.nextInt())); + random.nextBytes(key.get()); + int n = random.nextInt(max - min) + min; + byte[] b = key.get(); + b[0] = (byte) (n >> 24); + b[1] = (byte) (n >> 16); + b[2] = (byte) (n >> 8); + b[3] = (byte) n; + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java b/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java new file mode 100644 index 00000000000..1312da047ab --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java @@ -0,0 +1,198 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +/** + * A nano-second timer. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
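+ * <p>A minimal usage sketch:
+ * <pre>
+ *   NanoTimer timer = new NanoTimer(false);
+ *   timer.start();
+ *   // ... timed work ...
+ *   timer.stop();
+ *   System.out.println(timer);   // e.g. "12.34ms"
+ * </pre>
+ * Elapsed time accumulates across start/stop pairs until {@link #reset()} is
+ * called.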

+ */ +public class NanoTimer { + private long last = -1; + private boolean started = false; + private long cumulate = 0; + + /** + * Constructor + * + * @param start + * Start the timer upon construction. + */ + public NanoTimer(boolean start) { + if (start) this.start(); + } + + /** + * Start the timer. + * + * Note: No effect if timer is already started. + */ + public void start() { + if (!this.started) { + this.last = System.nanoTime(); + this.started = true; + } + } + + /** + * Stop the timer. + * + * Note: No effect if timer is already stopped. + */ + public void stop() { + if (this.started) { + this.started = false; + this.cumulate += System.nanoTime() - this.last; + } + } + + /** + * Read the timer. + * + * @return the elapsed time in nano-seconds. Note: If the timer is never + * started before, -1 is returned. + */ + public long read() { + if (!readable()) return -1; + + return this.cumulate; + } + + /** + * Reset the timer. + */ + public void reset() { + this.last = -1; + this.started = false; + this.cumulate = 0; + } + + /** + * Checking whether the timer is started + * + * @return true if timer is started. + */ + public boolean isStarted() { + return this.started; + } + + /** + * Format the elapsed time to a human understandable string. + * + * Note: If timer is never started, "ERR" will be returned. + */ + public String toString() { + if (!readable()) { + return "ERR"; + } + + return NanoTimer.nanoTimeToString(this.cumulate); + } + + /** + * A utility method to format a time duration in nano seconds into a human + * understandable stirng. + * + * @param t + * Time duration in nano seconds. + * @return String representation. + */ + public static String nanoTimeToString(long t) { + if (t < 0) return "ERR"; + + if (t == 0) return "0"; + + if (t < 1000) { + return t + "ns"; + } + + double us = (double) t / 1000; + if (us < 1000) { + return String.format("%.2fus", us); + } + + double ms = us / 1000; + if (ms < 1000) { + return String.format("%.2fms", ms); + } + + double ss = ms / 1000; + if (ss < 1000) { + return String.format("%.2fs", ss); + } + + long mm = (long) ss / 60; + ss -= mm * 60; + long hh = mm / 60; + mm -= hh * 60; + long dd = hh / 24; + hh -= dd * 24; + + if (dd > 0) { + return String.format("%dd%dh", dd, hh); + } + + if (hh > 0) { + return String.format("%dh%dm", hh, mm); + } + + if (mm > 0) { + return String.format("%dm%.1fs", mm, ss); + } + + return String.format("%.2fs", ss); + + /** + * StringBuilder sb = new StringBuilder(); String sep = ""; + * + * if (dd > 0) { String unit = (dd > 1) ? "days" : "day"; + * sb.append(String.format("%s%d%s", sep, dd, unit)); sep = " "; } + * + * if (hh > 0) { String unit = (hh > 1) ? "hrs" : "hr"; + * sb.append(String.format("%s%d%s", sep, hh, unit)); sep = " "; } + * + * if (mm > 0) { String unit = (mm > 1) ? "mins" : "min"; + * sb.append(String.format("%s%d%s", sep, mm, unit)); sep = " "; } + * + * if (ss > 0) { String unit = (ss > 1) ? "secs" : "sec"; + * sb.append(String.format("%s%.3f%s", sep, ss, unit)); sep = " "; } + * + * return sb.toString(); + */ + } + + private boolean readable() { + return this.last != -1; + } + + /** + * Simple tester. 
+ * + * @param args + */ + public static void main(String[] args) { + long i = 7; + + for (int x = 0; x < 20; ++x, i *= 7) { + System.out.println(NanoTimer.nanoTimeToString(i)); + } + } +} + diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java b/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java new file mode 100644 index 00000000000..32196648901 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java @@ -0,0 +1,271 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Random; + +/** + * A class that generates random numbers that follow some distribution. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
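+ * <p>A minimal usage sketch:
+ * <pre>
+ *   Random rng = new Random();
+ *   RandomDistribution.DiscreteRNG flat = new RandomDistribution.Flat(rng, 0, 100);
+ *   RandomDistribution.DiscreteRNG zipf = new RandomDistribution.Zipf(rng, 0, 100, 1.2);
+ *   int uniformSample = flat.nextInt();   // drawn from [0, 100)
+ *   int skewedSample = zipf.nextInt();    // small values favored
+ * </pre>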

+ */ +public class RandomDistribution { + /** + * Interface for discrete (integer) random distributions. + */ + public static interface DiscreteRNG { + /** + * Get the next random number + * + * @return the next random number. + */ + public int nextInt(); + } + + /** + * P(i)=1/(max-min) + */ + public static final class Flat implements DiscreteRNG { + private final Random random; + private final int min; + private final int max; + + /** + * Generate random integers from min (inclusive) to max (exclusive) + * following even distribution. + * + * @param random + * The basic random number generator. + * @param min + * Minimum integer + * @param max + * maximum integer (exclusive). + * + */ + public Flat(Random random, int min, int max) { + if (min >= max) { + throw new IllegalArgumentException("Invalid range"); + } + this.random = random; + this.min = min; + this.max = max; + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + return random.nextInt(max - min) + min; + } + } + + /** + * Zipf distribution. The ratio of the probabilities of integer i and j is + * defined as follows: + * + * P(i)/P(j)=((j-min+1)/(i-min+1))^sigma. + */ + public static final class Zipf implements DiscreteRNG { + private static final double DEFAULT_EPSILON = 0.001; + private final Random random; + private final ArrayList k; + private final ArrayList v; + + /** + * Constructor + * + * @param r + * The random number generator. + * @param min + * minimum integer (inclusvie) + * @param max + * maximum integer (exclusive) + * @param sigma + * parameter sigma. (sigma > 1.0) + */ + public Zipf(Random r, int min, int max, double sigma) { + this(r, min, max, sigma, DEFAULT_EPSILON); + } + + /** + * Constructor. + * + * @param r + * The random number generator. + * @param min + * minimum integer (inclusvie) + * @param max + * maximum integer (exclusive) + * @param sigma + * parameter sigma. (sigma > 1.0) + * @param epsilon + * Allowable error percentage (0 < epsilon < 1.0). + */ + public Zipf(Random r, int min, int max, double sigma, double epsilon) { + if ((max <= min) || (sigma <= 1) || (epsilon <= 0) + || (epsilon >= 0.5)) { + throw new IllegalArgumentException("Invalid arguments"); + } + random = r; + k = new ArrayList(); + v = new ArrayList(); + + double sum = 0; + int last = -1; + for (int i = min; i < max; ++i) { + sum += Math.exp(-sigma * Math.log(i - min + 1)); + if ((last == -1) || i * (1 - epsilon) > last) { + k.add(i); + v.add(sum); + last = i; + } + } + + if (last != max - 1) { + k.add(max - 1); + v.add(sum); + } + + v.set(v.size() - 1, 1.0); + + for (int i = v.size() - 2; i >= 0; --i) { + v.set(i, v.get(i) / sum); + } + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + double d = random.nextDouble(); + int idx = Collections.binarySearch(v, d); + + if (idx > 0) { + ++idx; + } + else { + idx = -(idx + 1); + } + + if (idx >= v.size()) { + idx = v.size() - 1; + } + + if (idx == 0) { + return k.get(0); + } + + int ceiling = k.get(idx); + int lower = k.get(idx - 1); + + return ceiling - random.nextInt(ceiling - lower); + } + } + + /** + * Binomial distribution. 
+ * + * P(k)=select(n, k)*p^k*(1-p)^(n-k) (k = 0, 1, ..., n) + * + * P(k)=select(max-min-1, k-min)*p^(k-min)*(1-p)^(k-min)*(1-p)^(max-k-1) + */ + public static final class Binomial implements DiscreteRNG { + private final Random random; + private final int min; + private final int n; + private final double[] v; + + private static double select(int n, int k) { + double ret = 1.0; + for (int i = k + 1; i <= n; ++i) { + ret *= (double) i / (i - k); + } + return ret; + } + + private static double power(double p, int k) { + return Math.exp(k * Math.log(p)); + } + + /** + * Generate random integers from min (inclusive) to max (exclusive) + * following Binomial distribution. + * + * @param random + * The basic random number generator. + * @param min + * Minimum integer + * @param max + * maximum integer (exclusive). + * @param p + * parameter. + * + */ + public Binomial(Random random, int min, int max, double p) { + if (min >= max) { + throw new IllegalArgumentException("Invalid range"); + } + this.random = random; + this.min = min; + this.n = max - min - 1; + if (n > 0) { + v = new double[n + 1]; + double sum = 0.0; + for (int i = 0; i <= n; ++i) { + sum += select(n, i) * power(p, i) * power(1 - p, n - i); + v[i] = sum; + } + for (int i = 0; i <= n; ++i) { + v[i] /= sum; + } + } + else { + v = null; + } + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + if (v == null) { + return min; + } + double d = random.nextDouble(); + int idx = Arrays.binarySearch(v, d); + if (idx > 0) { + ++idx; + } else { + idx = -(idx + 1); + } + + if (idx >= v.length) { + idx = v.length - 1; + } + return idx + min; + } + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java b/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java new file mode 100644 index 00000000000..2845c073937 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java @@ -0,0 +1,124 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Random seek test. 
+ */ +public class RandomSeek { + private static List slurp(String fname) throws IOException { + BufferedReader istream = new BufferedReader(new FileReader(fname)); + String str; + List l = new ArrayList(); + while ( (str=istream.readLine()) != null) { + String [] parts = str.split(","); + l.add(parts[0] + ":" + parts[1] + ":" + parts[2]); + } + return l; + } + private static String randKey(List keys) { + Random r = new Random(); + //return keys.get(r.nextInt(keys.size())); + return "2" + Integer.toString(7+r.nextInt(2)) + Integer.toString(r.nextInt(100)); + //return new String(r.nextInt(100)); + } + + public static void main(String [] argv) throws IOException { + Configuration conf = new Configuration(); + conf.setInt("io.file.buffer.size", 64*1024); + RawLocalFileSystem rlfs = new RawLocalFileSystem(); + rlfs.setConf(conf); + LocalFileSystem lfs = new LocalFileSystem(rlfs); + + Path path = new Path("/Users/ryan/rfile.big.txt"); + long start = System.currentTimeMillis(); + SimpleBlockCache cache = new SimpleBlockCache(); + //LruBlockCache cache = new LruBlockCache(); + Reader reader = new HFile.Reader(lfs, path, cache); + reader.loadFileInfo(); + System.out.println(reader.trailer); + long end = System.currentTimeMillis(); + + System.out.println("Index read time: " + (end - start)); + + List keys = slurp("/Users/ryan/xaa.50k"); + + HFileScanner scanner = reader.getScanner(); + int count; + long totalBytes = 0; + int notFound = 0; + + start = System.nanoTime(); + for(count = 0; count < 500000; ++count) { + String key = randKey(keys); + byte [] bkey = Bytes.toBytes(key); + int res = scanner.seekTo(bkey); + if (res == 0) { + ByteBuffer k = scanner.getKey(); + ByteBuffer v = scanner.getValue(); + totalBytes += k.limit(); + totalBytes += v.limit(); + } else { + ++ notFound; + } + if (res == -1) { + scanner.seekTo(); + } + // Scan for another 1000 rows. + for (int i = 0; i < 1000; ++i) { + if (!scanner.next()) + break; + ByteBuffer k = scanner.getKey(); + ByteBuffer v = scanner.getValue(); + totalBytes += k.limit(); + totalBytes += v.limit(); + } + + if ( count % 1000 == 0 ) { + end = System.nanoTime(); + + System.out.println("Cache block count: " + cache.size() + " dumped: "+ cache.dumps); + //System.out.println("Cache size: " + cache.heapSize()); + double msTime = ((end - start) / 1000000.0); + System.out.println("Seeked: "+ count + " in " + msTime + " (ms) " + + (1000.0 / msTime ) + " seeks/ms " + + (msTime / 1000.0) + " ms/seek"); + + start = System.nanoTime(); + } + } + System.out.println("Total bytes: " + totalBytes + " not found: " + notFound); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java new file mode 100644 index 00000000000..0ab42b80931 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java @@ -0,0 +1,248 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.io.hfile.HFile.Writer; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.RawComparator; + +/** + * test hfile features. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
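+ * <p>Covers a write/read round trip with the "none" and "gz" codecs, meta
+ * block storage and retrieval, stability of the compression algorithm
+ * ordinals, and writing through a custom {@link RawComparator}.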

+ */ +public class TestHFile extends TestCase { + static final Log LOG = LogFactory.getLog(TestHFile.class); + + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFile"); + private FileSystem fs; + private Configuration conf; + private final int minBlockSize = 512; + private static String localFormatter = "%010d"; + + @Override + public void setUp() { + conf = new HBaseConfiguration(); + RawLocalFileSystem rawLFS = new RawLocalFileSystem(); + rawLFS.setConf(conf); + fs = new LocalFileSystem(rawLFS); + } + + // write some records into the tfile + // write them twice + private int writeSomeRecords(Writer writer, int start, int n) + throws IOException { + String value = "value"; + for (int i = start; i < (start + n); i++) { + String key = String.format(localFormatter, Integer.valueOf(i)); + writer.append(Bytes.toBytes(key), Bytes.toBytes(value + key)); + } + return (start + n); + } + + private void readAllRecords(HFileScanner scanner) throws IOException { + readAndCheckbytes(scanner, 0, 100); + } + + // read the records and check + private int readAndCheckbytes(HFileScanner scanner, int start, int n) + throws IOException { + String value = "value"; + int i = start; + for (; i < (start + n); i++) { + ByteBuffer key = scanner.getKey(); + ByteBuffer val = scanner.getValue(); + String keyStr = String.format(localFormatter, Integer.valueOf(i)); + String valStr = value + keyStr; + byte [] keyBytes = Bytes.toBytes(key); + assertTrue("bytes for keys do not match " + keyStr + " " + + Bytes.toString(Bytes.toBytes(key)), + Arrays.equals(Bytes.toBytes(keyStr), keyBytes)); + byte [] valBytes = Bytes.toBytes(val); + assertTrue("bytes for vals do not match " + valStr + " " + + Bytes.toString(valBytes), + Arrays.equals(Bytes.toBytes(valStr), valBytes)); + if (!scanner.next()) { + break; + } + } + assertEquals(i, start + n - 1); + return (start + n); + } + + private byte[] getSomeKey(int rowId) { + return String.format(localFormatter, Integer.valueOf(rowId)).getBytes(); + } + + private void writeRecords(Writer writer) throws IOException { + writeSomeRecords(writer, 0, 100); + writer.close(); + } + + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + /** + * test none codecs + */ + void basicWithSomeCodec(String codec) throws IOException { + Path ncTFile = new Path(ROOT_DIR, "basic.hfile"); + FSDataOutputStream fout = createFSOutput(ncTFile); + Writer writer = new Writer(fout, minBlockSize, codec, null); + LOG.info(writer); + writeRecords(writer); + fout.close(); + FSDataInputStream fin = fs.open(ncTFile); + Reader reader = new Reader(fs.open(ncTFile), + fs.getFileStatus(ncTFile).getLen(), null); + // Load up the index. + reader.loadFileInfo(); + LOG.info(reader); + HFileScanner scanner = reader.getScanner(); + // Align scanner at start of the file. 
+ scanner.seekTo(); + readAllRecords(scanner); + scanner.seekTo(getSomeKey(50)); + assertTrue("location lookup failed", scanner.seekTo(getSomeKey(50)) == 0); + // read the key and see if it matches + ByteBuffer readKey = scanner.getKey(); + assertTrue("seeked key does not match", Arrays.equals(getSomeKey(50), + Bytes.toBytes(readKey))); + + scanner.seekTo(new byte[0]); + ByteBuffer val1 = scanner.getValue(); + scanner.seekTo(new byte[0]); + ByteBuffer val2 = scanner.getValue(); + assertTrue(Arrays.equals(Bytes.toBytes(val1), Bytes.toBytes(val2))); + + reader.close(); + fin.close(); + fs.delete(ncTFile, true); + } + + public void testTFileFeatures() throws IOException { + basicWithSomeCodec("none"); + basicWithSomeCodec("gz"); + } + + private void writeNumMetablocks(Writer writer, int n) { + for (int i = 0; i < n; i++) { + writer.appendMetaBlock("TfileMeta" + i, ("something to test" + i).getBytes()); + } + } + + private void someTestingWithMetaBlock(Writer writer) { + writeNumMetablocks(writer, 10); + } + + private void readNumMetablocks(Reader reader, int n) throws IOException { + for (int i = 0; i < n; i++) { + ByteBuffer b = reader.getMetaBlock("TfileMeta" + i); + byte [] found = Bytes.toBytes(b); + assertTrue("failed to match metadata", Arrays.equals( + ("something to test" + i).getBytes(), found)); + } + } + + private void someReadingWithMetaBlock(Reader reader) throws IOException { + readNumMetablocks(reader, 10); + } + + private void metablocks(final String compress) throws Exception { + Path mFile = new Path(ROOT_DIR, "meta.tfile"); + FSDataOutputStream fout = createFSOutput(mFile); + Writer writer = new Writer(fout, minBlockSize, compress, null); + someTestingWithMetaBlock(writer); + writer.close(); + fout.close(); + FSDataInputStream fin = fs.open(mFile); + Reader reader = new Reader(fs.open(mFile), this.fs.getFileStatus(mFile) + .getLen(), null); + reader.loadFileInfo(); + // No data -- this should return false. + assertFalse(reader.getScanner().seekTo()); + someReadingWithMetaBlock(reader); + fs.delete(mFile, true); + reader.close(); + fin.close(); + } + + // test meta blocks for tfiles + public void testMetaBlocks() throws Exception { + metablocks("none"); + metablocks("gz"); + } + + /** + * Make sure the orginals for our compression libs doesn't change on us. 
+ */ + public void testCompressionOrdinance() { + assertTrue(Compression.Algorithm.LZO.ordinal() == 0); + assertTrue(Compression.Algorithm.GZ.ordinal() == 1); + assertTrue(Compression.Algorithm.NONE.ordinal() == 2); + } + + + public void testComparator() throws IOException { + Path mFile = new Path(ROOT_DIR, "meta.tfile"); + FSDataOutputStream fout = createFSOutput(mFile); + Writer writer = new Writer(fout, minBlockSize, "none", + new RawComparator() { + @Override + public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, + int l2) { + return -Bytes.compareTo(b1, s1, l1, b2, s2, l2); + + } + @Override + public int compare(byte[] o1, byte[] o2) { + return compare(o1, 0, o1.length, o2, 0, o2.length); + } + }); + writer.append("3".getBytes(), "0".getBytes()); + writer.append("2".getBytes(), "0".getBytes()); + writer.append("1".getBytes(), "0".getBytes()); + writer.close(); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java new file mode 100644 index 00000000000..b59e30f5ea0 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java @@ -0,0 +1,387 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.LzoCodec; + +/** + * Set of long-running tests to measure performance of HFile. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.

+ */ +public class TestHFilePerformance extends TestCase { + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFilePerformance"); + private FileSystem fs; + private Configuration conf; + private long startTimeEpoch; + private long finishTimeEpoch; + private DateFormat formatter; + + @Override + public void setUp() throws IOException { + conf = new Configuration(); + fs = FileSystem.get(conf); + formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + } + + public void startTime() { + startTimeEpoch = System.currentTimeMillis(); + System.out.println(formatTime() + " Started timing."); + } + + public void stopTime() { + finishTimeEpoch = System.currentTimeMillis(); + System.out.println(formatTime() + " Stopped timing."); + } + + public long getIntervalMillis() { + return finishTimeEpoch - startTimeEpoch; + } + + public void printlnWithTimestamp(String message) { + System.out.println(formatTime() + " " + message); + } + + /* + * Format millis into minutes and seconds. + */ + public String formatTime(long milis){ + return formatter.format(milis); + } + + public String formatTime(){ + return formatTime(System.currentTimeMillis()); + } + + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) + fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + //TODO have multiple ways of generating key/value e.g. dictionary words + //TODO to have a sample compressable data, for now, made 1 out of 3 values random + // keys are all random. + + private static class KeyValueGenerator { + Random keyRandomizer; + Random valueRandomizer; + long randomValueRatio = 3; // 1 out of randomValueRatio generated values will be random. + long valueSequence = 0 ; + + + KeyValueGenerator() { + keyRandomizer = new Random(0L); //TODO with seed zero + valueRandomizer = new Random(1L); //TODO with seed one + } + + // Key is always random now. + void getKey(byte[] key) { + keyRandomizer.nextBytes(key); + } + + void getValue(byte[] value) { + if (valueSequence % randomValueRatio == 0) + valueRandomizer.nextBytes(value); + valueSequence++; + } + } + + /** + * + * @param fileType "HFile" or "SequenceFile" + * @param keyLength + * @param valueLength + * @param codecName "none", "lzo", "gz" + * @param rows number of rows to be written. + * @param writeMethod used for HFile only. + * @param minBlockSize used for HFile only. + * @throws IOException + */ + //TODO writeMethod: implement multiple ways of writing e.g. A) known length (no chunk) B) using a buffer and streaming (for many chunks). + public void timeWrite(String fileType, int keyLength, int valueLength, + String codecName, long rows, String writeMethod, int minBlockSize) + throws IOException { + System.out.println("File Type: " + fileType); + System.out.println("Writing " + fileType + " with codecName: " + codecName); + long totalBytesWritten = 0; + + + //Using separate randomizer for key/value with seeds matching Sequence File. + byte[] key = new byte[keyLength]; + byte[] value = new byte[valueLength]; + KeyValueGenerator generator = new KeyValueGenerator(); + + startTime(); + + Path path = new Path(ROOT_DIR, fileType + ".Performance"); + System.out.println(ROOT_DIR + path.getName()); + FSDataOutputStream fout = createFSOutput(path); + + if ("HFile".equals(fileType)){ + System.out.println("HFile write method: "); + HFile.Writer writer = + new HFile.Writer(fout, minBlockSize, codecName, null); + + // Writing value in one shot. 
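+ // (The Writer above takes the raw output stream, the minimum block size, a codec name of + // "none", "lzo" or "gz", and an optional RawComparator -- the same HFile.Writer API that + // TestHFile exercises; each pass of the loop below presumably appends one generated key/value.)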
+ for (long l=0 ; l + * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.

+ */ +public class TestHFileSeek extends TestCase { + private MyOptions options; + private Configuration conf; + private Path path; + private FileSystem fs; + private NanoTimer timer; + private Random rng; + private RandomDistribution.DiscreteRNG keyLenGen; + private KVGenerator kvGen; + + @Override + public void setUp() throws IOException { + if (options == null) { + options = new MyOptions(new String[0]); + } + + conf = new Configuration(); + conf.setInt("tfile.fs.input.buffer.size", options.fsInputBufferSize); + conf.setInt("tfile.fs.output.buffer.size", options.fsOutputBufferSize); + path = new Path(new Path(options.rootDir), options.file); + fs = path.getFileSystem(conf); + timer = new NanoTimer(false); + rng = new Random(options.seed); + keyLenGen = + new RandomDistribution.Zipf(new Random(rng.nextLong()), + options.minKeyLen, options.maxKeyLen, 1.2); + RandomDistribution.DiscreteRNG valLenGen = + new RandomDistribution.Flat(new Random(rng.nextLong()), + options.minValLength, options.maxValLength); + RandomDistribution.DiscreteRNG wordLenGen = + new RandomDistribution.Flat(new Random(rng.nextLong()), + options.minWordLen, options.maxWordLen); + kvGen = + new KVGenerator(rng, true, keyLenGen, valLenGen, wordLenGen, + options.dictSize); + } + + @Override + public void tearDown() { + try { + fs.close(); + } + catch (Exception e) { + // Nothing + } + } + + private static FSDataOutputStream createFSOutput(Path name, FileSystem fs) + throws IOException { + if (fs.exists(name)) { + fs.delete(name, true); + } + FSDataOutputStream fout = fs.create(name); + return fout; + } + + private void createTFile() throws IOException { + long totalBytes = 0; + FSDataOutputStream fout = createFSOutput(path, fs); + try { + Writer writer = + new Writer(fout, options.minBlockSize, options.compress, null); + try { + BytesWritable key = new BytesWritable(); + BytesWritable val = new BytesWritable(); + timer.start(); + for (long i = 0; true; ++i) { + if (i % 1000 == 0) { // test the size for every 1000 rows. + if (fs.getFileStatus(path).getLen() >= options.fileSize) { + break; + } + } + kvGen.next(key, val, false); + byte [] k = new byte [key.getLength()]; + System.arraycopy(key.getBytes(), 0, k, 0, key.getLength()); + byte [] v = new byte [val.getLength()]; + System.arraycopy(val.getBytes(), 0, v, 0, val.getLength()); + writer.append(k, v); + totalBytes += key.getLength(); + totalBytes += val.getLength(); + } + timer.stop(); + } + finally { + writer.close(); + } + } + finally { + fout.close(); + } + double duration = (double)timer.read()/1000; // in us.
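+ // (Assuming NanoTimer.read() reports nanoseconds, duration is in microseconds here, so the + // bytes/duration figures printed below come out roughly in MB/s.)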
+ long fsize = fs.getFileStatus(path).getLen(); + + System.out.printf( + "time: %s...uncompressed: %.2fMB...raw thrpt: %.2fMB/s\n", + timer.toString(), (double) totalBytes / 1024 / 1024, totalBytes + / duration); + System.out.printf("time: %s...file size: %.2fMB...disk thrpt: %.2fMB/s\n", + timer.toString(), (double) fsize / 1024 / 1024, fsize / duration); + } + + public void seekTFile() throws IOException { + int miss = 0; + long totalBytes = 0; + FSDataInputStream fsdis = fs.open(path); + Reader reader = + new Reader(fsdis, fs.getFileStatus(path).getLen(), null); + reader.loadFileInfo(); + System.out.println(reader); + KeySampler kSampler = + new KeySampler(rng, reader.getFirstKey(), reader.getLastKey(), + keyLenGen); + HFileScanner scanner = reader.getScanner(); + BytesWritable key = new BytesWritable(); + BytesWritable val = new BytesWritable(); + timer.reset(); + timer.start(); + for (int i = 0; i < options.seekCount; ++i) { + kSampler.next(key); + byte [] k = new byte [key.getLength()]; + System.arraycopy(key.getBytes(), 0, k, 0, key.getLength()); + if (scanner.seekTo(k) >= 0) { + ByteBuffer bbkey = scanner.getKey(); + ByteBuffer bbval = scanner.getValue(); + totalBytes += bbkey.limit(); + totalBytes += bbval.limit(); + } + else { + ++miss; + } + } + timer.stop(); + double duration = (double) timer.read() / 1000; // in us. + System.out.printf( + "time: %s...avg seek: %s...%d hit...%d miss...avg I/O size: %.2fKB\n", + timer.toString(), NanoTimer.nanoTimeToString(timer.read() + / options.seekCount), options.seekCount - miss, miss, + (double) totalBytes / 1024 / (options.seekCount - miss)); + + } + + public void testSeeks() throws IOException { + if (options.doCreate()) { + createTFile(); + } + + if (options.doRead()) { + seekTFile(); + } + + if (options.doCreate()) { + fs.delete(path, true); + } + } + + private static class IntegerRange { + private final int from, to; + + public IntegerRange(int from, int to) { + this.from = from; + this.to = to; + } + + public static IntegerRange parse(String s) throws ParseException { + StringTokenizer st = new StringTokenizer(s, " \t,"); + if (st.countTokens() != 2) { + throw new ParseException("Bad integer specification: " + s); + } + int from = Integer.parseInt(st.nextToken()); + int to = Integer.parseInt(st.nextToken()); + return new IntegerRange(from, to); + } + + public int from() { + return from; + } + + public int to() { + return to; + } + } + + private static class MyOptions { + // hard coded constants + int dictSize = 1000; + int minWordLen = 5; + int maxWordLen = 20; + int osInputBufferSize = 64 * 1024; + int osOutputBufferSize = 64 * 1024; + int fsInputBufferSizeNone = 0; + int fsInputBufferSizeLzo = 0; + int fsInputBufferSizeGz = 0; + int fsOutputBufferSizeNone = 1; + int fsOutputBufferSizeLzo = 1; + int fsOutputBufferSizeGz = 1; + + String rootDir = + System.getProperty("test.build.data", "/tmp/TestTFileSeek"); + String file = "TestTFileSeek"; + // String compress = "lzo"; DISABLED + String compress = "none"; + int minKeyLen = 10; + int maxKeyLen = 50; + int minValLength = 1024; + int maxValLength = 2 * 1024; + int minBlockSize = 1 * 1024 * 1024; + int fsOutputBufferSize = 1; + int fsInputBufferSize = 0; + // Default writing 10MB. 
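+ // (The create phase keeps appending generated key/values until the file on disk reaches this + // size; createTFile above checks the file length every 1000 rows.)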
+ long fileSize = 10 * 1024 * 1024; + long seekCount = 1000; + long seed; + + static final int OP_CREATE = 1; + static final int OP_READ = 2; + int op = OP_CREATE | OP_READ; + + boolean proceed = false; + + public MyOptions(String[] args) { + seed = System.nanoTime(); + + try { + Options opts = buildOptions(); + CommandLineParser parser = new GnuParser(); + CommandLine line = parser.parse(opts, args, true); + processOptions(line, opts); + validateOptions(); + } + catch (ParseException e) { + System.out.println(e.getMessage()); + System.out.println("Try \"--help\" option for details."); + setStopProceed(); + } + } + + public boolean proceed() { + return proceed; + } + + private Options buildOptions() { + Option compress = + OptionBuilder.withLongOpt("compress").withArgName("[none|lzo|gz]") + .hasArg().withDescription("compression scheme").create('c'); + + Option fileSize = + OptionBuilder.withLongOpt("file-size").withArgName("size-in-MB") + .hasArg().withDescription("target size of the file (in MB).") + .create('s'); + + Option fsInputBufferSz = + OptionBuilder.withLongOpt("fs-input-buffer").withArgName("size") + .hasArg().withDescription( + "size of the file system input buffer (in bytes).").create( + 'i'); + + Option fsOutputBufferSize = + OptionBuilder.withLongOpt("fs-output-buffer").withArgName("size") + .hasArg().withDescription( + "size of the file system output buffer (in bytes).").create( + 'o'); + + Option keyLen = + OptionBuilder + .withLongOpt("key-length") + .withArgName("min,max") + .hasArg() + .withDescription( + "the length range of the key (in bytes)") + .create('k'); + + Option valueLen = + OptionBuilder + .withLongOpt("value-length") + .withArgName("min,max") + .hasArg() + .withDescription( + "the length range of the value (in bytes)") + .create('v'); + + Option blockSz = + OptionBuilder.withLongOpt("block").withArgName("size-in-KB").hasArg() + .withDescription("minimum block size (in KB)").create('b'); + + Option seed = + OptionBuilder.withLongOpt("seed").withArgName("long-int").hasArg() + .withDescription("specify the seed").create('S'); + + Option operation = + OptionBuilder.withLongOpt("operation").withArgName("r|w|rw").hasArg() + .withDescription( + "action: seek-only, create-only, seek-after-create").create( + 'x'); + + Option rootDir = + OptionBuilder.withLongOpt("root-dir").withArgName("path").hasArg() + .withDescription( + "specify root directory where files will be created.") + .create('r'); + + Option file = + OptionBuilder.withLongOpt("file").withArgName("name").hasArg() + .withDescription("specify the file name to be created or read.") + .create('f'); + + Option seekCount = + OptionBuilder + .withLongOpt("seek") + .withArgName("count") + .hasArg() + .withDescription( + "specify how many seek operations we perform (requires -x r or -x rw.") + .create('n'); + + Option help = + OptionBuilder.withLongOpt("help").hasArg(false).withDescription( + "show this screen").create("h"); + + return new Options().addOption(compress).addOption(fileSize).addOption( + fsInputBufferSz).addOption(fsOutputBufferSize).addOption(keyLen) + .addOption(blockSz).addOption(rootDir).addOption(valueLen).addOption( + operation).addOption(seekCount).addOption(file).addOption(help); + + } + + private void processOptions(CommandLine line, Options opts) + throws ParseException { + // --help -h and --version -V must be processed first. 
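+ // Illustrative command line (hypothetical classpath and paths, not part of this patch): + // java ... org.apache.hadoop.hbase.io.hfile.TestHFileSeek -x rw -s 10 -k 10,50 -v 1024,2048 -n 1000 -c none + // i.e. create a roughly 10MB uncompressed file, then run 1000 random seeks against it.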
+ if (line.hasOption('h')) { + HelpFormatter formatter = new HelpFormatter(); + System.out.println("TFile and SeqFile benchmark."); + System.out.println(); + formatter.printHelp(100, + "java ... TestTFileSeqFileComparison [options]", + "\nSupported options:", opts, ""); + return; + } + + if (line.hasOption('c')) { + compress = line.getOptionValue('c'); + } + + if (line.hasOption('d')) { + dictSize = Integer.parseInt(line.getOptionValue('d')); + } + + if (line.hasOption('s')) { + fileSize = Long.parseLong(line.getOptionValue('s')) * 1024 * 1024; + } + + if (line.hasOption('i')) { + fsInputBufferSize = Integer.parseInt(line.getOptionValue('i')); + } + + if (line.hasOption('o')) { + fsOutputBufferSize = Integer.parseInt(line.getOptionValue('o')); + } + + if (line.hasOption('n')) { + seekCount = Integer.parseInt(line.getOptionValue('n')); + } + + if (line.hasOption('k')) { + IntegerRange ir = IntegerRange.parse(line.getOptionValue('k')); + minKeyLen = ir.from(); + maxKeyLen = ir.to(); + } + + if (line.hasOption('v')) { + IntegerRange ir = IntegerRange.parse(line.getOptionValue('v')); + minValLength = ir.from(); + maxValLength = ir.to(); + } + + if (line.hasOption('b')) { + minBlockSize = Integer.parseInt(line.getOptionValue('b')) * 1024; + } + + if (line.hasOption('r')) { + rootDir = line.getOptionValue('r'); + } + + if (line.hasOption('f')) { + file = line.getOptionValue('f'); + } + + if (line.hasOption('S')) { + seed = Long.parseLong(line.getOptionValue('S')); + } + + if (line.hasOption('x')) { + String strOp = line.getOptionValue('x'); + if (strOp.equals("r")) { + op = OP_READ; + } + else if (strOp.equals("w")) { + op = OP_CREATE; + } + else if (strOp.equals("rw")) { + op = OP_CREATE | OP_READ; + } + else { + throw new ParseException("Unknown action specifier: " + strOp); + } + } + + proceed = true; + } + + private void validateOptions() throws ParseException { + if (!compress.equals("none") && !compress.equals("lzo") + && !compress.equals("gz")) { + throw new ParseException("Unknown compression scheme: " + compress); + } + + if (minKeyLen >= maxKeyLen) { + throw new ParseException( + "Max key length must be greater than min key length."); + } + + if (minValLength >= maxValLength) { + throw new ParseException( + "Max value length must be greater than min value length."); + } + + if (minWordLen >= maxWordLen) { + throw new ParseException( + "Max word length must be greater than min word length."); + } + return; + } + + private void setStopProceed() { + proceed = false; + } + + public boolean doCreate() { + return (op & OP_CREATE) != 0; + } + + public boolean doRead() { + return (op & OP_READ) != 0; + } + } + + public static void main(String[] argv) throws IOException { + TestHFileSeek testCase = new TestHFileSeek(); + MyOptions options = new MyOptions(argv); + + if (options.proceed == false) { + return; + } + + testCase.options = options; + testCase.setUp(); + testCase.testSeeks(); + testCase.tearDown(); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java new file mode 100644 index 00000000000..cedfb3e9e07 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java @@ -0,0 +1,146 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Test {@link HFileScanner#seekTo(byte[])} and its variants. + */ +public class TestSeekTo extends TestCase { + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFile"); + + private HBaseConfiguration conf; + private LocalFileSystem fs; + + @Override + public void setUp() { + conf = new HBaseConfiguration(); + RawLocalFileSystem rawLFS = new RawLocalFileSystem(); + rawLFS.setConf(conf); + fs = new LocalFileSystem(rawLFS); + } + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + Path makeNewFile() throws IOException { + Path ncTFile = new Path(ROOT_DIR, "basic.hfile"); + FSDataOutputStream fout = createFSOutput(ncTFile); + HFile.Writer writer = new HFile.Writer(fout, 40, "none", null); + // 4 bytes * 3 * 2 for each key/value + + // 3 for keys, 15 for values = 42 (woot) + writer.append(Bytes.toBytes("c"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("e"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("g"), Bytes.toBytes("value")); + // block transition + writer.append(Bytes.toBytes("i"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("k"), Bytes.toBytes("value")); + writer.close(); + fout.close(); + return ncTFile; + } + public void testSeekBefore() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + HFileScanner scanner = reader.getScanner(); + assertEquals(false, scanner.seekBefore(Bytes.toBytes("a"))); + + assertEquals(false, scanner.seekBefore(Bytes.toBytes("c"))); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("d"))); + assertEquals("c", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("e"))); + assertEquals("c", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("f"))); + assertEquals("e", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("g"))); + assertEquals("e", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("h"))); + assertEquals("g", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("i"))); + assertEquals("g", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("j"))); + assertEquals("i", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("k"))); + assertEquals("i", 
scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("l"))); + assertEquals("k", scanner.getKeyString()); + } + + public void testSeekTo() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + assertEquals(2, reader.blockIndex.count); + HFileScanner scanner = reader.getScanner(); + // lies before the start of the file. + assertEquals(-1, scanner.seekTo(Bytes.toBytes("a"))); + + assertEquals(1, scanner.seekTo(Bytes.toBytes("d"))); + assertEquals("c", scanner.getKeyString()); + + // Across a block boundary now. + assertEquals(1, scanner.seekTo(Bytes.toBytes("h"))); + assertEquals("g", scanner.getKeyString()); + + assertEquals(1, scanner.seekTo(Bytes.toBytes("l"))); + assertEquals("k", scanner.getKeyString()); + } + + public void testBlockContainingKey() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + System.out.println(reader.blockIndex.toString()); + // falls before the start of the file. + assertEquals(-1, reader.blockIndex.blockContainingKey(Bytes.toBytes("a"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("c"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("d"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("e"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("g"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("h"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("i"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("j"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("k"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("l"))); + + + + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java b/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java deleted file mode 100644 index 64d8e0ed6b9..00000000000 --- a/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java +++ /dev/null @@ -1,388 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseTestCase; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HStoreKey; -import org.apache.hadoop.hbase.io.HalfMapFileReader; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.io.WritableComparable; -/** - * Test HStoreFile - */ -public class TestHStoreFile extends HBaseTestCase { - static final Log LOG = LogFactory.getLog(TestHStoreFile.class); - private static String DIR = "/"; - private MiniDFSCluster cluster; - private Path dir = null; - - @Override - public void setUp() throws Exception { - try { - this.cluster = new MiniDFSCluster(this.conf, 2, true, (String[])null); - // Set the hbase.rootdir to be the home directory in mini dfs. - this.conf.set(HConstants.HBASE_DIR, - this.cluster.getFileSystem().getHomeDirectory().toString()); - this.dir = new Path(DIR, getName()); - } catch (IOException e) { - shutdownDfs(cluster); - } - super.setUp(); - } - - @Override - public void tearDown() throws Exception { - super.tearDown(); - shutdownDfs(cluster); - // ReflectionUtils.printThreadInfo(new PrintWriter(System.out), - // "Temporary end-of-test thread dump debugging HADOOP-2040: " + getName()); - } - - private Path writeMapFile(final String name) - throws IOException { - Path path = new Path(DIR, name); - MapFile.Writer writer = new MapFile.Writer(this.conf, fs, path.toString(), - HStoreKey.class, ImmutableBytesWritable.class); - writeStoreFile(writer); - return path; - } - - private Path writeSmallMapFile(final String name) - throws IOException { - Path path = new Path(DIR, name); - MapFile.Writer writer = new MapFile.Writer(this.conf, fs, path.toString(), - HStoreKey.class, ImmutableBytesWritable.class); - try { - for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { - byte[] b = new byte[] {(byte)d}; - byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); - writer.append(new HStoreKey(t, t, System.currentTimeMillis()), - new ImmutableBytesWritable(t)); - } - } finally { - writer.close(); - } - return path; - } - - /* - * Writes HStoreKey and ImmutableBytes data to passed writer and - * then closes it. - * @param writer - * @throws IOException - */ - private void writeStoreFile(final MapFile.Writer writer) - throws IOException { - try { - for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { - for (char e = FIRST_CHAR; e <= LAST_CHAR; e++) { - byte[] b = new byte[] { (byte) d, (byte) e }; - byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); - writer.append(new HStoreKey(t, t, System.currentTimeMillis()), - new ImmutableBytesWritable(t)); - } - } - } finally { - writer.close(); - } - } - - /** - * Test that our mechanism of writing store files in one region to reference - * store files in other regions works. - * @throws IOException - */ - public void testReference() - throws IOException { - // Make a store file and write data to it. 
- HStoreFile hsf = new HStoreFile(this.conf, this.fs, this.dir, - HRegionInfo.FIRST_META_REGIONINFO, - Bytes.toBytes("colfamily"), 1234567890L, null); - MapFile.Writer writer = - hsf.getWriter(this.fs, SequenceFile.CompressionType.NONE, false, 0); - writeStoreFile(writer); - MapFile.Reader reader = hsf.getReader(this.fs, false, false); - // Split on a row, not in middle of row. Midkey returned by reader - // may be in middle of row. Create new one with empty column and - // timestamp. - HStoreKey midkey = new HStoreKey(((HStoreKey)reader.midKey()).getRow()); - HStoreKey hsk = new HStoreKey(); - reader.finalKey(hsk); - byte [] finalKey = hsk.getRow(); - // Make a reference for the bottom half of the just written file. - Reference reference = - new Reference(hsf.getEncodedRegionName(), hsf.getFileId(), - midkey, Reference.Range.top); - HStoreFile refHsf = new HStoreFile(this.conf, this.fs, - new Path(DIR, getName()), - HRegionInfo.FIRST_META_REGIONINFO, - hsf.getColFamily(), 456, reference); - // Assert that reference files are written and that we can write and - // read the info reference file at least. - refHsf.writeReferenceFiles(this.fs); - assertTrue(this.fs.exists(refHsf.getMapFilePath())); - assertTrue(this.fs.exists(refHsf.getInfoFilePath())); - Reference otherReference = - HStoreFile.readSplitInfo(refHsf.getInfoFilePath(), this.fs); - assertEquals(reference.getEncodedRegionName(), - otherReference.getEncodedRegionName()); - assertEquals(reference.getFileId(), - otherReference.getFileId()); - assertEquals(reference.getMidkey().toString(), - otherReference.getMidkey().toString()); - // Now confirm that I can read from the reference and that it only gets - // keys from top half of the file. - MapFile.Reader halfReader = refHsf.getReader(this.fs, false, false); - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - boolean first = true; - while(halfReader.next(key, value)) { - if (first) { - assertTrue(Bytes.equals(key.getRow(), midkey.getRow())); - first = false; - } - } - assertTrue(Bytes.equals(key.getRow(), finalKey)); - } - - /** - * Write a file and then assert that we can read from top and bottom halves - * using two HalfMapFiles. - * @throws Exception - */ - public void testBasicHalfMapFile() throws Exception { - Path p = writeMapFile(getName()); - WritableComparable midkey = getMidkey(p); - checkHalfMapFile(p, midkey); - } - - /** - * Check HalfMapFile works even if file we're to go against is smaller than - * the default MapFile interval of 128: i.e. index gets entry every 128 - * keys. - * @throws Exception - */ - public void testSmallHalfMapFile() throws Exception { - Path p = writeSmallMapFile(getName()); - // I know keys are a-z. Let the midkey we want to use be 'd'. See if - // HalfMapFiles work even if size of file is < than default MapFile - // interval. 
- checkHalfMapFile(p, new HStoreKey("d")); - } - - private WritableComparable getMidkey(final Path p) throws IOException { - MapFile.Reader reader = - new MapFile.Reader(this.fs, p.toString(), this.conf); - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - reader.next(key, value); - String firstKey = key.toString(); - WritableComparable midkey = reader.midKey(); - reader.finalKey(key); - LOG.info("First key " + firstKey + ", midkey " + midkey.toString() - + ", last key " + key.toString()); - reader.close(); - return midkey; - } - - private void checkHalfMapFile(final Path p, WritableComparable midkey) - throws IOException { - MapFile.Reader top = null; - MapFile.Reader bottom = null; - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - String previous = null; - try { - // Now make two HalfMapFiles and assert they can read the full backing - // file, one from the top and the other from the bottom. - // Test bottom half first. - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - boolean first = true; - while (bottom.next(key, value)) { - previous = key.toString(); - if (first) { - first = false; - LOG.info("First in bottom: " + previous); - } - assertTrue(key.compareTo((HStoreKey)midkey) < 0); - } - if (previous != null) { - LOG.info("Last in bottom: " + previous.toString()); - } - // Now test reading from the top. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, midkey, null); - first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo((HStoreKey)midkey) >= 0); - if (first) { - first = false; - LOG.info("First in top: " + key.toString()); - } - } - LOG.info("Last in top: " + key.toString()); - - // Next test using a midkey that does not exist in the file. - // First, do a key that is < than first key. Ensure splits behave - // properly. - WritableComparable badkey = new HStoreKey(" "); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, badkey, null); - // When badkey is < than the bottom, should return no values. - assertFalse(bottom.next(key, value)); - // Now read from the top. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, badkey, null); - first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo((HStoreKey)badkey) >= 0); - if (first) { - first = false; - LOG.info("First top when key < bottom: " + key.toString()); - String tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'a'); - } - } - } - LOG.info("Last top when key < bottom: " + key.toString()); - String tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'z'); - } - - // Test when badkey is > than last key in file ('||' > 'zz'). 
- badkey = new HStoreKey("|||"); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, badkey, null); - first = true; - while (bottom.next(key, value)) { - if (first) { - first = false; - LOG.info("First bottom when key > top: " + key.toString()); - tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'a'); - } - } - } - LOG.info("Last bottom when key > top: " + key.toString()); - tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'z'); - } - // Now look at top. Should not return any values. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, badkey, null); - assertFalse(top.next(key, value)); - - } finally { - if (top != null) { - top.close(); - } - if (bottom != null) { - bottom.close(); - } - fs.delete(p, true); - } - } - - /** - * Assert HalFMapFile does right thing when midkey does not exist in the - * backing file (its larger or smaller than any of the backing mapfiles keys). - * - * @throws Exception - */ - public void testOutOfRangeMidkeyHalfMapFile() throws Exception { - MapFile.Reader top = null; - MapFile.Reader bottom = null; - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - Path p = writeMapFile(getName()); - try { - try { - // Test using a midkey that does not exist in the file. - // First, do a key that is < than first key. Ensure splits behave - // properly. - HStoreKey midkey = new HStoreKey(" "); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - // When midkey is < than the bottom, should return no values. - assertFalse(bottom.next(key, value)); - // Now read from the top. - top = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.top, midkey, null); - boolean first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo(midkey) >= 0); - if (first) { - first = false; - LOG.info("First top when key < bottom: " + key.toString()); - assertEquals("aa", Bytes.toString(key.getRow())); - } - } - LOG.info("Last top when key < bottom: " + key.toString()); - assertEquals("zz", Bytes.toString(key.getRow())); - - // Test when midkey is > than last key in file ('||' > 'zz'). - midkey = new HStoreKey("|||"); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - first = true; - while (bottom.next(key, value)) { - if (first) { - first = false; - LOG.info("First bottom when key > top: " + key.toString()); - assertEquals("aa", Bytes.toString(key.getRow())); - } - } - LOG.info("Last bottom when key > top: " + key.toString()); - assertEquals("zz", Bytes.toString(key.getRow())); - // Now look at top. Should not return any values. 
- top = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.top, midkey, null); - assertFalse(top.next(key, value)); - } finally { - if (top != null) { - top.close(); - } - if (bottom != null) { - bottom.close(); - } - fs.delete(p, true); - } - } finally { - this.fs.delete(p, true); - } - } -} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java b/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java new file mode 100644 index 00000000000..f91d53d49fd --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java @@ -0,0 +1,295 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseTestCase; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.io.Reference.Range; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hdfs.MiniDFSCluster; +/** + * Test HStoreFile + */ +public class TestStoreFile extends HBaseTestCase { + static final Log LOG = LogFactory.getLog(TestStoreFile.class); + private MiniDFSCluster cluster; + + @Override + public void setUp() throws Exception { + try { + this.cluster = new MiniDFSCluster(this.conf, 2, true, (String[])null); + // Set the hbase.rootdir to be the home directory in mini dfs. + this.conf.set(HConstants.HBASE_DIR, + this.cluster.getFileSystem().getHomeDirectory().toString()); + } catch (IOException e) { + shutdownDfs(cluster); + } + super.setUp(); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + shutdownDfs(cluster); + // ReflectionUtils.printThreadInfo(new PrintWriter(System.out), + // "Temporary end-of-test thread dump debugging HADOOP-2040: " + getName()); + } + + /** + * Write a file and then assert that we can read from top and bottom halves + * using two HalfMapFiles. + * @throws Exception + */ + public void testBasicHalfMapFile() throws Exception { + // Make up a directory hierarchy that has a regiondir and familyname. + HFile.Writer writer = StoreFile.getWriter(this.fs, + new Path(new Path(this.testDir, "regionname"), "familyname"), + 2 * 1024, null, null); + writeStoreFile(writer); + checkHalfHFile(new StoreFile(this.fs, writer.getPath())); + } + + /* + * Writes HStoreKey and ImmutableBytes data to passed writer and + * then closes it. 
+ * @param writer + * @throws IOException + */ + private void writeStoreFile(final HFile.Writer writer) + throws IOException { + try { + for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { + for (char e = FIRST_CHAR; e <= LAST_CHAR; e++) { + byte[] b = new byte[] { (byte) d, (byte) e }; + byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); + HStoreKey hsk = new HStoreKey(t, t, System.currentTimeMillis()); + writer.append(hsk.getBytes(), t); + } + } + } finally { + writer.close(); + } + } + + /** + * Test that our mechanism of writing store files in one region to reference + * store files in other regions works. + * @throws IOException + */ + public void testReference() + throws IOException { + Path storedir = new Path(new Path(this.testDir, "regionname"), "familyname"); + Path dir = new Path(storedir, "1234567890"); + // Make a store file and write data to it. + HFile.Writer writer = StoreFile.getWriter(this.fs, dir, 8 * 1024, null, null); + writeStoreFile(writer); + StoreFile hsf = new StoreFile(this.fs, writer.getPath()); + HFile.Reader reader = hsf.getReader(); + // Split on a row, not in middle of row. Midkey returned by reader + // may be in middle of row. Create new one with empty column and + // timestamp. + HStoreKey hsk = HStoreKey.create(reader.midkey()); + byte [] midkey = hsk.getRow(); + hsk = HStoreKey.create(reader.getLastKey()); + byte [] finalKey = hsk.getRow(); + // Make a reference + Path refPath = StoreFile.split(fs, dir, hsf, reader.midkey(), Range.top); + StoreFile refHsf = new StoreFile(this.fs, refPath); + // Now confirm that I can read from the reference and that it only gets + // keys from top half of the file. + HFileScanner s = refHsf.getReader().getScanner(); + for(boolean first = true; (!s.isSeeked() && s.seekTo()) || s.next();) { + ByteBuffer bb = s.getKey(); + hsk = HStoreKey.create(bb.array(), bb.arrayOffset(), bb.limit()); + if (first) { + assertTrue(Bytes.equals(hsk.getRow(), midkey)); + first = false; + } + } + assertTrue(Bytes.equals(hsk.getRow(), finalKey)); + } + + private void checkHalfHFile(final StoreFile f) + throws IOException { + byte [] midkey = f.getReader().midkey(); + // Create top split. + Path topDir = Store.getStoreHomedir(this.testDir, 1, + Bytes.toBytes(f.getPath().getParent().getName())); + if (this.fs.exists(topDir)) { + this.fs.delete(topDir, true); + } + Path topPath = StoreFile.split(this.fs, topDir, f, midkey, Range.top); + // Create bottom split. + Path bottomDir = Store.getStoreHomedir(this.testDir, 2, + Bytes.toBytes(f.getPath().getParent().getName())); + if (this.fs.exists(bottomDir)) { + this.fs.delete(bottomDir, true); + } + Path bottomPath = StoreFile.split(this.fs, bottomDir, + f, midkey, Range.bottom); + // Make readers on top and bottom. + HFile.Reader top = new StoreFile(this.fs, topPath).getReader(); + HFile.Reader bottom = new StoreFile(this.fs, bottomPath).getReader(); + ByteBuffer previous = null; + LOG.info("Midkey: " + Bytes.toString(midkey)); + byte [] midkeyBytes = new HStoreKey(midkey).getBytes(); + ByteBuffer bbMidkeyBytes = ByteBuffer.wrap(midkeyBytes); + try { + // Now make two HalfMapFiles and assert they can read the full backing + // file, one from the top and the other from the bottom. + // Test bottom half first. + // Now test reading from the top. 
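+ // (Invariant checked here: every key served by the top half compares >= the split midkey, + // while the bottom half, scanned next, must stay strictly below it.)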
+ boolean first = true; + ByteBuffer key = null; + HFileScanner topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + (topScanner.isSeeked() && topScanner.next())) { + key = topScanner.getKey(); + + assertTrue(topScanner.getReader().getComparator().compare(key.array(), + key.arrayOffset(), key.limit(), midkeyBytes, 0, midkeyBytes.length) >= 0); + if (first) { + first = false; + LOG.info("First in top: " + Bytes.toString(Bytes.toBytes(key))); + } + } + LOG.info("Last in top: " + Bytes.toString(Bytes.toBytes(key))); + + first = true; + HFileScanner bottomScanner = bottom.getScanner(); + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + previous = bottomScanner.getKey(); + key = bottomScanner.getKey(); + if (first) { + first = false; + LOG.info("First in bottom: " + + Bytes.toString(Bytes.toBytes(previous))); + } + assertTrue(key.compareTo(bbMidkeyBytes) < 0); + } + if (previous != null) { + LOG.info("Last in bottom: " + Bytes.toString(Bytes.toBytes(previous))); + } + // Remove references. + this.fs.delete(topPath, false); + this.fs.delete(bottomPath, false); + + // Next test using a midkey that does not exist in the file. + // First, do a key that is < than first key. Ensure splits behave + // properly. + byte [] badmidkey = Bytes.toBytes(" ."); + topPath = StoreFile.split(this.fs, topDir, f, badmidkey, Range.top); + bottomPath = StoreFile.split(this.fs, bottomDir, f, badmidkey, + Range.bottom); + top = new StoreFile(this.fs, topPath).getReader(); + bottom = new StoreFile(this.fs, bottomPath).getReader(); + bottomScanner = bottom.getScanner(); + int count = 0; + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + count++; + } + // When badkey is < than the bottom, should return no values. + assertTrue(count == 0); + // Now read from the top. + first = true; + topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + topScanner.next()) { + key = topScanner.getKey(); + assertTrue(topScanner.getReader().getComparator().compare(key.array(), + key.arrayOffset(), key.limit(), badmidkey, 0, badmidkey.length) >= 0); + if (first) { + first = false; + first = false; + HStoreKey keyhsk = HStoreKey.create(key); + LOG.info("First top when key < bottom: " + keyhsk); + String tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'a'); + } + } + } + HStoreKey keyhsk = HStoreKey.create(key); + LOG.info("Last top when key < bottom: " + keyhsk); + String tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'z'); + } + // Remove references. + this.fs.delete(topPath, false); + this.fs.delete(bottomPath, false); + + // Test when badkey is > than last key in file ('||' > 'zz'). 
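+ // (A split point that sorts above every key should leave all data in the bottom half -- + // first key all 'a's, last key all 'z's -- and an empty top half.)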
+ badmidkey = Bytes.toBytes("|||"); + topPath = StoreFile.split(this.fs, topDir, f, badmidkey, Range.top); + bottomPath = StoreFile.split(this.fs, bottomDir, f, badmidkey, + Range.bottom); + top = new StoreFile(this.fs, topPath).getReader(); + bottom = new StoreFile(this.fs, bottomPath).getReader(); + first = true; + bottomScanner = bottom.getScanner(); + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + key = bottomScanner.getKey(); + if (first) { + first = false; + keyhsk = HStoreKey.create(key); + LOG.info("First bottom when key > top: " + keyhsk); + tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'a'); + } + } + } + keyhsk = HStoreKey.create(key); + LOG.info("Last bottom when key > top: " + keyhsk); + tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'z'); + } + count = 0; + topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + (topScanner.isSeeked() && topScanner.next())) { + count++; + } + // When badkey sorts after the last key in the file, the top should return no values. + assertTrue(count == 0); + } finally { + if (top != null) { + top.close(); + } + if (bottom != null) { + bottom.close(); + } + fs.delete(f.getPath(), true); + } + } +} \ No newline at end of file