HBASE-13109 Make better SEEK vs SKIP decisions during scanning.

2015-03-04 14:03:47 -08:00 · 2015-03-04 14:03:47 -08:00 · 464e7ce685
parent 883d6fd8e5
commit 464e7ce685
18 changed files with 213 additions and 150 deletions
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Scan.java
@ -93,23 +93,6 @@ public class Scan extends Query {

  private static final String RAW_ATTR = "_raw_";

-  /**
-   * EXPERT ONLY.
-   * An integer (not long) indicating to the scanner logic how many times we attempt to retrieve the
-   * next KV before we schedule a reseek.
-   * The right value depends on the size of the average KV. A reseek is more efficient when
-   * it can skip 5-10 KVs or 512B-1KB, or when the next KV is likely found in another HFile block.
-   * Setting this only has any effect when columns were added with
-   * {@link #addColumn(byte[], byte[])}
-   * <pre>{@code
-   * Scan s = new Scan(...);
-   * s.addColumn(...);
-   * s.setAttribute(Scan.HINT_LOOKAHEAD, Bytes.toBytes(2));
-   * }</pre>
-   * Default is 0 (always reseek).
-   */
-  public static final String HINT_LOOKAHEAD = "_look_ahead_";
-
  private byte [] startRow = HConstants.EMPTY_START_ROW;
  private byte [] stopRow  = HConstants.EMPTY_END_ROW;
  private int maxVersions = 1;
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@ -953,7 +953,7 @@ public final class HConstants {
   * The Cell that represents NO_NEXT_INDEXED_KEY;
   * The actual value is irrelevant because this is always compared by reference.
   */
-  public static final byte [] NO_NEXT_INDEXED_KEY = Bytes.toBytes("NO_NEXT_INDEXED_KEY");
+  public static final Cell NO_NEXT_INDEXED_KEY = new KeyValue();
  /** delimiter used between portions of a region name */
  public static final int DELIMITER = ',';
  public static final String HBASE_CONFIG_READ_ZOOKEEPER_CONFIG =
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/KeyValue.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/KeyValue.java
@ -268,9 +268,9 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,

  ////
  // KeyValue core instance fields.
-  private byte [] bytes = null;  // an immutable byte array that contains the KV
-  private int offset = 0;  // offset into bytes buffer KV starts at
-  private int length = 0;  // length of the KV starting from offset.
+  protected byte [] bytes = null;  // an immutable byte array that contains the KV
+  protected int offset = 0;  // offset into bytes buffer KV starts at
+  protected int length = 0;  // length of the KV starting from offset.

  /**
   * @return True if a delete type, a {@link KeyValue.Type#Delete} or
@ -1896,6 +1896,58 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,
      return compareFlatKey(left, 0, left.length, right, 0, right.length);
    }

+    // Compare the key of a Cell (left) against row/fam/qual/ts/type passed separately (right).
+    public int compareKey(Cell cell,
+        byte[] row, int roff, int rlen, // right-side row: array, offset, length
+        byte[] fam, int foff, int flen, // right-side family: array, offset, length
+        byte[] col, int coff, int clen, // right-side qualifier: array, offset, length
+        long ts, byte type) { // right-side timestamp and key type code
+
+      int compare = compareRows(
+        cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
+        row, roff, rlen);
+      if (compare != 0) {
+        return compare;
+      }
+      // If the column is not specified, the "minimum" key type appears the
+      // latest in the sorted order, regardless of the timestamp. This is used
+      // for specifying the last key/value in a given row, because there is no
+      // "lexicographically last column" (it would be infinitely long). The
+      // "maximum" key type does not need this behavior.
+      if (cell.getFamilyLength() + cell.getQualifierLength() == 0
+          && cell.getTypeByte() == Type.Minimum.getCode()) {
+        // left is "bigger", i.e. it appears later in the sorted order
+        return 1;
+      }
+      if (flen+clen == 0 && type == Type.Minimum.getCode()) {
+        return -1; // mirror case: the right side is the "last key in row" marker, so left sorts first
+      }
+
+      compare = compareFamilies(
+        cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
+        fam, foff, flen);
+      if (compare != 0) {
+        return compare;
+      }
+      compare = compareColumns(
+        cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
+        col, coff, clen);
+      if (compare != 0) {
+        return compare;
+      }
+      // Next compare timestamps.
+      compare = compareTimestamps(cell.getTimestamp(), ts);
+      if (compare != 0) {
+        return compare;
+      }
+
+      // Compare types. Let the delete types sort ahead of puts; i.e. types
+      // of higher numbers sort before those of lesser numbers. Maximum (255)
+      // appears ahead of everything, and minimum (0) appears after
+      // everything.
+      return (0xff & type) - (0xff & cell.getTypeByte());
+    }
+
    public int compareOnlyKeyPortion(Cell left, Cell right) {
      return CellComparator.compare(left, right, true);
    }
@ -2595,16 +2647,15 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,
   * Hence create a Keyvalue(aka Cell) that would help in comparing as two cells
   */
  public static class KeyOnlyKeyValue extends KeyValue {
-    private int length = 0;
-    private int offset = 0;
-    private byte[] b;
-
    public KeyOnlyKeyValue() {

    }
+    public KeyOnlyKeyValue(byte[] b) {
+      this(b, 0, b.length); // use the whole array: offset 0, length b.length
+    }

    public KeyOnlyKeyValue(byte[] b, int offset, int length) {
-      this.b = b;
+      this.bytes = b;
      this.length = length;
      this.offset = offset;
    }
@ -2622,7 +2673,7 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,
     * @param length
     */
    public void setKey(byte[] key, int offset, int length) {
-      this.b = key;
+      this.bytes = key;
      this.offset = offset;
      this.length = length;
    }
@ -2631,13 +2682,13 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,
    public byte[] getKey() {
      int keylength = getKeyLength();
      byte[] key = new byte[keylength];
-      System.arraycopy(this.b, getKeyOffset(), key, 0, keylength);
+      System.arraycopy(this.bytes, getKeyOffset(), key, 0, keylength);
      return key;
    }

    @Override
    public byte[] getRowArray() {
-      return b;
+      return bytes;
    }

    @Override
@ -2647,12 +2698,12 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,

    @Override
    public byte[] getFamilyArray() {
-      return b;
+      return bytes;
    }

    @Override
    public byte getFamilyLength() {
-      return this.b[getFamilyOffset() - 1];
+      return this.bytes[getFamilyOffset() - 1];
    }

    @Override
@ -2662,7 +2713,7 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,

    @Override
    public byte[] getQualifierArray() {
-      return b;
+      return bytes;
    }

    @Override
@ -2682,12 +2733,12 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,

    @Override
    public short getRowLength() {
-      return Bytes.toShort(this.b, getKeyOffset());
+      return Bytes.toShort(this.bytes, getKeyOffset());
    }

    @Override
    public byte getTypeByte() {
-      return this.b[this.offset + getKeyLength() - 1];
+      return this.bytes[this.offset + getKeyLength() - 1];
    }

    private int getQualifierLength(int rlength, int flength) {
@ -2697,7 +2748,7 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,
    @Override
    public long getTimestamp() {
      int tsOffset = getTimestampOffset();
-      return Bytes.toLong(this.b, tsOffset);
+      return Bytes.toLong(this.bytes, tsOffset);
    }

    @Override
@ -2737,10 +2788,10 @@ public class KeyValue implements Cell, HeapSize, Cloneable, SettableSequenceId,

    @Override
    public String toString() {
-      if (this.b == null || this.b.length == 0) {
+      if (this.bytes == null || this.bytes.length == 0) {
        return "empty";
      }
-      return keyToString(this.b, this.offset, getKeyLength()) + "/vlen=0/mvcc=0";
+      return keyToString(this.bytes, this.offset, getKeyLength()) + "/vlen=0/mvcc=0";
    }

    @Override
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java
@ -317,6 +317,11 @@ public class HalfStoreFileReader extends StoreFile.Reader {
        }
        return ret;
      }
+
+      @Override
+      public Cell getNextIndexedKey() {
+        return null; // this scanner never reports a next indexed key
+      }
    };
  }
  
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockWithScanInfo.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockWithScanInfo.java
@ -17,6 +17,8 @@
 */
 package org.apache.hadoop.hbase.io.hfile;

+import org.apache.hadoop.hbase.Cell;
+
 /**
 * BlockWithScanInfo is wrapper class for HFileBlock with other attributes. These attributes are
 * supposed to be much cheaper to be maintained in each caller thread than in HFileBlock itself.
@ -27,9 +29,9 @@ public class BlockWithScanInfo {
   * The first key in the next block following this one in the HFile.
   * If this key is unknown, this is reference-equal with HConstants.NO_NEXT_INDEXED_KEY
   */
-  private final byte[] nextIndexedKey;
+  private final Cell nextIndexedKey;

-  public BlockWithScanInfo(HFileBlock hFileBlock, byte[] nextIndexedKey) {
+  public BlockWithScanInfo(HFileBlock hFileBlock, Cell nextIndexedKey) {
    this.hFileBlock = hFileBlock;
    this.nextIndexedKey = nextIndexedKey;
  }
@ -38,7 +40,7 @@ public class BlockWithScanInfo {
    return hFileBlock;
  }

-  public byte[] getNextIndexedKey() {
+  public  Cell getNextIndexedKey() {
    return nextIndexedKey;
  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
@ -218,14 +218,14 @@ public class HFileBlockIndex {
      }

      // the next indexed key
-      byte[] nextIndexedKey = null;
+      Cell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
-        nextIndexedKey = blockKeys[rootLevelIndex + 1];
+        nextIndexedKey = new KeyValue.KeyOnlyKeyValue(blockKeys[rootLevelIndex + 1]);
      } else {
        nextIndexedKey = HConstants.NO_NEXT_INDEXED_KEY;
      }
@ -298,7 +298,7 @@ public class HFileBlockIndex {
        // Only update next indexed key if there is a next indexed key in the current level
        byte[] tmpNextIndexedKey = getNonRootIndexedKey(buffer, index + 1);
        if (tmpNextIndexedKey != null) {
-          nextIndexedKey = tmpNextIndexedKey;
+          nextIndexedKey = new KeyValue.KeyOnlyKeyValue(tmpNextIndexedKey);
        }
      }

--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderV2.java
@ -542,6 +542,10 @@ public class HFileReaderV2 extends AbstractHFileReader {
      extends AbstractHFileReader.Scanner {
    protected HFileBlock block;

+    @Override
+    public Cell getNextIndexedKey() {
+      return nextIndexedKey; // may be null (not loaded yet) or HConstants.NO_NEXT_INDEXED_KEY
+    }
    /**
     * The next indexed key is to keep track of the indexed key of the next data block.
     * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
@ -549,7 +553,7 @@ public class HFileReaderV2 extends AbstractHFileReader {
     *
     * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
     */
-    protected byte[] nextIndexedKey;
+    protected Cell nextIndexedKey;

    public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
        final boolean pread, final boolean isCompaction) {
@ -558,7 +562,7 @@ public class HFileReaderV2 extends AbstractHFileReader {

    protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);

-    protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
+    protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, Cell nextIndexedKey,
        boolean rewind, Cell key, boolean seekBefore) throws IOException;

    @Override
@ -592,9 +596,7 @@ public class HFileReaderV2 extends AbstractHFileReader {
          if (this.nextIndexedKey != null &&
              (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY || reader
              .getComparator()
-                  .compareOnlyKeyPortion(key,
-                      new KeyValue.KeyOnlyKeyValue(nextIndexedKey, 0,
-                          nextIndexedKey.length)) < 0)) {
+                  .compareOnlyKeyPortion(key, nextIndexedKey) < 0)) {
            // The reader shall continue to scan the current data block instead
            // of querying the
            // block index as long as it knows the target key is strictly
@ -672,7 +674,7 @@ public class HFileReaderV2 extends AbstractHFileReader {
        // TODO shortcut: seek forward in this block to the last key of the
        // block.
      }
-      byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
+      Cell firstKeyInCurrentBlock = new KeyValue.KeyOnlyKeyValue(Bytes.getBytes(firstKey));
      loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, true);
      return true;
    }
@ -877,7 +879,7 @@ public class HFileReaderV2 extends AbstractHFileReader {
    }

    @Override
-    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
+    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, Cell nextIndexedKey,
        boolean rewind, Cell key, boolean seekBefore) throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrBlock(seekToBlock);
@ -1234,7 +1236,7 @@ public class HFileReaderV2 extends AbstractHFileReader {
    }

    @Override
-    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
+    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, Cell nextIndexedKey,
        boolean rewind, Cell key, boolean seekBefore) throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrentBlock(seekToBlock);
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java
@ -156,4 +156,9 @@ public interface HFileScanner {
   * Otherwise returns false.
   */
  boolean isSeeked();
+
+  /**
+   * @return the next key in the index (the key to seek to the next block)
+   */
+  Cell getNextIndexedKey();
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/ExplicitColumnTracker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/ExplicitColumnTracker.java
@ -56,10 +56,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
  private final int maxVersions;
  private final int minVersions;

-  // hint for the tracker about how many KVs we will attempt to search via next()
-  // before we schedule a (re)seek operation
-  private final int lookAhead; 
-
 /**
  * Contains the list of columns that the ExplicitColumnTracker is tracking.
  * Each ColumnCount instance also tracks how many versions of the requested
@ -72,7 +68,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
   * Used to eliminate duplicates. */
  private long latestTSOfCurrentColumn;
  private long oldestStamp;
-  private int skipCount;

  /**
   * Default constructor.
@ -85,10 +80,9 @@ public class ExplicitColumnTracker implements ColumnTracker {
   *  (re)seeking
   */
  public ExplicitColumnTracker(NavigableSet<byte[]> columns, int minVersions,
-      int maxVersions, long oldestUnexpiredTS, int lookAhead) {
+      int maxVersions, long oldestUnexpiredTS) {
    this.maxVersions = maxVersions;
    this.minVersions = minVersions;
-    this.lookAhead = lookAhead;
    this.oldestStamp = oldestUnexpiredTS;
    this.columns = new ColumnCount[columns.size()];
    int i=0;
@ -144,8 +138,7 @@ public class ExplicitColumnTracker implements ColumnTracker {
      if (ret > 0) {
        // The current KV is smaller than the column the ExplicitColumnTracker
        // is interested in, so seek to that column of interest.
-        return this.skipCount++ < this.lookAhead ? ScanQueryMatcher.MatchCode.SKIP
-            : ScanQueryMatcher.MatchCode.SEEK_NEXT_COL;
+        return ScanQueryMatcher.MatchCode.SEEK_NEXT_COL;
      }

      // The current KV is bigger than the column the ExplicitColumnTracker
@ -154,7 +147,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
      // column of interest, and check again.
      if (ret <= -1) {
        ++this.index;
-        this.skipCount = 0;
        if (done()) {
          // No more to match, do not include, done with this row.
          return ScanQueryMatcher.MatchCode.SEEK_NEXT_ROW; // done_row
@ -179,7 +171,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
    if (count >= maxVersions || (count >= minVersions && isExpired(timestamp))) {
      // Done with versions for this column
      ++this.index;
-      this.skipCount = 0;
      resetTS();
      if (done()) {
        // We have served all the requested columns.
@ -198,7 +189,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
  // Called between every row.
  public void reset() {
    this.index = 0;
-    this.skipCount = 0;
    this.column = this.columns[this.index];
    for(ColumnCount col : this.columns) {
      col.setCount(0);
@ -238,7 +228,6 @@ public class ExplicitColumnTracker implements ColumnTracker {
      resetTS();
      if (compare <= 0) {
        ++this.index;
-        this.skipCount = 0;
        if (done()) {
          // Will not hit any more columns in this storefile
          this.column = null;
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java
@ -395,4 +395,10 @@ public class KeyValueHeap extends NonReversedNonLazyKeyValueScanner
  KeyValueScanner getCurrentForTesting() {
    return current;
  }
+
+  @Override
+  public Cell getNextIndexedKey() {
+    // Delegate to the current (top) scanner; null when there is no current scanner.
+    return current == null ? null : current.getNextIndexedKey();
+  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java
@ -156,4 +156,10 @@ public interface KeyValueScanner {
   * @throws IOException
   */
  public boolean seekToLastRow() throws IOException;
+
+  /**
+   * @return the next key in the index (the key to seek to the next block)
+   * if known, or null otherwise
+   */
+  public Cell getNextIndexedKey();
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java
@ -67,4 +67,8 @@ public abstract class NonLazyKeyValueScanner implements KeyValueScanner {
    // Not a file by default.
    return false;
  }
+  @Override
+  public Cell getNextIndexedKey() {
+    return null; // default: the next indexed key is not known
+  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java
@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.regionserver;
 import java.io.IOException;
 import java.util.NavigableSet;

+import org.apache.hadoop.hbase.KeyValue.Type;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.CellUtil;
@ -204,9 +205,8 @@ public class ScanQueryMatcher {

      // We can share the ExplicitColumnTracker, diff is we reset
      // between rows, not between storefiles.
-      byte[] attr = scan.getAttribute(Scan.HINT_LOOKAHEAD);
      this.columns = new ExplicitColumnTracker(columns, scanInfo.getMinVersions(), maxVersions,
-          oldestUnexpiredTS, attr == null ? 0 : Bytes.toInt(attr));
+          oldestUnexpiredTS);
    }
    this.isReversed = scan.isReversed();
  }
@ -577,6 +577,45 @@ public class ScanQueryMatcher {
        null, 0, 0);
  }

+  /**
+   * Compares the next indexed key with a key made of the row of the passed cell, no family or
+   * qualifier, OLDEST_TIMESTAMP and Type.Minimum (the form used for the last key of a row).
+   * @param nextIndexed the key of the next entry in the block index (if any)
+   * @param kv The Cell we're using to calculate the seek key
+   * @return result of comparing the indexed key (left) with that seek key (right)
+   */
+  public int compareKeyForNextRow(Cell nextIndexed, Cell kv) {
+    return rowComparator.compareKey(nextIndexed,
+      kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(),
+      null, 0, 0,
+      null, 0, 0,
+      HConstants.OLDEST_TIMESTAMP, Type.Minimum.getCode());
+  }
+
+  /**
+   * Compares the next indexed key with a key built from the row and family of the passed cell and
+   * either the column tracker's hint column or, when there is no hint, the cell's own qualifier.
+   * @param nextIndexed the key of the next entry in the block index (if any)
+   * @param kv The Cell we're using to calculate the seek key
+   * @return result of comparing the indexed key (left) with that seek key (right)
+   */
+  public int compareKeyForNextColumn(Cell nextIndexed, Cell kv) {
+    ColumnCount nextColumn = columns.getColumnHint();
+    if (nextColumn == null) { // no hint: use the cell's own column at OLDEST_TIMESTAMP/Minimum
+      return rowComparator.compareKey(nextIndexed,
+        kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(),
+        kv.getFamilyArray(), kv.getFamilyOffset(), kv.getFamilyLength(),
+        kv.getQualifierArray(), kv.getQualifierOffset(), kv.getQualifierLength(),
+        HConstants.OLDEST_TIMESTAMP, Type.Minimum.getCode());
+    } else { // hint available: use the hinted column at LATEST_TIMESTAMP/Maximum
+      return rowComparator.compareKey(nextIndexed,
+        kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(),
+        kv.getFamilyArray(), kv.getFamilyOffset(), kv.getFamilyLength(),
+        nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength(),
+        HConstants.LATEST_TIMESTAMP, Type.Maximum.getCode());
+    }
+  }
+
  //Used only for testing purposes
  static MatchCode checkColumn(ColumnTracker columnTracker, byte[] bytes, int offset,
      int length, long ttl, byte type, boolean ignoreCount) throws IOException {
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java
@ -484,4 +484,9 @@ public class StoreFileScanner implements KeyValueScanner {
    }
    return true;
  }
+
+  @Override
+  public Cell getNextIndexedKey() {
+    return hfs.getNextIndexedKey(); // delegates to hfs (presumably the HFileScanner; confirm)
+  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java
@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.client.IsolationLevel;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.executor.ExecutorService;
 import org.apache.hadoop.hbase.filter.Filter;
+import org.apache.hadoop.hbase.regionserver.ScanQueryMatcher.MatchCode;
 import org.apache.hadoop.hbase.regionserver.handler.ParallelSeekHandler;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@ -494,6 +495,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner
      prevCell = cell;

      ScanQueryMatcher.MatchCode qcode = matcher.match(cell);
+      qcode = optimize(qcode, cell);
      switch(qcode) {
        case INCLUDE:
        case INCLUDE_AND_SEEK_NEXT_ROW:
@ -596,6 +598,38 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner
    }
  }

+  /*
+   * Turn SEEK_NEXT_COL/ROW into SKIP (and INCLUDE_AND_SEEK_NEXT_COL/ROW into INCLUDE) when the next
+   * indexed key is not before the key we would seek to; else return qcode as is. (see HBASE-13109)
+   */
+  private ScanQueryMatcher.MatchCode optimize(ScanQueryMatcher.MatchCode qcode, Cell cell) {
+    Cell nextIndexedKey = getNextIndexedKey();
+    if (nextIndexedKey == null || nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY || store == null) {
+      return qcode; // no known next indexed key, or no store: nothing to compare against
+    }
+    switch(qcode) {
+    case INCLUDE_AND_SEEK_NEXT_COL:
+    case SEEK_NEXT_COL:
+    { // column-level seeks: compare against the next-column seek key
+      if (matcher.compareKeyForNextColumn(nextIndexedKey, cell) >= 0) {
+        return qcode == MatchCode.SEEK_NEXT_COL ? MatchCode.SKIP : MatchCode.INCLUDE;
+      }
+      break;
+    }
+    case INCLUDE_AND_SEEK_NEXT_ROW:
+    case SEEK_NEXT_ROW:
+    { // row-level seeks: compare against the next-row seek key
+      if (matcher.compareKeyForNextRow(nextIndexedKey, cell) >= 0) {
+        return qcode == MatchCode.SEEK_NEXT_ROW ? MatchCode.SKIP : MatchCode.INCLUDE;
+      }
+      break;
+    }
+    default: // every other match code is returned unchanged
+      break;
+    }
+    return qcode;
+  }
+
  @Override
  public boolean next(List<Cell> outResult) throws IOException {
    return next(outResult, -1);
@ -799,5 +833,10 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner
  public long getEstimatedNumberOfKvsScanned() {
    return this.kvsScanned;
  }
+
+  @Override
+  public Cell getNextIndexedKey() {
+    return this.heap.getNextIndexedKey(); // delegates to this.heap
+  }
 }

--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java
@ -271,7 +271,7 @@ public class TestBlocksRead extends HBaseTestCase {
  }

  /**
-   * Test # of blocks read (targetted at some of the cases Lazy Seek optimizes).
+   * Test # of blocks read (targeted at some of the cases Lazy Seek optimizes).
   *
   * @throws Exception
   */
@ -356,8 +356,8 @@ public class TestBlocksRead extends HBaseTestCase {
      putData(FAMILY, "row", "col3", 9);
      region.flushcache();

-      // Baseline expected blocks read: 8. [HBASE-4532]
-      kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 5);
+      // Baseline expected blocks read: 6. [HBASE-4532]
+      kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 6, 7, 7);
      assertEquals(0, kvs.length);
 
      // File 7: Put back new data
@ -367,8 +367,8 @@ public class TestBlocksRead extends HBaseTestCase {
      region.flushcache();


-      // Expected blocks read: 5. [HBASE-4585]
-      kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 5);
+      // Expected blocks read: 8. [HBASE-4585, HBASE-13109]
+      kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 8, 9, 9);
      assertEquals(3, kvs.length);
      verifyData(kvs[0], "row", "col1", 11);
      verifyData(kvs[1], "row", "col2", 12);
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestExplicitColumnTracker.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestExplicitColumnTracker.java
@ -48,9 +48,9 @@ public class TestExplicitColumnTracker {
  private void runTest(int maxVersions,
                       TreeSet<byte[]> trackColumns,
                       List<byte[]> scannerColumns,
-                       List<MatchCode> expected, int lookAhead) throws IOException {
+                       List<MatchCode> expected) throws IOException {
    ColumnTracker exp = new ExplicitColumnTracker(
-      trackColumns, 0, maxVersions, Long.MIN_VALUE, lookAhead);
+      trackColumns, 0, maxVersions, Long.MIN_VALUE);


    //Initialize result
@ -92,7 +92,7 @@ public class TestExplicitColumnTracker {
    scanner.add(col4);
    scanner.add(col5);

-    runTest(maxVersions, columns, scanner, expected, 0);
+    runTest(maxVersions, columns, scanner, expected);
  }

  @Test
@ -144,59 +144,7 @@ public class TestExplicitColumnTracker {
    scanner.add(col5);

    //Initialize result
-    runTest(maxVersions, columns, scanner, expected, 0);
-  }
-
-  @Test
-  public void testGet_MultiVersionWithLookAhead() throws IOException{
-    //Create tracker
-    TreeSet<byte[]> columns = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
-    //Looking for every other
-    columns.add(col2);
-    columns.add(col4);
-
-    List<ScanQueryMatcher.MatchCode> expected = new ArrayList<ScanQueryMatcher.MatchCode>();
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_COL);
-
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE);                   // col2; 1st version
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_COL); // col2; 2nd version
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_COL);
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_COL);
-
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE);                   // col4; 1st version
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_ROW); // col4; 2nd version
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_ROW);
-
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_ROW);
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_ROW);
-    expected.add(ScanQueryMatcher.MatchCode.SEEK_NEXT_ROW);
-    int maxVersions = 2;
-
-    //Create "Scanner"
-    List<byte[]> scanner = new ArrayList<byte[]>();
-    scanner.add(col1);
-    scanner.add(col1);
-    scanner.add(col1);
-    scanner.add(col2);
-    scanner.add(col2);
-    scanner.add(col2);
-    scanner.add(col3);
-    scanner.add(col3);
-    scanner.add(col3);
-    scanner.add(col4);
-    scanner.add(col4);
-    scanner.add(col4);
-    scanner.add(col5);
-    scanner.add(col5);
-    scanner.add(col5);
-
-    //Initialize result
-    runTest(maxVersions, columns, scanner, expected, 2);
+    runTest(maxVersions, columns, scanner, expected);
  }

  /**
@ -211,7 +159,7 @@ public class TestExplicitColumnTracker {
    }

    ColumnTracker explicit = new ExplicitColumnTracker(columns, 0, maxVersions,
-        Long.MIN_VALUE, 0);
+        Long.MIN_VALUE);
    for (int i = 0; i < 100000; i+=2) {
      byte [] col = Bytes.toBytes("col"+i);
      ScanQueryMatcher.checkColumn(explicit, col, 0, col.length, 1, KeyValue.Type.Put.getCode(),
@ -240,7 +188,7 @@ public class TestExplicitColumnTracker {
      new ScanQueryMatcher.MatchCode[] {
        ScanQueryMatcher.MatchCode.SEEK_NEXT_COL,
        ScanQueryMatcher.MatchCode.SEEK_NEXT_COL });
-    runTest(1, columns, scanner, expected, 0);
+    runTest(1, columns, scanner, expected);
  }

 }
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestQueryMatcher.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestQueryMatcher.java
@ -147,27 +147,6 @@ public class TestQueryMatcher extends HBaseTestCase {
    _testMatch_ExplicitColumns(scan, expected);
  }

-  @Test
-  public void testMatch_ExplicitColumnsWithLookAhead()
-  throws IOException {
-    //Moving up from the Tracker by using Gets and List<KeyValue> instead
-    //of just byte []
-
-    //Expected result
-    List<MatchCode> expected = new ArrayList<ScanQueryMatcher.MatchCode>();
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_COL);
-    expected.add(ScanQueryMatcher.MatchCode.SKIP);
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_COL);
-    expected.add(ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_ROW);
-    expected.add(ScanQueryMatcher.MatchCode.DONE);
-
-    Scan s = new Scan(scan);
-    s.setAttribute(Scan.HINT_LOOKAHEAD, Bytes.toBytes(2));
-    _testMatch_ExplicitColumns(s, expected);
-  }
-
-
  @Test
  public void testMatch_Wildcard()
  throws IOException {