HBASE-2324 Refactoring of TableRecordReader (mapred / mapreduce) for reuse outside the scope of InputSplit / RecordReader

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@923404 13f79535-47bb-0310-9956-ffa450edef68
2010-03-15 19:37:48 +00:00 · 2010-03-15 19:37:48 +00:00 · 9013c837e8
parent 11999bd2b3
commit 9013c837e8
7 changed files with 645 additions and 306 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -434,6 +434,8 @@ Release 0.21.0 - Unreleased
               (Kay Kay via Stack)
   HBASE-2279  Hbase Shell does not have any tests (Alexey Kovyrin via Stack)
   HBASE-2314  [shell] Support for getting counters (Alexey Kovyrin via Stack)
+   HBASE-2324  Refactoring of TableRecordReader (mapred / mapreduce) for reuse
+               outside the scope of InputSplit / RecordReader (Kay Kay via Stack)

  NEW FEATURES
   HBASE-1961  HBase EC2 scripts
--- a/core/src/main/java/org/apache/hadoop/hbase/mapred/TableInputFormatBase.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapred/TableInputFormatBase.java
@ -24,23 +24,16 @@ import java.io.IOException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.UnknownScannerException;
-import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.filter.Filter;
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
 import org.apache.hadoop.hbase.regionserver.HRegion;
-import org.apache.hadoop.hbase.util.Writables;
-import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.mapred.InputFormat;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.util.StringUtils;

 /**
 * A Base for {@link TableInputFormat}s. Receives a {@link HTable}, a
@ -80,157 +73,6 @@ implements InputFormat<ImmutableBytesWritable, Result> {
  private TableRecordReader tableRecordReader;
  private Filter rowFilter;

-  /**
-   * Iterate over an HBase table data, return (Text, RowResult) pairs
-   */
-  protected class TableRecordReader
-  implements RecordReader<ImmutableBytesWritable, Result> {
-    private byte [] startRow;
-    private byte [] endRow;
-    private byte [] lastRow;
-    private Filter trrRowFilter;
-    private ResultScanner scanner;
-    private HTable htable;
-    private byte [][] trrInputColumns;
-
-    /**
-     * Restart from survivable exceptions by creating a new scanner.
-     *
-     * @param firstRow
-     * @throws IOException
-     */
-    public void restart(byte[] firstRow) throws IOException {
-      if ((endRow != null) && (endRow.length > 0)) {
-        if (trrRowFilter != null) {
-          Scan scan = new Scan(firstRow, endRow);
-          scan.addColumns(trrInputColumns);
-          scan.setFilter(trrRowFilter);
-          this.scanner = this.htable.getScanner(scan);
-        } else {
-          LOG.debug("TIFB.restart, firstRow: " +
-              Bytes.toStringBinary(firstRow) + ", endRow: " +
-              Bytes.toStringBinary(endRow));
-          Scan scan = new Scan(firstRow, endRow);
-          scan.addColumns(trrInputColumns);
-          this.scanner = this.htable.getScanner(scan);
-        }
-      } else {
-        LOG.debug("TIFB.restart, firstRow: " +
-            Bytes.toStringBinary(firstRow) + ", no endRow");
-
-        Scan scan = new Scan(firstRow);
-        scan.addColumns(trrInputColumns);
-//        scan.setFilter(trrRowFilter);
-        this.scanner = this.htable.getScanner(scan);
-      }
-    }
-
-    /**
-     * Build the scanner. Not done in constructor to allow for extension.
-     *
-     * @throws IOException
-     */
-    public void init() throws IOException {
-      restart(startRow);
-    }
-
-    /**
-     * @param htable the {@link HTable} to scan.
-     */
-    public void setHTable(HTable htable) {
-      this.htable = htable;
-    }
-
-    /**
-     * @param inputColumns the columns to be placed in {@link Result}.
-     */
-    public void setInputColumns(final byte [][] inputColumns) {
-      this.trrInputColumns = inputColumns;
-    }
-
-    /**
-     * @param startRow the first row in the split
-     */
-    public void setStartRow(final byte [] startRow) {
-      this.startRow = startRow;
-    }
-
-    /**
-     *
-     * @param endRow the last row in the split
-     */
-    public void setEndRow(final byte [] endRow) {
-      this.endRow = endRow;
-    }
-
-    /**
-     * @param rowFilter the {@link Filter} to be used.
-     */
-    public void setRowFilter(Filter rowFilter) {
-      this.trrRowFilter = rowFilter;
-    }
-
-    public void close() {
-      this.scanner.close();
-    }
-
-    /**
-     * @return ImmutableBytesWritable
-     *
-     * @see org.apache.hadoop.mapred.RecordReader#createKey()
-     */
-    public ImmutableBytesWritable createKey() {
-      return new ImmutableBytesWritable();
-    }
-
-    /**
-     * @return RowResult
-     *
-     * @see org.apache.hadoop.mapred.RecordReader#createValue()
-     */
-    public Result createValue() {
-      return new Result();
-    }
-
-    public long getPos() {
-      // This should be the ordinal tuple in the range;
-      // not clear how to calculate...
-      return 0;
-    }
-
-    public float getProgress() {
-      // Depends on the total number of tuples and getPos
-      return 0;
-    }
-
-    /**
-     * @param key HStoreKey as input key.
-     * @param value MapWritable as input value
-     * @return true if there was more data
-     * @throws IOException
-     */
-    public boolean next(ImmutableBytesWritable key, Result value)
-    throws IOException {
-      Result result;
-      try {
-        result = this.scanner.next();
-      } catch (UnknownScannerException e) {
-        LOG.debug("recovered from " + StringUtils.stringifyException(e));
-        restart(lastRow);
-        this.scanner.next();    // skip presumed already mapped row
-        result = this.scanner.next();
-      }
-
-      if (result != null && result.size() > 0) {
-        key.set(result.getRow());
-        lastRow = key.get();
-        Writables.copyWritable(result, value);
-        return true;
-      }
-      return false;
-    }
-  }
-
  /**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses
   * the default.
--- a/core/src/main/java/org/apache/hadoop/hbase/mapred/TableRecordReader.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapred/TableRecordReader.java
@ -0,0 +1,138 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapred;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.filter.Filter;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.mapred.RecordReader;
+
+
+/**
+ * Iterate over an HBase table data, return (Text, RowResult) pairs
+ */
+public class TableRecordReader
+implements RecordReader<ImmutableBytesWritable, Result> {
+
+  private TableRecordReaderImpl recordReaderImpl = new TableRecordReaderImpl();
+  
+  /**
+   * Restart from survivable exceptions by creating a new scanner.
+   *
+   * @param firstRow
+   * @throws IOException
+   */
+  public void restart(byte[] firstRow) throws IOException {
+    this.recordReaderImpl.restart(firstRow);
+  }
+
+  /**
+   * Build the scanner. Not done in constructor to allow for extension.
+   *
+   * @throws IOException
+   */
+  public void init() throws IOException {
+    this.recordReaderImpl.restart(this.recordReaderImpl.getStartRow());
+  }
+
+  /**
+   * @param htable the {@link HTable} to scan.
+   */
+  public void setHTable(HTable htable) {
+    this.recordReaderImpl.setHTable(htable);
+  }
+
+  /**
+   * @param inputColumns the columns to be placed in {@link Result}.
+   */
+  public void setInputColumns(final byte [][] inputColumns) {
+    this.recordReaderImpl.setInputColumns(inputColumns);
+  }
+
+  /**
+   * @param startRow the first row in the split
+   */
+  public void setStartRow(final byte [] startRow) {
+    this.recordReaderImpl.setStartRow(startRow);
+  }
+
+  /**
+   *
+   * @param endRow the last row in the split
+   */
+  public void setEndRow(final byte [] endRow) {
+    this.recordReaderImpl.setEndRow(endRow);
+  }
+
+  /**
+   * @param rowFilter the {@link Filter} to be used.
+   */
+  public void setRowFilter(Filter rowFilter) {
+    this.recordReaderImpl.setRowFilter(rowFilter);
+  }
+
+  public void close() {
+    this.recordReaderImpl.close();
+  }
+
+  /**
+   * @return ImmutableBytesWritable
+   *
+   * @see org.apache.hadoop.mapred.RecordReader#createKey()
+   */
+  public ImmutableBytesWritable createKey() {
+    return this.recordReaderImpl.createKey();
+  }
+
+  /**
+   * @return RowResult
+   *
+   * @see org.apache.hadoop.mapred.RecordReader#createValue()
+   */
+  public Result createValue() {
+    return this.recordReaderImpl.createValue();
+  }
+
+  public long getPos() {
+    
+    // This should be the ordinal tuple in the range;
+    // not clear how to calculate...
+    return this.recordReaderImpl.getPos();
+  }
+
+  public float getProgress() {
+    // Depends on the total number of tuples and getPos
+    return this.recordReaderImpl.getPos();
+  }
+
+  /**
+   * @param key HStoreKey as input key.
+   * @param value MapWritable as input value
+   * @return true if there was more data
+   * @throws IOException
+   */
+  public boolean next(ImmutableBytesWritable key, Result value)
+  throws IOException {
+    return this.recordReaderImpl.next(key, value);
+  }
+}
--- a/core/src/main/java/org/apache/hadoop/hbase/mapred/TableRecordReaderImpl.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapred/TableRecordReaderImpl.java
@ -0,0 +1,192 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapred;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.UnknownScannerException;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.Filter;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Writables;
+
+import org.apache.hadoop.util.StringUtils;
+
+
+/**
+ * Iterate over an HBase table data, return (Text, RowResult) pairs
+ */
+public class TableRecordReaderImpl {
+  static final Log LOG = LogFactory.getLog(TableRecordReaderImpl.class);
+
+  private byte [] startRow;
+  private byte [] endRow;
+  private byte [] lastRow;
+  private Filter trrRowFilter;
+  private ResultScanner scanner;
+  private HTable htable;
+  private byte [][] trrInputColumns;
+
+  /**
+   * Restart from survivable exceptions by creating a new scanner.
+   *
+   * @param firstRow
+   * @throws IOException
+   */
+  public void restart(byte[] firstRow) throws IOException {
+    if ((endRow != null) && (endRow.length > 0)) {
+      if (trrRowFilter != null) {
+        Scan scan = new Scan(firstRow, endRow);
+        scan.addColumns(trrInputColumns);
+        scan.setFilter(trrRowFilter);
+        this.scanner = this.htable.getScanner(scan);
+      } else {
+        LOG.debug("TIFB.restart, firstRow: " +
+            Bytes.toStringBinary(firstRow) + ", endRow: " +
+            Bytes.toStringBinary(endRow));
+        Scan scan = new Scan(firstRow, endRow);
+        scan.addColumns(trrInputColumns);
+        this.scanner = this.htable.getScanner(scan);
+      }
+    } else {
+      LOG.debug("TIFB.restart, firstRow: " +
+          Bytes.toStringBinary(firstRow) + ", no endRow");
+
+      Scan scan = new Scan(firstRow);
+      scan.addColumns(trrInputColumns);
+//      scan.setFilter(trrRowFilter);
+      this.scanner = this.htable.getScanner(scan);
+    }
+  }
+
+  /**
+   * Build the scanner. Not done in constructor to allow for extension.
+   *
+   * @throws IOException
+   */
+  public void init() throws IOException {
+    restart(startRow);
+  }
+
+  byte[] getStartRow() { 
+    return this.startRow;
+  }
+  /**
+   * @param htable the {@link HTable} to scan.
+   */
+  public void setHTable(HTable htable) {
+    this.htable = htable;
+  }
+
+  /**
+   * @param inputColumns the columns to be placed in {@link Result}.
+   */
+  public void setInputColumns(final byte [][] inputColumns) {
+    this.trrInputColumns = inputColumns;
+  }
+
+  /**
+   * @param startRow the first row in the split
+   */
+  public void setStartRow(final byte [] startRow) {
+    this.startRow = startRow;
+  }
+
+  /**
+   *
+   * @param endRow the last row in the split
+   */
+  public void setEndRow(final byte [] endRow) {
+    this.endRow = endRow;
+  }
+
+  /**
+   * @param rowFilter the {@link Filter} to be used.
+   */
+  public void setRowFilter(Filter rowFilter) {
+    this.trrRowFilter = rowFilter;
+  }
+
+  public void close() {
+    this.scanner.close();
+  }
+
+  /**
+   * @return ImmutableBytesWritable
+   *
+   * @see org.apache.hadoop.mapred.RecordReader#createKey()
+   */
+  public ImmutableBytesWritable createKey() {
+    return new ImmutableBytesWritable();
+  }
+
+  /**
+   * @return RowResult
+   *
+   * @see org.apache.hadoop.mapred.RecordReader#createValue()
+   */
+  public Result createValue() {
+    return new Result();
+  }
+
+  public long getPos() {
+    // This should be the ordinal tuple in the range;
+    // not clear how to calculate...
+    return 0;
+  }
+
+  public float getProgress() {
+    // Depends on the total number of tuples and getPos
+    return 0;
+  }
+
+  /**
+   * @param key HStoreKey as input key.
+   * @param value MapWritable as input value
+   * @return true if there was more data
+   * @throws IOException
+   */
+  public boolean next(ImmutableBytesWritable key, Result value)
+  throws IOException {
+    Result result;
+    try {
+      result = this.scanner.next();
+    } catch (UnknownScannerException e) {
+      LOG.debug("recovered from " + StringUtils.stringifyException(e));
+      restart(lastRow);
+      this.scanner.next();    // skip presumed already mapped row
+      result = this.scanner.next();
+    }
+
+    if (result != null && result.size() > 0) {
+      key.set(result.getRow());
+      lastRow = key.get();
+      Writables.copyWritable(result, value);
+      return true;
+    }
+    return false;
+  }
+}
--- a/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
@ -79,154 +79,7 @@ extends InputFormat<ImmutableBytesWritable, Result> {
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;

-  /**
-   * Iterate over an HBase table data, return (ImmutableBytesWritable, Result) 
-   * pairs.
-   */
-  protected class TableRecordReader
-  extends RecordReader<ImmutableBytesWritable, Result> {
-    
-    private ResultScanner scanner = null;
-    private Scan scan = null;
-    private HTable htable = null;
-    private byte[] lastRow = null;
-    private ImmutableBytesWritable key = null;
-    private Result value = null;
-
-    /**
-     * Restart from survivable exceptions by creating a new scanner.
-     *
-     * @param firstRow  The first row to start at.
-     * @throws IOException When restarting fails.
-     */
-    public void restart(byte[] firstRow) throws IOException {
-      Scan newScan = new Scan(scan);
-      newScan.setStartRow(firstRow);
-      this.scanner = this.htable.getScanner(newScan);
-    }
-
-    /**
-     * Build the scanner. Not done in constructor to allow for extension.
-     *
-     * @throws IOException When restarting the scan fails. 
-     */
-    public void init() throws IOException {
-      restart(scan.getStartRow());
-    }
-
-    /**
-     * Sets the HBase table.
-     * 
-     * @param htable  The {@link HTable} to scan.
-     */
-    public void setHTable(HTable htable) {
-      this.htable = htable;
-    }
-
-    /**
-     * Sets the scan defining the actual details like columns etc.
-     *  
-     * @param scan  The scan to set.
-     */
-    public void setScan(Scan scan) {
-      this.scan = scan;
-    }
-
-    /**
-     * Closes the split.
-     * 
-     * @see org.apache.hadoop.mapreduce.RecordReader#close()
-     */
-    @Override
-    public void close() {
-      this.scanner.close();
-    }
-
-    /**
-     * Returns the current key.
-     *  
-     * @return The current key.
-     * @throws IOException
-     * @throws InterruptedException When the job is aborted.
-     * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey()
-     */
-    @Override
-    public ImmutableBytesWritable getCurrentKey() throws IOException,
-        InterruptedException {
-      return key;
-    }
-
-    /**
-     * Returns the current value.
-     * 
-     * @return The current value.
-     * @throws IOException When the value is faulty.
-     * @throws InterruptedException When the job is aborted.
-     * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue()
-     */
-    @Override
-    public Result getCurrentValue() throws IOException, InterruptedException {
-      return value;
-    }
-
-    /**
-     * Initializes the reader.
-     * 
-     * @param inputsplit  The split to work with.
-     * @param context  The current task context.
-     * @throws IOException When setting up the reader fails.
-     * @throws InterruptedException When the job is aborted.
-     * @see org.apache.hadoop.mapreduce.RecordReader#initialize(
-     *   org.apache.hadoop.mapreduce.InputSplit, 
-     *   org.apache.hadoop.mapreduce.TaskAttemptContext)
-     */
-    @Override
-    public void initialize(InputSplit inputsplit,
-        TaskAttemptContext context) throws IOException,
-        InterruptedException {
-    }
-
-    /**
-     * Positions the record reader to the next record.
-     *  
-     * @return <code>true</code> if there was another record.
-     * @throws IOException When reading the record failed.
-     * @throws InterruptedException When the job was aborted.
-     * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()
-     */
-    @Override
-    public boolean nextKeyValue() throws IOException, InterruptedException {
-      if (key == null) key = new ImmutableBytesWritable();
-      if (value == null) value = new Result();
-      try {
-        value = this.scanner.next();
-      } catch (IOException e) {
-        LOG.debug("recovered from " + StringUtils.stringifyException(e));  
-        restart(lastRow);
-        scanner.next();    // skip presumed already mapped row
-        value = scanner.next();
-      }
-      if (value != null && value.size() > 0) {
-        key.set(value.getRow());
-        lastRow = key.get();
-        return true;
-      }
-      return false;
-    }
-
-    /**
-     * The current progress of the record reader through its data.
-     * 
-     * @return A number between 0.0 and 1.0, the fraction of the data read.
-     * @see org.apache.hadoop.mapreduce.RecordReader#getProgress()
-     */
-    @Override
-    public float getProgress() {
-      // Depends on the total number of tuples
-      return 0;
-    }
-  }
-
+ 
  /**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses
   * the default.
--- a/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableRecordReader.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableRecordReader.java
@ -0,0 +1,155 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapreduce;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+/**
+ * Iterate over an HBase table data, return (ImmutableBytesWritable, Result) 
+ * pairs.
+ */
+public class TableRecordReader
+extends RecordReader<ImmutableBytesWritable, Result> {
+  
+  private TableRecordReaderImpl recordReaderImpl = new TableRecordReaderImpl();
+  
+  /**
+   * Restart from survivable exceptions by creating a new scanner.
+   *
+   * @param firstRow  The first row to start at.
+   * @throws IOException When restarting fails.
+   */
+  public void restart(byte[] firstRow) throws IOException {
+    this.recordReaderImpl.restart(firstRow);
+  }
+
+  /**
+   * Build the scanner. Not done in constructor to allow for extension.
+   *
+   * @throws IOException When restarting the scan fails. 
+   */
+  public void init() throws IOException {
+    this.recordReaderImpl.init();
+  }
+
+  /**
+   * Sets the HBase table.
+   * 
+   * @param htable  The {@link HTable} to scan.
+   */
+  public void setHTable(HTable htable) {
+    this.recordReaderImpl.setHTable(htable);
+  }
+
+  /**
+   * Sets the scan defining the actual details like columns etc.
+   *  
+   * @param scan  The scan to set.
+   */
+  public void setScan(Scan scan) {
+    this.recordReaderImpl.setScan(scan);
+  }
+
+  /**
+   * Closes the split.
+   * 
+   * @see org.apache.hadoop.mapreduce.RecordReader#close()
+   */
+  @Override
+  public void close() {
+    this.recordReaderImpl.close();
+  }
+
+  /**
+   * Returns the current key.
+   *  
+   * @return The current key.
+   * @throws IOException
+   * @throws InterruptedException When the job is aborted.
+   * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey()
+   */
+  @Override
+  public ImmutableBytesWritable getCurrentKey() throws IOException,
+      InterruptedException {
+    return this.recordReaderImpl.getCurrentKey();
+  }
+
+  /**
+   * Returns the current value.
+   * 
+   * @return The current value.
+   * @throws IOException When the value is faulty.
+   * @throws InterruptedException When the job is aborted.
+   * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue()
+   */
+  @Override
+  public Result getCurrentValue() throws IOException, InterruptedException {
+    return this.recordReaderImpl.getCurrentValue();
+  }
+
+  /**
+   * Initializes the reader.
+   * 
+   * @param inputsplit  The split to work with.
+   * @param context  The current task context.
+   * @throws IOException When setting up the reader fails.
+   * @throws InterruptedException When the job is aborted.
+   * @see org.apache.hadoop.mapreduce.RecordReader#initialize(
+   *   org.apache.hadoop.mapreduce.InputSplit, 
+   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
+   */
+  @Override
+  public void initialize(InputSplit inputsplit,
+      TaskAttemptContext context) throws IOException,
+      InterruptedException {
+  }
+
+  /**
+   * Positions the record reader to the next record.
+   *  
+   * @return <code>true</code> if there was another record.
+   * @throws IOException When reading the record failed.
+   * @throws InterruptedException When the job was aborted.
+   * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()
+   */
+  @Override
+  public boolean nextKeyValue() throws IOException, InterruptedException {
+    return this.recordReaderImpl.nextKeyValue();
+  }
+
+  /**
+   * The current progress of the record reader through its data.
+   * 
+   * @return A number between 0.0 and 1.0, the fraction of the data read.
+   * @see org.apache.hadoop.mapreduce.RecordReader#getProgress()
+   */
+  @Override
+  public float getProgress() {
+    return this.recordReaderImpl.getProgress();
+  }
+}
--- a/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableRecordReaderImpl.java
+++ b/core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableRecordReaderImpl.java
@ -0,0 +1,157 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapreduce;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Iterate over an HBase table data, return (ImmutableBytesWritable, Result) 
+ * pairs.
+ */
+public class TableRecordReaderImpl {
+
+
+  static final Log LOG = LogFactory.getLog(TableRecordReader.class);
+
+  private ResultScanner scanner = null;
+  private Scan scan = null;
+  private HTable htable = null;
+  private byte[] lastRow = null;
+  private ImmutableBytesWritable key = null;
+  private Result value = null;
+
+  /**
+   * Restart from survivable exceptions by creating a new scanner.
+   *
+   * @param firstRow  The first row to start at.
+   * @throws IOException When restarting fails.
+   */
+  public void restart(byte[] firstRow) throws IOException {
+    Scan newScan = new Scan(scan);
+    newScan.setStartRow(firstRow);
+    this.scanner = this.htable.getScanner(newScan);
+  }
+
+  /**
+   * Build the scanner. Not done in constructor to allow for extension.
+   *
+   * @throws IOException When restarting the scan fails. 
+   */
+  public void init() throws IOException {
+    restart(scan.getStartRow());
+  }
+
+  /**
+   * Sets the HBase table.
+   * 
+   * @param htable  The {@link HTable} to scan.
+   */
+  public void setHTable(HTable htable) {
+    this.htable = htable;
+  }
+
+  /**
+   * Sets the scan defining the actual details like columns etc.
+   *  
+   * @param scan  The scan to set.
+   */
+  public void setScan(Scan scan) {
+    this.scan = scan;
+  }
+
+  /**
+   * Closes the split.
+   * 
+   * 
+   */
+  public void close() {
+    this.scanner.close();
+  }
+
+  /**
+   * Returns the current key.
+   *  
+   * @return The current key.
+   * @throws IOException
+   * @throws InterruptedException When the job is aborted.
+   */
+  public ImmutableBytesWritable getCurrentKey() throws IOException,
+      InterruptedException {
+    return key;
+  }
+
+  /**
+   * Returns the current value.
+   * 
+   * @return The current value.
+   * @throws IOException When the value is faulty.
+   * @throws InterruptedException When the job is aborted.
+   */
+  public Result getCurrentValue() throws IOException, InterruptedException {
+    return value;
+  }
+
+
+  /**
+   * Positions the record reader to the next record.
+   *  
+   * @return <code>true</code> if there was another record.
+   * @throws IOException When reading the record failed.
+   * @throws InterruptedException When the job was aborted.
+   */
+  public boolean nextKeyValue() throws IOException, InterruptedException {
+    if (key == null) key = new ImmutableBytesWritable();
+    if (value == null) value = new Result();
+    try {
+      value = this.scanner.next();
+    } catch (IOException e) {
+      LOG.debug("recovered from " + StringUtils.stringifyException(e));  
+      restart(lastRow);
+      scanner.next();    // skip presumed already mapped row
+      value = scanner.next();
+    }
+    if (value != null && value.size() > 0) {
+      key.set(value.getRow());
+      lastRow = key.get();
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * The current progress of the record reader through its data.
+   * 
+   * @return A number between 0.0 and 1.0, the fraction of the data read.
+   */
+  public float getProgress() {
+    // Depends on the total number of tuples
+    return 0;
+  }
+  
+}