HBASE-4079 HTableUtil - helper class for loading data (Doug Meil via Ted Yu)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1144581 13f79535-47bb-0310-9956-ffa450edef68
parent c08fe0f40e
commit a618fea162
CHANGES.txt
@@ -312,6 +312,7 @@ Release 0.91.0 - Unreleased
    HBASE-3240  Improve documentation of importtsv and bulk loads.
                (Aaron T. Myers via todd)
    HBASE-4054  Usability improvement to HTablePool (Daniel Iancu)
+   HBASE-4079  HTableUtil - helper class for loading data (Doug Meil via Ted Yu)

  TASKS
    HBASE-3559  Move report of split to master OFF the heartbeat channel
src/main/java/org/apache/hadoop/hbase/client/HTableUtil.java
@@ -0,0 +1,137 @@
/**
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hbase.HRegionLocation;

/**
 * Utility class for HTable.
 */
public class HTableUtil {

  private static final int INITIAL_LIST_SIZE = 250;

  /**
   * Processes a List of Puts and writes them to an HTable instance in RegionServer buckets via the htable.put method.
   * This utilizes the writeBuffer, so the flush frequency of the writeBuffer can be tuned via htable.setWriteBufferSize.
   * <br><br>
   * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs in each flush.
   * <br><br>
   * Assumption #1: Regions have been pre-created for the table. If they haven't, then all of the Puts will go to the same region,
   * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
   * <br>
   * Assumption #2: Row-keys are not monotonically increasing. See the Apache HBase book for an explanation of this problem.
   * <br>
   * Assumption #3: The input list of Puts is big enough to be useful (in the thousands or more). The intent of this
   * method is to process larger chunks of data.
   * <br>
   * Assumption #4: htable.setAutoFlush(false) has been set. This is a requirement to use the writeBuffer.
   * <br><br>
   * @param htable HTable instance for target HBase table
   * @param puts List of Put instances
   * @throws IOException if a remote or network exception occurs
   */
  public static void bucketRsPut(HTable htable, List<Put> puts) throws IOException {

    Map<String, List<Put>> putMap = createRsPutMap(htable, puts);
    for (List<Put> rsPuts : putMap.values()) {
      htable.put(rsPuts);
    }
    htable.flushCommits();
  }

  /**
   * Processes a List of Rows (Put, Delete) and writes them to an HTable instance in RegionServer buckets via the htable.batch method.
   * <br><br>
   * The benefit of submitting Rows in this manner is to minimize the number of RegionServer RPCs: this
   * produces one RPC of Rows per RegionServer.
   * <br><br>
   * Assumption #1: Regions have been pre-created for the table. If they haven't, then all of the Rows will go to the same region,
   * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
   * <br>
   * Assumption #2: Row-keys are not monotonically increasing. See the Apache HBase book for an explanation of this problem.
   * <br>
   * Assumption #3: The input list of Rows is big enough to be useful (in the thousands or more). The intent of this
   * method is to process larger chunks of data.
   * <br><br>
   * This method accepts a List of Row objects because the underlying .batch method accepts a List of Row objects.
   * <br><br>
   * @param htable HTable instance for target HBase table
   * @param rows List of Row instances
   * @throws IOException if a remote or network exception occurs
   */
  public static void bucketRsBatch(HTable htable, List<Row> rows) throws IOException {

    try {
      Map<String, List<Row>> rowMap = createRsRowMap(htable, rows);
      for (List<Row> rsRows : rowMap.values()) {
        htable.batch(rsRows);
      }
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  private static Map<String, List<Put>> createRsPutMap(HTable htable, List<Put> puts) throws IOException {

    // Bucket the Puts by the hostname of the RegionServer hosting each row's region.
    Map<String, List<Put>> putMap = new HashMap<String, List<Put>>();
    for (Put put : puts) {
      HRegionLocation rl = htable.getRegionLocation(put.getRow());
      String hostname = rl.getHostname();
      List<Put> recs = putMap.get(hostname);
      if (recs == null) {
        recs = new ArrayList<Put>(INITIAL_LIST_SIZE);
        putMap.put(hostname, recs);
      }
      recs.add(put);
    }
    return putMap;
  }

  private static Map<String, List<Row>> createRsRowMap(HTable htable, List<Row> rows) throws IOException {

    // Same bucketing as createRsPutMap, but for generic Rows (Put, Delete).
    Map<String, List<Row>> rowMap = new HashMap<String, List<Row>>();
    for (Row row : rows) {
      HRegionLocation rl = htable.getRegionLocation(row.getRow());
      String hostname = rl.getHostname();
      List<Row> recs = rowMap.get(hostname);
      if (recs == null) {
        recs = new ArrayList<Row>(INITIAL_LIST_SIZE);
        rowMap.put(hostname, recs);
      }
      recs.add(row);
    }
    return rowMap;
  }

}
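For orientation, here is a minimal caller sketch for bucketRsPut. It is not part of this commit: the table name "myTable", the family/qualifier "cf"/"q", and the 12MB buffer size are illustrative assumptions, and the table is assumed to be pre-split with non-monotonic row-keys, per the javadoc assumptions above.

// Hypothetical caller of HTableUtil.bucketRsPut -- a sketch, not part of this commit.
// Assumes "myTable" already exists, is pre-split, and has a family "cf" (illustrative names).
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTableUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class BucketPutExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HTable htable = new HTable(conf, "myTable");
    htable.setAutoFlush(false);                  // Assumption #4: writes must go through the writeBuffer
    htable.setWriteBufferSize(12 * 1024 * 1024); // illustrative 12MB buffer; tune per workload

    List<Put> puts = new ArrayList<Put>(10000);
    for (int i = 0; i < 10000; i++) {
      // Random UUID keys keep the row-keys non-monotonic (Assumption #2)
      Put put = new Put(Bytes.toBytes(UUID.randomUUID().toString()));
      put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("value-" + i));
      puts.add(put);
    }
    HTableUtil.bucketRsPut(htable, puts);        // one put() per RegionServer bucket, then flushCommits()
    htable.close();
  }
}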
src/test/java/org/apache/hadoop/hbase/client/TestHTableUtil.java
@@ -0,0 +1,125 @@
/*
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Tests for the {@link HTableUtil} class.
 */
public class TestHTableUtil {
  final Log LOG = LogFactory.getLog(getClass());
  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static byte [] ROW = Bytes.toBytes("testRow");
  private static byte [] FAMILY = Bytes.toBytes("testFamily");
  private static byte [] QUALIFIER = Bytes.toBytes("testQualifier");
  private static byte [] VALUE = Bytes.toBytes("testValue");

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    TEST_UTIL.startMiniCluster(3);
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  @Test
  public void testBucketPut() throws Exception {
    byte [] TABLE = Bytes.toBytes("testBucketPut");
    HTable ht = TEST_UTIL.createTable(TABLE, FAMILY);
    ht.setAutoFlush(false); // required by bucketRsPut: writes go through the writeBuffer

    List<Put> puts = new ArrayList<Put>();
    puts.add(createPut("row1"));
    puts.add(createPut("row2"));
    puts.add(createPut("row3"));
    puts.add(createPut("row4"));

    HTableUtil.bucketRsPut(ht, puts);

    // Scan the table back and verify that every Put landed.
    Scan scan = new Scan();
    scan.addColumn(FAMILY, QUALIFIER);
    int count = 0;
    for (Result result : ht.getScanner(scan)) {
      count++;
    }
    LOG.info("bucket put count=" + count);
    assertEquals(puts.size(), count);
  }

  private Put createPut(String row) {
    Put put = new Put(Bytes.toBytes(row));
    put.add(FAMILY, QUALIFIER, VALUE);
    return put;
  }

  @Test
  public void testBucketBatch() throws Exception {
    byte [] TABLE = Bytes.toBytes("testBucketBatch");
    HTable ht = TEST_UTIL.createTable(TABLE, FAMILY);

    List<Row> rows = new ArrayList<Row>();
    rows.add(createPut("row1"));
    rows.add(createPut("row2"));
    rows.add(createPut("row3"));
    rows.add(createPut("row4"));

    HTableUtil.bucketRsBatch(ht, rows);

    // Scan the table back and verify that every Row landed.
    Scan scan = new Scan();
    scan.addColumn(FAMILY, QUALIFIER);
    int count = 0;
    for (Result result : ht.getScanner(scan)) {
      count++;
    }
    LOG.info("bucket batch count=" + count);
    assertEquals(rows.size(), count);
  }

}
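The tests above exercise bucketRsBatch with Puts only; since the method accepts any Row, Deletes can ride in the same bucketed batch. A minimal hypothetical sketch (the row-keys, family, and qualifier below are illustrative assumptions, not part of this commit):

// Hypothetical helper showing a mixed Put/Delete batch via HTableUtil.bucketRsBatch.
// "key-a", "key-b", "cf", and "q" are illustrative names.
static void mixedBatch(HTable htable) throws IOException {
  List<Row> rows = new ArrayList<Row>();

  Put put = new Put(Bytes.toBytes("key-a"));
  put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
  rows.add(put);

  rows.add(new Delete(Bytes.toBytes("key-b"))); // a Delete travels in the same bucketed batch

  HTableUtil.bucketRsBatch(htable, rows); // buckets by RegionServer hostname, one batch() RPC per bucket
}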