HBASE-3623 Allow non-XML representable separator characters in the ImportTSV tool

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1080449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2011-03-11 04:44:31 +00:00
parent d86e7e1c18
commit 897890bca1
3 changed files with 135 additions and 1 deletions

View File

@ -147,6 +147,8 @@ Release 0.90.2 - Unreleased
HBASE-3542 MultiGet methods in Thrift
HBASE-3285 Hlog recovery takes too much time
HBASE-3586 Improve the selection of regions to balance
HBASE-3623 Allow non-XML representable separator characters in the ImportTSV tool
(Harsh J Chouraria via Stack)
Release 0.90.1 - Unreleased

View File

@ -19,6 +19,8 @@
*/
package org.apache.hadoop.hbase.mapreduce;
import org.apache.hadoop.hbase.util.Base64;
import java.io.IOException;
import java.util.ArrayList;
@ -203,8 +205,18 @@ public class ImportTsv {
@Override
protected void setup(Context context) {
Configuration conf = context.getConfiguration();
// If a custom separator has been used,
// decode it back from Base64 encoding.
String separator = conf.get(SEPARATOR_CONF_KEY);
if (separator == null) {
separator = DEFAULT_SEPARATOR;
} else {
separator = new String(Base64.decode(separator));
}
parser = new TsvParser(conf.get(COLUMNS_CONF_KEY),
conf.get(SEPARATOR_CONF_KEY, DEFAULT_SEPARATOR));
separator);
if (parser.getRowKeyColumnIndex() == -1) {
throw new RuntimeException("No row key column specified");
}
@ -271,6 +283,15 @@ public class ImportTsv {
*/
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
// Support non-XML supported characters
// by re-encoding the passed separator as a Base64 string.
String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
if (actualSeparator != null) {
conf.set(SEPARATOR_CONF_KEY, new String(
Base64.encodeBytes(actualSeparator.getBytes())));
}
String tableName = args[0];
Path inputDir = new Path(args[1]);
Job job = new Job(conf, NAME + "_" + tableName);

View File

@ -19,13 +19,33 @@
*/
package org.apache.hadoop.hbase.mapreduce;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.ArrayList;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser;
import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.BadTsvLineException;
import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.ParsedLine;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.client.Result;
import org.junit.Test;
import com.google.common.base.Joiner;
@ -35,6 +55,7 @@ import com.google.common.collect.Iterables;
import static org.junit.Assert.*;
public class TestImportTsv {
@Test
public void testTsvParserSpecParsing() {
TsvParser parser;
@ -125,4 +146,94 @@ public class TestImportTsv {
byte[] line = Bytes.toBytes("only_cola_data_and_no_row_key");
ParsedLine parsed = parser.parse(line, line.length);
}
@Test
public void testMROnTable()
throws Exception {
String TABLE_NAME = "TestTable";
String FAMILY = "FAM";
String INPUT_FILE = "InputFile.esv";
// Prepare the arguments required for the test.
String[] args = new String[] {
"-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
"-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
TABLE_NAME,
INPUT_FILE
};
// Cluster
HBaseTestingUtility htu1 = new HBaseTestingUtility();
MiniHBaseCluster cluster = htu1.startMiniCluster();
GenericOptionsParser opts = new GenericOptionsParser(cluster.getConfiguration(), args);
Configuration conf = opts.getConfiguration();
args = opts.getRemainingArgs();
try {
FileSystem fs = FileSystem.get(conf);
FSDataOutputStream op = fs.create(new Path(INPUT_FILE), true);
String line = "KEY\u001bVALUE1\u001bVALUE2\n";
op.write(line.getBytes(HConstants.UTF8_ENCODING));
op.close();
final byte[] FAM = Bytes.toBytes(FAMILY);
final byte[] TAB = Bytes.toBytes(TABLE_NAME);
final byte[] QA = Bytes.toBytes("A");
final byte[] QB = Bytes.toBytes("B");
HTableDescriptor desc = new HTableDescriptor(TAB);
desc.addFamily(new HColumnDescriptor(FAM));
new HBaseAdmin(conf).createTable(desc);
Job job = ImportTsv.createSubmittableJob(conf, args);
job.waitForCompletion(false);
assertTrue(job.isSuccessful());
HTable table = new HTable(new Configuration(conf), TAB);
boolean verified = false;
long pause = conf.getLong("hbase.client.pause", 5 * 1000);
int numRetries = conf.getInt("hbase.client.retries.number", 5);
for (int i = 0; i < numRetries; i++) {
try {
Scan scan = new Scan();
// Scan entire family.
scan.addFamily(FAM);
ResultScanner resScanner = table.getScanner(scan);
for (Result res : resScanner) {
assertTrue(res.size() == 2);
List<KeyValue> kvs = res.list();
assertEquals(toU8Str(kvs.get(0).getRow()),
toU8Str(Bytes.toBytes("KEY")));
assertEquals(toU8Str(kvs.get(1).getRow()),
toU8Str(Bytes.toBytes("KEY")));
assertEquals(toU8Str(kvs.get(0).getValue()),
toU8Str(Bytes.toBytes("VALUE1")));
assertEquals(toU8Str(kvs.get(1).getValue()),
toU8Str(Bytes.toBytes("VALUE2")));
// Only one result set is expected, so let it loop.
}
verified = true;
break;
} catch (NullPointerException e) {
// If here, a cell was empty. Presume its because updates came in
// after the scanner had been opened. Wait a while and retry.
}
try {
Thread.sleep(pause);
} catch (InterruptedException e) {
// continue
}
}
assertTrue(verified);
} finally {
cluster.shutdown();
}
}
public static String toU8Str(byte[] bytes) throws UnsupportedEncodingException {
return new String(bytes, HConstants.UTF8_ENCODING);
}
}