diff --git a/dev-support/findbugs-exclude.xml b/dev-support/findbugs-exclude.xml index 3162cb2710c..9813546c527 100644 --- a/dev-support/findbugs-exclude.xml +++ b/dev-support/findbugs-exclude.xml @@ -246,9 +246,4 @@ - - - - - diff --git a/hbase-assembly/pom.xml b/hbase-assembly/pom.xml index 298505b8c04..8961b2e31d1 100644 --- a/hbase-assembly/pom.xml +++ b/hbase-assembly/pom.xml @@ -37,22 +37,6 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - - - - banned-hbase-spark - - enforce - - - true - - - - org.apache.maven.plugins @@ -266,11 +250,6 @@ ${project.version} true - - org.apache.hbase - hbase-spark - ${project.version} - org.apache.httpcomponents httpclient @@ -279,12 +258,6 @@ org.apache.httpcomponents httpcore - - org.apache.hbase - hbase-spark-it - ${project.version} - test-jar - org.apache.hbase hbase-backup diff --git a/hbase-assembly/src/main/assembly/hadoop-two-compat.xml b/hbase-assembly/src/main/assembly/hadoop-two-compat.xml index a66237bf0d7..820430f4c75 100644 --- a/hbase-assembly/src/main/assembly/hadoop-two-compat.xml +++ b/hbase-assembly/src/main/assembly/hadoop-two-compat.xml @@ -46,7 +46,6 @@ org.apache.hbase:hbase-rsgroup org.apache.hbase:hbase-server org.apache.hbase:hbase-shell - org.apache.hbase:hbase-spark org.apache.hbase:hbase-thrift org.apache.hbase:hbase-external-blockcache org.apache.hbase:hbase-backup diff --git a/hbase-spark-it/pom.xml b/hbase-spark-it/pom.xml deleted file mode 100644 index faeaf23febd..00000000000 --- a/hbase-spark-it/pom.xml +++ /dev/null @@ -1,333 +0,0 @@ - - - - 4.0.0 - - hbase-build-configuration - org.apache.hbase - 2.0.0-beta-1.SNAPSHOT - ../hbase-build-configuration - - - hbase-spark-it - Apache HBase - Spark Integration Tests - Integration and System tests for HBase - - - - 1.6.0 - 2.10.4 - 2.10 - - **/Test*.java - **/IntegrationTest*.java - - 4g - - - - - - - org.apache.maven.plugins - maven-site-plugin - - true - - - - - org.apache.maven.plugins - maven-source-plugin - - - - maven-assembly-plugin - - true - - - - org.apache.maven.plugins - maven-failsafe-plugin - ${surefire.version} - - - org.apache.maven.surefire - surefire-junit4 - ${surefire.version} - - - - - ${integrationtest.include} - - - ${unittest.include} - **/*$* - - ${test.output.tofile} - false - false - - - - integration-test - integration-test - - integration-test - - - - verify - verify - - verify - - - - - - - - - - - org.apache.maven.plugins - maven-failsafe-plugin - - false - always - - 1800 - -enableassertions -Xmx${failsafe.Xmx} - -Djava.security.egd=file:/dev/./urandom -XX:+CMSClassUnloadingEnabled - -verbose:gc -XX:+PrintCommandLineFlags -XX:+PrintFlagsFinal - - - - org.apache.maven.plugins - maven-enforcer-plugin - - - - banned-jsr305 - - enforce - - - false - - - - - banned-hbase-spark - - enforce - - - true - - - - banned-scala - - enforce - - - true - - - - - - maven-dependency-plugin - - - create-mrapp-generated-classpath - generate-test-resources - - build-classpath - - - - ${project.build.directory}/test-classes/spark-generated-classpath - - - - - - - - - - - - org.apache.hbase - hbase-common - jar - - - org.apache.hbase - hbase-client - - - org.apache.hbase - hbase-server - jar - - - org.apache.hbase - hbase-server - test-jar - test - - - org.apache.hbase - hbase-spark - ${project.version} - - - org.apache.hbase - hbase-it - test-jar - - - org.apache.hbase - ${compat.module} - ${project.version} - - - org.apache.hbase - hbase-testing-util - - - commons-logging - commons-logging - - - commons-cli - commons-cli - - - org.apache.commons - 
commons-lang3 - - - - io.netty - netty - ${netty.hadoop.version} - test - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - - org.scala-lang - scala-library - - - - org.scala-lang - scalap - - - com.google.code.findbugs - jsr305 - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - test-jar - tests - test - - - junit - junit - test - - - - - - - skipIntegrationTests - - - skipIntegrationTests - - - - true - - - - - - - - org.apache.maven.plugins - maven-surefire-report-plugin - 2.7.2 - - - spark-integration-tests - - report-only - - - failsafe-report - - ${project.build.directory}/failsafe-reports - - - - - - - - - diff --git a/hbase-spark-it/src/test/java/org/apache/hadoop/hbase/spark/IntegrationTestSparkBulkLoad.java b/hbase-spark-it/src/test/java/org/apache/hadoop/hbase/spark/IntegrationTestSparkBulkLoad.java deleted file mode 100644 index b22c9ca4631..00000000000 --- a/hbase-spark-it/src/test/java/org/apache/hadoop/hbase/spark/IntegrationTestSparkBulkLoad.java +++ /dev/null @@ -1,663 +0,0 @@ -/** - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark; - -import com.google.common.collect.Sets; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.lang3.RandomStringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import org.apache.hadoop.hbase.Cell; -import org.apache.hadoop.hbase.CellUtil; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.IntegrationTestBase; -import org.apache.hadoop.hbase.IntegrationTestingUtility; -import org.apache.hadoop.hbase.TableName; - -import org.apache.hadoop.hbase.client.Admin; -import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.ConnectionFactory; -import org.apache.hadoop.hbase.client.Consistency; -import org.apache.hadoop.hbase.client.RegionLocator; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.Scan; -import org.apache.hadoop.hbase.client.Table; - -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; - -import org.apache.hadoop.hbase.mapreduce.IntegrationTestBulkLoad; -import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.util.RegionSplitter; - -import org.apache.hadoop.util.StringUtils; -import org.apache.hadoop.util.ToolRunner; -import org.apache.spark.SerializableWritable; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import org.apache.spark.Partitioner; - -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFlatMapFunction; -import org.apache.spark.api.java.function.VoidFunction; -import org.junit.Test; -import scala.Tuple2; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; - -/** - * Test Bulk Load and Spark on a distributed cluster. - * It starts an Spark job that creates linked chains. - * This test mimic {@link IntegrationTestBulkLoad} in mapreduce. - * - * Usage on cluster: - * First add hbase related jars and hbase-spark.jar into spark classpath. 
- * - * spark-submit --class org.apache.hadoop.hbase.spark.IntegrationTestSparkBulkLoad - * HBASE_HOME/lib/hbase-spark-it-XXX-tests.jar -m slowDeterministic -Dhbase.spark.bulkload.chainlength=300 - */ -public class IntegrationTestSparkBulkLoad extends IntegrationTestBase { - - private static final Log LOG = LogFactory.getLog(IntegrationTestSparkBulkLoad.class); - - // The number of partitions for random generated data - private static String BULKLOAD_PARTITIONS_NUM = "hbase.spark.bulkload.partitionsnum"; - private static int DEFAULT_BULKLOAD_PARTITIONS_NUM = 3; - - private static String BULKLOAD_CHAIN_LENGTH = "hbase.spark.bulkload.chainlength"; - private static int DEFAULT_BULKLOAD_CHAIN_LENGTH = 200000; - - private static String BULKLOAD_IMPORT_ROUNDS = "hbase.spark.bulkload.importround"; - private static int DEFAULT_BULKLOAD_IMPORT_ROUNDS = 1; - - private static String CURRENT_ROUND_NUM = "hbase.spark.bulkload.current.roundnum"; - - private static String NUM_REPLICA_COUNT_KEY = "hbase.spark.bulkload.replica.countkey"; - private static int DEFAULT_NUM_REPLICA_COUNT = 1; - - private static String BULKLOAD_TABLE_NAME = "hbase.spark.bulkload.tableName"; - private static String DEFAULT_BULKLOAD_TABLE_NAME = "IntegrationTestSparkBulkLoad"; - - private static String BULKLOAD_OUTPUT_PATH = "hbase.spark.bulkload.output.path"; - - private static final String OPT_LOAD = "load"; - private static final String OPT_CHECK = "check"; - - private boolean load = false; - private boolean check = false; - - private static final byte[] CHAIN_FAM = Bytes.toBytes("L"); - private static final byte[] SORT_FAM = Bytes.toBytes("S"); - private static final byte[] DATA_FAM = Bytes.toBytes("D"); - - /** - * Running spark job to load data into hbase table - */ - public void runLoad() throws Exception { - setupTable(); - int numImportRounds = getConf().getInt(BULKLOAD_IMPORT_ROUNDS, DEFAULT_BULKLOAD_IMPORT_ROUNDS); - LOG.info("Running load with numIterations:" + numImportRounds); - for (int i = 0; i < numImportRounds; i++) { - runLinkedListSparkJob(i); - } - } - - /** - * Running spark job to create LinkedList for testing - * @param iteration iteration th of this job - * @throws Exception - */ - public void runLinkedListSparkJob(int iteration) throws Exception { - String jobName = IntegrationTestSparkBulkLoad.class.getSimpleName() + " _load " + - EnvironmentEdgeManager.currentTime(); - - LOG.info("Running iteration " + iteration + "in Spark Job"); - - Path output = null; - if (conf.get(BULKLOAD_OUTPUT_PATH) == null) { - output = util.getDataTestDirOnTestFS(getTablename() + "-" + iteration); - } else { - output = new Path(conf.get(BULKLOAD_OUTPUT_PATH)); - } - - SparkConf sparkConf = new SparkConf().setAppName(jobName).setMaster("local"); - Configuration hbaseConf = new Configuration(getConf()); - hbaseConf.setInt(CURRENT_ROUND_NUM, iteration); - int partitionNum = hbaseConf.getInt(BULKLOAD_PARTITIONS_NUM, DEFAULT_BULKLOAD_PARTITIONS_NUM); - - - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, hbaseConf); - - - LOG.info("Partition RDD into " + partitionNum + " parts"); - List temp = new ArrayList<>(); - JavaRDD> rdd = jsc.parallelize(temp, partitionNum). 
- mapPartitionsWithIndex(new LinkedListCreationMapper(new SerializableWritable<>(hbaseConf)), false); - - hbaseContext.bulkLoad(rdd, getTablename(), new ListToKeyValueFunc(), output.toUri().getPath(), - new HashMap<>(), false, HConstants.DEFAULT_MAX_FILE_SIZE); - - try (Connection conn = ConnectionFactory.createConnection(conf); - Admin admin = conn.getAdmin(); - Table table = conn.getTable(getTablename()); - RegionLocator regionLocator = conn.getRegionLocator(getTablename())) { - // Create a new loader. - LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf); - - // Load the HFiles into table. - loader.doBulkLoad(output, admin, table, regionLocator); - } - - - // Delete the files. - util.getTestFileSystem().delete(output, true); - jsc.close(); - } - - // See mapreduce.IntegrationTestBulkLoad#LinkedListCreationMapper - // Used to generate test data - public static class LinkedListCreationMapper implements - Function2, Iterator>> { - - SerializableWritable swConfig = null; - private Random rand = new Random(); - - public LinkedListCreationMapper(SerializableWritable conf) { - this.swConfig = conf; - } - - @Override - public Iterator> call(Integer v1, Iterator v2) throws Exception { - Configuration config = (Configuration) swConfig.value(); - int partitionId = v1.intValue(); - LOG.info("Starting create List in Partition " + partitionId); - - int partitionNum = config.getInt(BULKLOAD_PARTITIONS_NUM, DEFAULT_BULKLOAD_PARTITIONS_NUM); - int chainLength = config.getInt(BULKLOAD_CHAIN_LENGTH, DEFAULT_BULKLOAD_CHAIN_LENGTH); - int iterationsNum = config.getInt(BULKLOAD_IMPORT_ROUNDS, DEFAULT_BULKLOAD_IMPORT_ROUNDS); - int iterationsCur = config.getInt(CURRENT_ROUND_NUM, 0); - List> res = new LinkedList<>(); - - - long tempId = partitionId + iterationsCur * partitionNum; - long totalPartitionNum = partitionNum * iterationsNum; - long chainId = Math.abs(rand.nextLong()); - chainId = chainId - (chainId % totalPartitionNum) + tempId; - - byte[] chainIdArray = Bytes.toBytes(chainId); - long currentRow = 0; - long nextRow = getNextRow(0, chainLength); - for(long i = 0; i < chainLength; i++) { - byte[] rk = Bytes.toBytes(currentRow); - // Insert record into a list - List tmp1 = Arrays.asList(rk, CHAIN_FAM, chainIdArray, Bytes.toBytes(nextRow)); - List tmp2 = Arrays.asList(rk, SORT_FAM, chainIdArray, Bytes.toBytes(i)); - List tmp3 = Arrays.asList(rk, DATA_FAM, chainIdArray, Bytes.toBytes( - RandomStringUtils.randomAlphabetic(50))); - res.add(tmp1); - res.add(tmp2); - res.add(tmp3); - - currentRow = nextRow; - nextRow = getNextRow(i+1, chainLength); - } - return res.iterator(); - } - - /** Returns a unique row id within this chain for this index */ - private long getNextRow(long index, long chainLength) { - long nextRow = Math.abs(new Random().nextLong()); - // use significant bits from the random number, but pad with index to ensure it is unique - // this also ensures that we do not reuse row = 0 - // row collisions from multiple mappers are fine, since we guarantee unique chainIds - nextRow = nextRow - (nextRow % chainLength) + index; - return nextRow; - } - } - - - - public static class ListToKeyValueFunc implements - Function, Pair> { - @Override - public Pair call(List v1) throws Exception { - if (v1 == null || v1.size() != 4) { - return null; - } - KeyFamilyQualifier kfq = new KeyFamilyQualifier(v1.get(0), v1.get(1), v1.get(2)); - - return new Pair<>(kfq, v1.get(3)); - } - } - - /** - * After adding data to the table start a mr job to - * @throws IOException - * @throws 
ClassNotFoundException - * @throws InterruptedException - */ - public void runCheck() throws Exception { - LOG.info("Running check"); - String jobName = IntegrationTestSparkBulkLoad.class.getSimpleName() + "_check" + EnvironmentEdgeManager.currentTime(); - - SparkConf sparkConf = new SparkConf().setAppName(jobName).setMaster("local"); - Configuration hbaseConf = new Configuration(getConf()); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, hbaseConf); - - Scan scan = new Scan(); - scan.addFamily(CHAIN_FAM); - scan.addFamily(SORT_FAM); - scan.setMaxVersions(1); - scan.setCacheBlocks(false); - scan.setBatch(1000); - int replicaCount = conf.getInt(NUM_REPLICA_COUNT_KEY, DEFAULT_NUM_REPLICA_COUNT); - if (replicaCount != DEFAULT_NUM_REPLICA_COUNT) { - scan.setConsistency(Consistency.TIMELINE); - } - - // 1. Using TableInputFormat to get data from HBase table - // 2. Mimic LinkedListCheckingMapper in mapreduce.IntegrationTestBulkLoad - // 3. Sort LinkKey by its order ID - // 4. Group LinkKey if they have same chainId, and repartition RDD by NaturalKeyPartitioner - // 5. Check LinkList in each Partition using LinkedListCheckingFlatMapFunc - hbaseContext.hbaseRDD(getTablename(), scan).flatMapToPair(new LinkedListCheckingFlatMapFunc()) - .sortByKey() - .combineByKey(new createCombinerFunc(), new mergeValueFunc(), new mergeCombinersFunc(), - new NaturalKeyPartitioner(new SerializableWritable<>(hbaseConf))) - .foreach(new LinkedListCheckingForeachFunc(new SerializableWritable<>(hbaseConf))); - jsc.close(); - } - - private void runCheckWithRetry() throws Exception { - try { - runCheck(); - } catch (Throwable t) { - LOG.warn("Received " + StringUtils.stringifyException(t)); - LOG.warn("Running the check MR Job again to see whether an ephemeral problem or not"); - runCheck(); - throw t; // we should still fail the test even if second retry succeeds - } - // everything green - } - - /** - * PairFlatMapFunction used to transfer to Tuple - */ - public static class LinkedListCheckingFlatMapFunc implements - PairFlatMapFunction, SparkLinkKey, SparkLinkChain> { - - @Override - public Iterable> call(Tuple2 v) - throws Exception { - Result value = v._2(); - long longRk = Bytes.toLong(value.getRow()); - List> list = new LinkedList<>(); - - for (Map.Entry entry : value.getFamilyMap(CHAIN_FAM).entrySet()) { - long chainId = Bytes.toLong(entry.getKey()); - long next = Bytes.toLong(entry.getValue()); - Cell c = value.getColumnCells(SORT_FAM, entry.getKey()).get(0); - long order = Bytes.toLong(CellUtil.cloneValue(c)); - Tuple2 tuple2 = - new Tuple2<>(new SparkLinkKey(chainId, order), new SparkLinkChain(longRk, next)); - list.add(tuple2); - } - return list; - } - } - - public static class createCombinerFunc implements - Function> { - @Override - public List call(SparkLinkChain v1) throws Exception { - List list = new LinkedList<>(); - list.add(v1); - return list; - } - } - - public static class mergeValueFunc implements - Function2, SparkLinkChain, List> { - @Override - public List call(List v1, SparkLinkChain v2) throws Exception { - if (v1 == null) - v1 = new LinkedList<>(); - v1.add(v2); - return v1; - } - } - - public static class mergeCombinersFunc implements - Function2, List, List> { - @Override - public List call(List v1, List v2) throws Exception { - v1.addAll(v2); - return v1; - } - } - - /** - * Class to figure out what partition to send a link in the chain to. This is based upon - * the linkKey's ChainId. 
- */ - public static class NaturalKeyPartitioner extends Partitioner { - - private int numPartions = 0; - public NaturalKeyPartitioner(SerializableWritable swConf) { - Configuration hbaseConf = (Configuration) swConf.value(); - numPartions = hbaseConf.getInt(BULKLOAD_PARTITIONS_NUM, DEFAULT_BULKLOAD_PARTITIONS_NUM); - - } - - @Override - public int numPartitions() { - return numPartions; - } - - @Override - public int getPartition(Object key) { - if (!(key instanceof SparkLinkKey)) - return -1; - int hash = ((SparkLinkKey) key).getChainId().hashCode(); - return Math.abs(hash % numPartions); - - } - } - - /** - * Sort all LinkChain for one LinkKey, and test List - */ - public static class LinkedListCheckingForeachFunc - implements VoidFunction>> { - - private SerializableWritable swConf = null; - - public LinkedListCheckingForeachFunc(SerializableWritable conf) { - swConf = conf; - } - - @Override - public void call(Tuple2> v1) throws Exception { - long next = -1L; - long prev = -1L; - long count = 0L; - - SparkLinkKey key = v1._1(); - List values = v1._2(); - - for (SparkLinkChain lc : values) { - - if (next == -1) { - if (lc.getRk() != 0L) { - String msg = "Chains should all start at rk 0, but read rk " + lc.getRk() - + ". Chain:" + key.getChainId() + ", order:" + key.getOrder(); - throw new RuntimeException(msg); - } - next = lc.getNext(); - } else { - if (next != lc.getRk()) { - String msg = "Missing a link in the chain. Prev rk " + prev + " was, expecting " - + next + " but got " + lc.getRk() + ". Chain:" + key.getChainId() - + ", order:" + key.getOrder(); - throw new RuntimeException(msg); - } - prev = lc.getRk(); - next = lc.getNext(); - } - count++; - } - Configuration hbaseConf = (Configuration) swConf.value(); - int expectedChainLen = hbaseConf.getInt(BULKLOAD_CHAIN_LENGTH, DEFAULT_BULKLOAD_CHAIN_LENGTH); - if (count != expectedChainLen) { - String msg = "Chain wasn't the correct length. Expected " + expectedChainLen + " got " - + count + ". Chain:" + key.getChainId() + ", order:" + key.getOrder(); - throw new RuntimeException(msg); - } - } - } - - /** - * Writable class used as the key to group links in the linked list. - * - * Used as the key emited from a pass over the table. - */ - public static class SparkLinkKey implements java.io.Serializable, Comparable { - - private Long chainId; - private Long order; - - public Long getOrder() { - return order; - } - - public Long getChainId() { - return chainId; - } - - public SparkLinkKey(long chainId, long order) { - this.chainId = chainId; - this.order = order; - } - - @Override - public int hashCode() { - return this.getChainId().hashCode(); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof SparkLinkKey)) - return false; - SparkLinkKey otherKey = (SparkLinkKey) other; - return this.getChainId().equals(otherKey.getChainId()); - } - - @Override - public int compareTo(SparkLinkKey other) { - int res = getChainId().compareTo(other.getChainId()); - if (res == 0) - res= getOrder().compareTo(other.getOrder()); - return res; - } - } - - /** - * Writable used as the value emitted from a pass over the hbase table. 
- */ - public static class SparkLinkChain implements java.io.Serializable, Comparable{ - - public Long getNext() { - return next; - } - - public Long getRk() { - return rk; - } - - - public SparkLinkChain(Long rk, Long next) { - this.rk = rk; - this.next = next; - } - - private Long rk; - private Long next; - - @Override - public int compareTo(SparkLinkChain linkChain) { - int res = getRk().compareTo(linkChain.getRk()); - if (res == 0) { - res = getNext().compareTo(linkChain.getNext()); - } - return res; - } - - @Override - public int hashCode() { - return getRk().hashCode() ^ getNext().hashCode(); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof SparkLinkChain)) - return false; - SparkLinkChain otherKey = (SparkLinkChain) other; - return this.getRk().equals(otherKey.getRk()) && this.getNext().equals(otherKey.getNext()); - } - } - - - /** - * Allow the scan to go to replica, this would not affect the runCheck() - * Since data are BulkLoaded from HFile into table - * @throws IOException - * @throws InterruptedException - */ - private void installSlowingCoproc() throws IOException, InterruptedException { - int replicaCount = conf.getInt(NUM_REPLICA_COUNT_KEY, DEFAULT_NUM_REPLICA_COUNT); - if (replicaCount == DEFAULT_NUM_REPLICA_COUNT) return; - - TableName t = getTablename(); - Admin admin = util.getAdmin(); - HTableDescriptor desc = admin.getTableDescriptor(t); - desc.addCoprocessor(IntegrationTestBulkLoad.SlowMeCoproScanOperations.class.getName()); - HBaseTestingUtility.modifyTableSync(admin, desc); - } - - @Test - public void testBulkLoad() throws Exception { - runLoad(); - installSlowingCoproc(); - runCheckWithRetry(); - } - - - private byte[][] getSplits(int numRegions) { - RegionSplitter.UniformSplit split = new RegionSplitter.UniformSplit(); - split.setFirstRow(Bytes.toBytes(0L)); - split.setLastRow(Bytes.toBytes(Long.MAX_VALUE)); - return split.split(numRegions); - } - - private void setupTable() throws IOException, InterruptedException { - if (util.getAdmin().tableExists(getTablename())) { - util.deleteTable(getTablename()); - } - - util.createTable( - getTablename(), - new byte[][]{CHAIN_FAM, SORT_FAM, DATA_FAM}, - getSplits(16) - ); - - int replicaCount = conf.getInt(NUM_REPLICA_COUNT_KEY, DEFAULT_NUM_REPLICA_COUNT); - if (replicaCount == DEFAULT_NUM_REPLICA_COUNT) return; - - TableName t = getTablename(); - HBaseTestingUtility.setReplicas(util.getAdmin(), t, replicaCount); - } - - @Override - public void setUpCluster() throws Exception { - util = getTestingUtil(getConf()); - util.initializeCluster(1); - int replicaCount = getConf().getInt(NUM_REPLICA_COUNT_KEY, DEFAULT_NUM_REPLICA_COUNT); - if (LOG.isDebugEnabled() && replicaCount != DEFAULT_NUM_REPLICA_COUNT) { - LOG.debug("Region Replicas enabled: " + replicaCount); - } - - // Scale this up on a real cluster - if (util.isDistributedCluster()) { - util.getConfiguration().setIfUnset(BULKLOAD_PARTITIONS_NUM, String.valueOf(DEFAULT_BULKLOAD_PARTITIONS_NUM)); - util.getConfiguration().setIfUnset(BULKLOAD_IMPORT_ROUNDS, "1"); - } else { - util.startMiniMapReduceCluster(); - } - } - - @Override - protected void addOptions() { - super.addOptions(); - super.addOptNoArg(OPT_CHECK, "Run check only"); - super.addOptNoArg(OPT_LOAD, "Run load only"); - } - - @Override - protected void processOptions(CommandLine cmd) { - super.processOptions(cmd); - check = cmd.hasOption(OPT_CHECK); - load = cmd.hasOption(OPT_LOAD); - } - - @Override - public int runTestFromCommandLine() throws Exception { - if (load) { - 
runLoad(); - } else if (check) { - installSlowingCoproc(); - runCheckWithRetry(); - } else { - testBulkLoad(); - } - return 0; - } - - @Override - public TableName getTablename() { - return getTableName(getConf()); - } - - public static TableName getTableName(Configuration conf) { - return TableName.valueOf(conf.get(BULKLOAD_TABLE_NAME, DEFAULT_BULKLOAD_TABLE_NAME)); - } - - @Override - protected Set getColumnFamilies() { - return Sets.newHashSet(Bytes.toString(CHAIN_FAM) , Bytes.toString(DATA_FAM), - Bytes.toString(SORT_FAM)); - } - - public static void main(String[] args) throws Exception { - Configuration conf = HBaseConfiguration.create(); - IntegrationTestingUtility.setUseDistributedCluster(conf); - int status = ToolRunner.run(conf, new IntegrationTestSparkBulkLoad(), args); - System.exit(status); - } -} diff --git a/hbase-spark/README.txt b/hbase-spark/README.txt deleted file mode 100644 index 7fad811b211..00000000000 --- a/hbase-spark/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -ON PROTOBUFS -This maven module has core protobuf definition files ('.protos') used by hbase -Spark that ship with hbase core including tests. - -Generation of java files from protobuf .proto files included here is done as -part of the build. diff --git a/hbase-spark/pom.xml b/hbase-spark/pom.xml deleted file mode 100644 index b3e74ea9a18..00000000000 --- a/hbase-spark/pom.xml +++ /dev/null @@ -1,702 +0,0 @@ - - - - 4.0.0 - - hbase-build-configuration - org.apache.hbase - 2.0.0-beta-1.SNAPSHOT - ../hbase-build-configuration - - hbase-spark - Apache HBase - Spark - - 1.6.0 - 2.10.4 - 2.10 - ${project.basedir}/.. - - - - org.apache.hbase.thirdparty - hbase-shaded-miscellaneous - - - - javax.servlet - javax.servlet-api - test - - - - org.scala-lang - scala-library - ${scala.version} - provided - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - - org.scala-lang - scala-library - - - - org.scala-lang - scalap - - - com.google.code.findbugs - jsr305 - - - - - com.google.code.findbugs - jsr305 - 1.3.9 - provided - true - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - test-jar - tests - test - - - junit - junit - test - - - org.scalatest - scalatest_${scala.binary.version} - 2.2.4 - test - - - org.scalamock - scalamock-scalatest-support_${scala.binary.version} - 3.1.4 - test - - - com.fasterxml.jackson.module - jackson-module-scala_${scala.binary.version} - ${jackson.version} - - - org.scala-lang - scala-library - - - org.scala-lang - scala-reflect - - - - - org.apache.hadoop - hadoop-client - ${hadoop-two.version} - - - log4j - log4j - - - javax.servlet - servlet-api - - - javax.servlet.jsp - jsp-api - - - org.jruby - jruby-complete - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.apache.hadoop - hadoop-common - ${hadoop-two.version} - - - log4j - log4j - - - javax.servlet - servlet-api - - - javax.servlet.jsp - jsp-api - - - org.jruby - jruby-complete - - - org.jboss.netty - netty - - - io.netty - netty - - - com.google.code.findbugs - jsr305 - - - - - org.apache.hadoop - hadoop-common - ${hadoop-two.version} - test-jar - test - - - log4j - log4j - - - javax.servlet - servlet-api - - - javax.servlet.jsp - jsp-api - - - org.jruby - jruby-complete - - - org.jboss.netty - netty - - - io.netty - netty - - - com.google.code.findbugs - jsr305 
- - - - - org.apache.hadoop - hadoop-hdfs - ${hadoop-two.version} - test-jar - test - - - log4j - log4j - - - javax.servlet - servlet-api - - - javax.servlet.jsp - jsp-api - - - org.jruby - jruby-complete - - - org.jboss.netty - netty - - - io.netty - netty - - - xerces - xercesImpl - - - - - org.apache.hbase - hbase-client - - - log4j - log4j - - - org.apache.thrift - thrift - - - org.slf4j - slf4j-log4j12 - - - org.mortbay.jetty - jsp-2.1 - - - org.mortbay.jetty - jsp-api-2.1 - - - org.mortbay.jetty - servlet-api-2.5 - - - com.sun.jersey - jersey-core - - - com.sun.jersey - jersey-json - - - com.sun.jersey - jersey-server - - - org.mortbay.jetty - jetty - - - org.mortbay.jetty - jetty-util - - - tomcat - jasper-runtime - - - tomcat - jasper-compiler - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.apache.hbase - hbase-protocol - ${project.version} - - - org.apache.hbase - hbase-protocol-shaded - ${project.version} - - - org.apache.hbase - hbase-annotations - ${project.version} - - - org.apache.hbase - hbase-common - ${project.version} - - - org.apache.hbase - hbase-annotations - ${project.version} - test-jar - test - - - org.apache.hbase - hbase-hadoop-compat - ${project.version} - test - test-jar - - - log4j - log4j - - - org.apache.thrift - thrift - - - org.slf4j - slf4j-log4j12 - - - org.mortbay.jetty - jsp-2.1 - - - org.mortbay.jetty - jsp-api-2.1 - - - org.mortbay.jetty - servlet-api-2.5 - - - com.sun.jersey - jersey-core - - - com.sun.jersey - jersey-json - - - com.sun.jersey - jersey-server - - - org.mortbay.jetty - jetty - - - org.mortbay.jetty - jetty-util - - - tomcat - jasper-runtime - - - tomcat - jasper-compiler - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.apache.hbase - hbase-hadoop2-compat - ${project.version} - test - test-jar - - - log4j - log4j - - - org.apache.thrift - thrift - - - org.slf4j - slf4j-log4j12 - - - org.mortbay.jetty - jsp-2.1 - - - org.mortbay.jetty - jsp-api-2.1 - - - org.mortbay.jetty - servlet-api-2.5 - - - com.sun.jersey - jersey-core - - - com.sun.jersey - jersey-json - - - com.sun.jersey - jersey-server - - - org.mortbay.jetty - jetty - - - org.mortbay.jetty - jetty-util - - - tomcat - jasper-runtime - - - tomcat - jasper-compiler - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.apache.hbase - hbase-server - ${project.version} - - - org.apache.hbase - hbase-server - ${project.version} - test - test-jar - - - org.apache.hbase - hbase-mapreduce - - - com.google.protobuf - protobuf-java - - - commons-io - commons-io - - - org.apache.hadoop - hadoop-mapreduce-client-jobclient - test-jar - test - - - org.apache.avro - avro - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - net.alchim31.maven - scala-maven-plugin - 3.2.0 - - ${project.build.sourceEncoding} - ${scala.version} - - -feature - - - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - WDF TestSuite.txt - false - - org.apache.hadoop.hbase.shaded. 
- - - - - test - test - - test - - - - -Xmx1536m -XX:ReservedCodeCacheSize=512m - - false - - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-source - validate - - add-source - - - - src/main/scala - - - - - add-test-source - validate - - add-test-source - - - - src/test/scala - - - - - - - org.xolstice.maven.plugins - protobuf-maven-plugin - - - compile-protoc - generate-sources - - compile - - - - - - org.apache.maven.plugins - maven-enforcer-plugin - - - - banned-jsr305 - - enforce - - - false - - - - - banned-scala - - enforce - - - true - - - - - - - - - - skipSparkTests - - - skipSparkTests - - - - true - true - true - - - - diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/SparkSQLPushDownFilter.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/SparkSQLPushDownFilter.java deleted file mode 100644 index a94c59c4c0f..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/SparkSQLPushDownFilter.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hbase.Cell; -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.exceptions.DeserializationException; -import org.apache.hadoop.hbase.filter.FilterBase; -import org.apache.hadoop.hbase.spark.datasources.BytesEncoder; -import org.apache.hadoop.hbase.spark.datasources.JavaBytesEncoder; -import org.apache.hadoop.hbase.spark.protobuf.generated.SparkFilterProtos; -import org.apache.hadoop.hbase.util.ByteStringer; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.sql.datasources.hbase.Field; -import scala.collection.mutable.MutableList; - - -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.ByteString; - -/** - * This filter will push down all qualifier logic given to us - * by SparkSQL so that we have make the filters at the region server level - * and avoid sending the data back to the client to be filtered. 
- */ -@InterfaceAudience.Private -public class SparkSQLPushDownFilter extends FilterBase{ - protected static final Log log = LogFactory.getLog(SparkSQLPushDownFilter.class); - - //The following values are populated with protobuffer - DynamicLogicExpression dynamicLogicExpression; - byte[][] valueFromQueryArray; - HashMap> - currentCellToColumnIndexMap; - - //The following values are transient - HashMap columnToCurrentRowValueMap = null; - - static final byte[] rowKeyFamily = new byte[0]; - static final byte[] rowKeyQualifier = Bytes.toBytes("key"); - - String encoderClassName; - - public SparkSQLPushDownFilter(DynamicLogicExpression dynamicLogicExpression, - byte[][] valueFromQueryArray, - HashMap> - currentCellToColumnIndexMap, String encoderClassName) { - this.dynamicLogicExpression = dynamicLogicExpression; - this.valueFromQueryArray = valueFromQueryArray; - this.currentCellToColumnIndexMap = currentCellToColumnIndexMap; - this.encoderClassName = encoderClassName; - } - - public SparkSQLPushDownFilter(DynamicLogicExpression dynamicLogicExpression, - byte[][] valueFromQueryArray, - MutableList fields, String encoderClassName) { - this.dynamicLogicExpression = dynamicLogicExpression; - this.valueFromQueryArray = valueFromQueryArray; - this.encoderClassName = encoderClassName; - - //generate family qualifier to index mapping - this.currentCellToColumnIndexMap = - new HashMap<>(); - - for (int i = 0; i < fields.size(); i++) { - Field field = fields.apply(i); - - byte[] cfBytes = field.cfBytes(); - ByteArrayComparable familyByteComparable = - new ByteArrayComparable(cfBytes, 0, cfBytes.length); - - HashMap qualifierIndexMap = - currentCellToColumnIndexMap.get(familyByteComparable); - - if (qualifierIndexMap == null) { - qualifierIndexMap = new HashMap<>(); - currentCellToColumnIndexMap.put(familyByteComparable, qualifierIndexMap); - } - byte[] qBytes = field.colBytes(); - ByteArrayComparable qualifierByteComparable = - new ByteArrayComparable(qBytes, 0, qBytes.length); - - qualifierIndexMap.put(qualifierByteComparable, field.colName()); - } - } - - @Override - public ReturnCode filterCell(final Cell c) throws IOException { - - //If the map RowValueMap is empty then we need to populate - // the row key - if (columnToCurrentRowValueMap == null) { - columnToCurrentRowValueMap = new HashMap<>(); - HashMap qualifierColumnMap = - currentCellToColumnIndexMap.get( - new ByteArrayComparable(rowKeyFamily, 0, rowKeyFamily.length)); - - if (qualifierColumnMap != null) { - String rowKeyColumnName = - qualifierColumnMap.get( - new ByteArrayComparable(rowKeyQualifier, 0, - rowKeyQualifier.length)); - //Make sure that the rowKey is part of the where clause - if (rowKeyColumnName != null) { - columnToCurrentRowValueMap.put(rowKeyColumnName, - new ByteArrayComparable(c.getRowArray(), - c.getRowOffset(), c.getRowLength())); - } - } - } - - //Always populate the column value into the RowValueMap - ByteArrayComparable currentFamilyByteComparable = - new ByteArrayComparable(c.getFamilyArray(), - c.getFamilyOffset(), - c.getFamilyLength()); - - HashMap qualifierColumnMap = - currentCellToColumnIndexMap.get( - currentFamilyByteComparable); - - if (qualifierColumnMap != null) { - - String columnName = - qualifierColumnMap.get( - new ByteArrayComparable(c.getQualifierArray(), - c.getQualifierOffset(), - c.getQualifierLength())); - - if (columnName != null) { - columnToCurrentRowValueMap.put(columnName, - new ByteArrayComparable(c.getValueArray(), - c.getValueOffset(), c.getValueLength())); - } - } - - return 
ReturnCode.INCLUDE; - } - - - @Override - public boolean filterRow() throws IOException { - - try { - boolean result = - dynamicLogicExpression.execute(columnToCurrentRowValueMap, - valueFromQueryArray); - columnToCurrentRowValueMap = null; - return !result; - } catch (Throwable e) { - log.error("Error running dynamic logic on row", e); - } - return false; - } - - - /** - * @param pbBytes A pb serialized instance - * @return An instance of SparkSQLPushDownFilter - * @throws org.apache.hadoop.hbase.exceptions.DeserializationException - */ - @SuppressWarnings("unused") - public static SparkSQLPushDownFilter parseFrom(final byte[] pbBytes) - throws DeserializationException { - - SparkFilterProtos.SQLPredicatePushDownFilter proto; - try { - proto = SparkFilterProtos.SQLPredicatePushDownFilter.parseFrom(pbBytes); - } catch (InvalidProtocolBufferException e) { - throw new DeserializationException(e); - } - - String encoder = proto.getEncoderClassName(); - BytesEncoder enc = JavaBytesEncoder.create(encoder); - - //Load DynamicLogicExpression - DynamicLogicExpression dynamicLogicExpression = - DynamicLogicExpressionBuilder.build(proto.getDynamicLogicExpression(), enc); - - //Load valuesFromQuery - final List valueFromQueryArrayList = proto.getValueFromQueryArrayList(); - byte[][] valueFromQueryArray = new byte[valueFromQueryArrayList.size()][]; - for (int i = 0; i < valueFromQueryArrayList.size(); i++) { - valueFromQueryArray[i] = valueFromQueryArrayList.get(i).toByteArray(); - } - - //Load mapping from HBase family/qualifier to Spark SQL columnName - HashMap> - currentCellToColumnIndexMap = new HashMap<>(); - - for (SparkFilterProtos.SQLPredicatePushDownCellToColumnMapping - sqlPredicatePushDownCellToColumnMapping : - proto.getCellToColumnMappingList()) { - - byte[] familyArray = - sqlPredicatePushDownCellToColumnMapping.getColumnFamily().toByteArray(); - ByteArrayComparable familyByteComparable = - new ByteArrayComparable(familyArray, 0, familyArray.length); - HashMap qualifierMap = - currentCellToColumnIndexMap.get(familyByteComparable); - - if (qualifierMap == null) { - qualifierMap = new HashMap<>(); - currentCellToColumnIndexMap.put(familyByteComparable, qualifierMap); - } - byte[] qualifierArray = - sqlPredicatePushDownCellToColumnMapping.getQualifier().toByteArray(); - - ByteArrayComparable qualifierByteComparable = - new ByteArrayComparable(qualifierArray, 0 ,qualifierArray.length); - - qualifierMap.put(qualifierByteComparable, - sqlPredicatePushDownCellToColumnMapping.getColumnName()); - } - - return new SparkSQLPushDownFilter(dynamicLogicExpression, - valueFromQueryArray, currentCellToColumnIndexMap, encoder); - } - - /** - * @return The filter serialized using pb - */ - public byte[] toByteArray() { - - SparkFilterProtos.SQLPredicatePushDownFilter.Builder builder = - SparkFilterProtos.SQLPredicatePushDownFilter.newBuilder(); - - SparkFilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder columnMappingBuilder = - SparkFilterProtos.SQLPredicatePushDownCellToColumnMapping.newBuilder(); - - builder.setDynamicLogicExpression(dynamicLogicExpression.toExpressionString()); - for (byte[] valueFromQuery: valueFromQueryArray) { - builder.addValueFromQueryArray(ByteStringer.wrap(valueFromQuery)); - } - - for (Map.Entry> - familyEntry : currentCellToColumnIndexMap.entrySet()) { - for (Map.Entry qualifierEntry : - familyEntry.getValue().entrySet()) { - columnMappingBuilder.setColumnFamily( - ByteStringer.wrap(familyEntry.getKey().bytes())); - columnMappingBuilder.setQualifier( - 
ByteStringer.wrap(qualifierEntry.getKey().bytes())); - columnMappingBuilder.setColumnName(qualifierEntry.getValue()); - builder.addCellToColumnMapping(columnMappingBuilder.build()); - } - } - builder.setEncoderClassName(encoderClassName); - - - return builder.build().toByteArray(); - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkDeleteExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkDeleteExample.java deleted file mode 100644 index 97cf1404210..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkDeleteExample.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Delete; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -import java.util.ArrayList; -import java.util.List; - -/** - * This is a simple example of deleting records in HBase - * with the bulkDelete function. 
- */ -final public class JavaHBaseBulkDeleteExample { - - private JavaHBaseBulkDeleteExample() {} - - public static void main(String[] args) { - if (args.length < 1) { - System.out.println("JavaHBaseBulkDeleteExample {tableName}"); - return; - } - - String tableName = args[0]; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkDeleteExample " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - List list = new ArrayList<>(5); - list.add(Bytes.toBytes("1")); - list.add(Bytes.toBytes("2")); - list.add(Bytes.toBytes("3")); - list.add(Bytes.toBytes("4")); - list.add(Bytes.toBytes("5")); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.bulkDelete(rdd, - TableName.valueOf(tableName), new DeleteFunction(), 4); - } finally { - jsc.stop(); - } - - } - - public static class DeleteFunction implements Function { - private static final long serialVersionUID = 1L; - public Delete call(byte[] v) throws Exception { - return new Delete(v); - } - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkGetExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkGetExample.java deleted file mode 100644 index cb9e0c7fdea..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkGetExample.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.Cell; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -/** - * This is a simple example of getting records in HBase - * with the bulkGet function. 
- */ -final public class JavaHBaseBulkGetExample { - - private JavaHBaseBulkGetExample() {} - - public static void main(String[] args) { - if (args.length < 1) { - System.out.println("JavaHBaseBulkGetExample {tableName}"); - return; - } - - String tableName = args[0]; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - List list = new ArrayList<>(5); - list.add(Bytes.toBytes("1")); - list.add(Bytes.toBytes("2")); - list.add(Bytes.toBytes("3")); - list.add(Bytes.toBytes("4")); - list.add(Bytes.toBytes("5")); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.bulkGet(TableName.valueOf(tableName), 2, rdd, new GetFunction(), - new ResultFunction()); - } finally { - jsc.stop(); - } - } - - public static class GetFunction implements Function { - - private static final long serialVersionUID = 1L; - - public Get call(byte[] v) throws Exception { - return new Get(v); - } - } - - public static class ResultFunction implements Function { - - private static final long serialVersionUID = 1L; - - public String call(Result result) throws Exception { - Iterator it = result.listCells().iterator(); - StringBuilder b = new StringBuilder(); - - b.append(Bytes.toString(result.getRow())).append(":"); - - while (it.hasNext()) { - Cell cell = it.next(); - String q = Bytes.toString(cell.getQualifierArray()); - if (q.equals("counter")) { - b.append("(") - .append(Bytes.toString(cell.getQualifierArray())) - .append(",") - .append(Bytes.toLong(cell.getValueArray())) - .append(")"); - } else { - b.append("(") - .append(Bytes.toString(cell.getQualifierArray())) - .append(",") - .append(Bytes.toString(cell.getValueArray())) - .append(")"); - } - } - return b.toString(); - } - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkLoadExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkLoadExample.java deleted file mode 100644 index 54ff658ca91..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkLoadExample.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.spark.FamilyHFileWriteOptions; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.spark.KeyFamilyQualifier; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.Function; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -/** - * Run this example using command below: - * - * SPARK_HOME/bin/spark-submit --master local[2] --class org.apache.hadoop.hbase.spark.example.hbasecontext.JavaHBaseBulkLoadExample - * path/to/hbase-spark.jar {path/to/output/HFiles} - * - * This example will output put hfiles in {path/to/output/HFiles}, and user can run - * 'hbase org.apache.hadoop.hbase.tool.LoadIncrementalHFiles' to load the HFiles into table to verify this example. - */ -final public class JavaHBaseBulkLoadExample { - private JavaHBaseBulkLoadExample() {} - - public static void main(String[] args) { - if (args.length < 1) { - System.out.println("JavaHBaseBulkLoadExample " + "{outputPath}"); - return; - } - - String tableName = "bulkload-table-test"; - String columnFamily1 = "f1"; - String columnFamily2 = "f2"; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkLoadExample " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - List list= new ArrayList(); - // row1 - list.add("1," + columnFamily1 + ",b,1"); - // row3 - list.add("3," + columnFamily1 + ",a,2"); - list.add("3," + columnFamily1 + ",b,1"); - list.add("3," + columnFamily2 + ",a,1"); - /* row2 */ - list.add("2," + columnFamily2 + ",a,3"); - list.add("2," + columnFamily2 + ",b,3"); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = HBaseConfiguration.create(); - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - - - hbaseContext.bulkLoad(rdd, TableName.valueOf(tableName),new BulkLoadFunction(), args[0], - new HashMap(), false, HConstants.DEFAULT_MAX_FILE_SIZE); - } finally { - jsc.stop(); - } - } - - public static class BulkLoadFunction implements Function> { - - @Override - public Pair call(String v1) throws Exception { - if (v1 == null) - return null; - String[] strs = v1.split(","); - if(strs.length != 4) - return null; - KeyFamilyQualifier kfq = new KeyFamilyQualifier(Bytes.toBytes(strs[0]), Bytes.toBytes(strs[1]), - Bytes.toBytes(strs[2])); - return new Pair(kfq, Bytes.toBytes(strs[3])); - } - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkPutExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkPutExample.java deleted file mode 100644 index 5821c1957a4..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkPutExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -/** - * This is a simple example of putting records in HBase - * with the bulkPut function. - */ -final public class JavaHBaseBulkPutExample { - - private JavaHBaseBulkPutExample() {} - - public static void main(String[] args) { - if (args.length < 2) { - System.out.println("JavaHBaseBulkPutExample " + - "{tableName} {columnFamily}"); - return; - } - - String tableName = args[0]; - String columnFamily = args[1]; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkPutExample " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - List list = new ArrayList<>(5); - list.add("1," + columnFamily + ",a,1"); - list.add("2," + columnFamily + ",a,2"); - list.add("3," + columnFamily + ",a,3"); - list.add("4," + columnFamily + ",a,4"); - list.add("5," + columnFamily + ",a,5"); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.bulkPut(rdd, - TableName.valueOf(tableName), - new PutFunction()); - } finally { - jsc.stop(); - } - } - - public static class PutFunction implements Function { - - private static final long serialVersionUID = 1L; - - public Put call(String v) throws Exception { - String[] cells = v.split(","); - Put put = new Put(Bytes.toBytes(cells[0])); - - put.addColumn(Bytes.toBytes(cells[1]), Bytes.toBytes(cells[2]), - Bytes.toBytes(cells[3])); - return put; - } - - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseDistributedScan.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseDistributedScan.java deleted file mode 100644 index 8d4c0929ef0..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseDistributedScan.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.Scan; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import org.apache.spark.api.java.function.Function; -import scala.Tuple2; - -/** - * This is a simple example of scanning records from HBase - * with the hbaseRDD function. - */ -final public class JavaHBaseDistributedScan { - - private JavaHBaseDistributedScan() {} - - public static void main(String[] args) { - if (args.length < 1) { - System.out.println("JavaHBaseDistributedScan {tableName}"); - return; - } - - String tableName = args[0]; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseDistributedScan " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - Scan scan = new Scan(); - scan.setCaching(100); - - JavaRDD> javaRdd = - hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan); - - List results = javaRdd.map(new ScanConvertFunction()).collect(); - - System.out.println("Result Size: " + results.size()); - } finally { - jsc.stop(); - } - } - - private static class ScanConvertFunction implements - Function, String> { - @Override - public String call(Tuple2 v1) throws Exception { - return Bytes.toString(v1._1().copyBytes()); - } - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseMapGetPutExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseMapGetPutExample.java deleted file mode 100644 index 316f8a101a3..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseMapGetPutExample.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
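The same distributed scan expressed in Scala. hbaseRDD returning an RDD of (ImmutableBytesWritable, Result) pairs matches how DefaultSource.buildScan consumes it later in this patch; only the table name and caching value are placeholders.

    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.Scan
    import org.apache.hadoop.hbase.spark.HBaseContext
    import org.apache.hadoop.hbase.util.Bytes

    // sc is an existing SparkContext
    val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())
    val scan = new Scan()
    scan.setCaching(100)

    val rdd = hbaseContext.hbaseRDD(TableName.valueOf("scanTable"), scan)
    val rowKeys = rdd.map { case (key, _) => Bytes.toString(key.copyBytes()) }.collect()
    println("Result Size: " + rowKeys.length)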
- */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.BufferedMutator; -import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.Table; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; - -import scala.Tuple2; - -/** - * This is a simple example of using the foreachPartition - * method with a HBase connection - */ -final public class JavaHBaseMapGetPutExample { - - private JavaHBaseMapGetPutExample() {} - - public static void main(String[] args) { - if (args.length < 1) { - System.out.println("JavaHBaseBulkGetExample {tableName}"); - return; - } - - final String tableName = args[0]; - - SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample " + tableName); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - List list = new ArrayList<>(5); - list.add(Bytes.toBytes("1")); - list.add(Bytes.toBytes("2")); - list.add(Bytes.toBytes("3")); - list.add(Bytes.toBytes("4")); - list.add(Bytes.toBytes("5")); - - JavaRDD rdd = jsc.parallelize(list); - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.foreachPartition(rdd, - new VoidFunction, Connection>>() { - public void call(Tuple2, Connection> t) - throws Exception { - Table table = t._2().getTable(TableName.valueOf(tableName)); - BufferedMutator mutator = t._2().getBufferedMutator(TableName.valueOf(tableName)); - - while (t._1().hasNext()) { - byte[] b = t._1().next(); - Result r = table.get(new Get(b)); - if (r.getExists()) { - mutator.mutate(new Put(b)); - } - } - - mutator.flush(); - mutator.close(); - table.close(); - } - }); - } finally { - jsc.stop(); - } - } - - public static class GetFunction implements Function { - private static final long serialVersionUID = 1L; - public Get call(byte[] v) throws Exception { - return new Get(v); - } - } -} diff --git a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseStreamingBulkPutExample.java b/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseStreamingBulkPutExample.java deleted file mode 100644 index cd4cf24f15e..00000000000 --- a/hbase-spark/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseStreamingBulkPutExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
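A Scala sketch of the same foreachPartition pattern, assuming the (Iterator[T], Connection) => Unit shape of the Scala API behind the Java wrapper above. It checks Result.isEmpty rather than getExists, and writes a placeholder column so the Put is not empty.

    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.{Connection, Get, Put}
    import org.apache.hadoop.hbase.spark.HBaseContext
    import org.apache.hadoop.hbase.util.Bytes

    // sc is an existing SparkContext
    val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())
    val rdd = sc.parallelize(Seq("1", "2", "3", "4", "5").map(s => Bytes.toBytes(s)))

    hbaseContext.foreachPartition[Array[Byte]](rdd, (it: Iterator[Array[Byte]], conn: Connection) => {
      val table = conn.getTable(TableName.valueOf("getPutTable"))
      val mutator = conn.getBufferedMutator(TableName.valueOf("getPutTable"))
      it.foreach { rowKey =>
        val result = table.get(new Get(rowKey))
        if (!result.isEmpty) {
          // mark rows that already exist (placeholder family/qualifier)
          mutator.mutate(new Put(rowKey).addColumn(
            Bytes.toBytes("cf1"), Bytes.toBytes("seen"), Bytes.toBytes(true)))
        }
      }
      mutator.flush()
      mutator.close()
      table.close()
    })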
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.spark.JavaHBaseContext; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; - -/** - * This is a simple example of BulkPut with Spark Streaming - */ -final public class JavaHBaseStreamingBulkPutExample { - - private JavaHBaseStreamingBulkPutExample() {} - - public static void main(String[] args) { - if (args.length < 4) { - System.out.println("JavaHBaseBulkPutExample " + - "{host} {port} {tableName}"); - return; - } - - String host = args[0]; - String port = args[1]; - String tableName = args[2]; - - SparkConf sparkConf = - new SparkConf().setAppName("JavaHBaseStreamingBulkPutExample " + - tableName + ":" + port + ":" + tableName); - - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - try { - JavaStreamingContext jssc = - new JavaStreamingContext(jsc, new Duration(1000)); - - JavaReceiverInputDStream javaDstream = - jssc.socketTextStream(host, Integer.parseInt(port)); - - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.streamBulkPut(javaDstream, - TableName.valueOf(tableName), - new PutFunction()); - } finally { - jsc.stop(); - } - } - - public static class PutFunction implements Function { - - private static final long serialVersionUID = 1L; - - public Put call(String v) throws Exception { - String[] part = v.split(","); - Put put = new Put(Bytes.toBytes(part[0])); - - put.addColumn(Bytes.toBytes(part[1]), - Bytes.toBytes(part[2]), - Bytes.toBytes(part[3])); - return put; - } - - } -} diff --git a/hbase-spark/src/main/protobuf/SparkFilter.proto b/hbase-spark/src/main/protobuf/SparkFilter.proto deleted file mode 100644 index e16c5517882..00000000000 --- a/hbase-spark/src/main/protobuf/SparkFilter.proto +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
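And the streaming variant in Scala, assuming a streamBulkPut counterpart that takes a DStream, a table name and a record-to-Put function; host, port and names are placeholders.

    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.Put
    import org.apache.hadoop.hbase.spark.HBaseContext
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    // sc is an existing SparkContext
    val ssc = new StreamingContext(sc, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)
    val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())

    hbaseContext.streamBulkPut[String](lines, TableName.valueOf("streamingPutTable"), (v: String) => {
      val parts = v.split(",")
      val put = new Put(Bytes.toBytes(parts(0)))
      put.addColumn(Bytes.toBytes(parts(1)), Bytes.toBytes(parts(2)), Bytes.toBytes(parts(3)))
      put
    })

    ssc.start()
    ssc.awaitTermination()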
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// This file contains protocol buffers that are used for Spark filters -// over in the hbase-spark module -package hbase.pb; - -option java_package = "org.apache.hadoop.hbase.spark.protobuf.generated"; -option java_outer_classname = "SparkFilterProtos"; -option java_generic_services = true; -option java_generate_equals_and_hash = true; -option optimize_for = SPEED; - -message SQLPredicatePushDownCellToColumnMapping { - required bytes column_family = 1; - required bytes qualifier = 2; - required string column_name = 3; -} - -message SQLPredicatePushDownFilter { - required string dynamic_logic_expression = 1; - repeated bytes value_from_query_array = 2; - repeated SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; - optional string encoderClassName = 4; -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/BulkLoadPartitioner.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/BulkLoadPartitioner.scala deleted file mode 100644 index 9442c50b74d..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/BulkLoadPartitioner.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
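The generated SparkFilterProtos classes are assembled with the usual protobuf builder pattern. A hedged sketch follows; the expression string is a placeholder for whatever DynamicLogicExpression.toExpressionString produces, and whether ByteString comes from the plain or a shaded com.google.protobuf package depends on how the module compiles this proto.

    import org.apache.hadoop.hbase.spark.protobuf.generated.SparkFilterProtos
    import org.apache.hadoop.hbase.util.Bytes
    import com.google.protobuf.ByteString

    val mapping = SparkFilterProtos.SQLPredicatePushDownCellToColumnMapping.newBuilder()
      .setColumnFamily(ByteString.copyFrom(Bytes.toBytes("cf1")))
      .setQualifier(ByteString.copyFrom(Bytes.toBytes("a")))
      .setColumnName("col1")
      .build()

    val filterProto = SparkFilterProtos.SQLPredicatePushDownFilter.newBuilder()
      .setDynamicLogicExpression("col1 EQUAL 0")   // placeholder expression string
      .addValueFromQueryArray(ByteString.copyFrom(Bytes.toBytes(1)))
      .addCellToColumnMapping(mapping)
      .build()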
- */ - -package org.apache.hadoop.hbase.spark - -import java.util -import java.util.Comparator - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Partitioner - -/** - * A Partitioner implementation that will separate records to different - * HBase Regions based on region splits - * - * @param startKeys The start keys for the given table - */ -@InterfaceAudience.Public -class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) - extends Partitioner { - // when table not exist, startKeys = Byte[0][] - override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length - - override def getPartition(key: Any): Int = { - - val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { - override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { - Bytes.compareTo(o1, o2) - } - } - - val rowKey:Array[Byte] = - key match { - case qualifier: KeyFamilyQualifier => - qualifier.rowKey - case wrapper: ByteArrayWrapper => - wrapper.value - case _ => - key.asInstanceOf[Array[Byte]] - } - var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) - if (partition < 0) - partition = partition * -1 + -2 - if (partition < 0) - partition = 0 - partition - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayComparable.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayComparable.scala deleted file mode 100644 index 2d0be38c174..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayComparable.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
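A small worked example of the partitioner removed above, using two regions whose start keys are the empty byte array and "m":

    import org.apache.hadoop.hbase.spark.BulkLoadPartitioner
    import org.apache.hadoop.hbase.util.Bytes

    val startKeys: Array[Array[Byte]] = Array(Array.empty[Byte], Bytes.toBytes("m"))
    val partitioner = new BulkLoadPartitioner(startKeys)

    partitioner.numPartitions                      // 2
    partitioner.getPartition(Bytes.toBytes("a"))   // 0 -> "a" sorts before "m", first region
    partitioner.getPartition(Bytes.toBytes("m"))   // 1 -> exact match on the second start key
    partitioner.getPartition(Bytes.toBytes("q"))   // 1 -> "q" falls in the second region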
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes - -@InterfaceAudience.Public -class ByteArrayComparable(val bytes:Array[Byte], val offset:Int = 0, var length:Int = -1) - extends Comparable[ByteArrayComparable] { - - if (length == -1) { - length = bytes.length - } - - override def compareTo(o: ByteArrayComparable): Int = { - Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length) - } - - override def hashCode(): Int = { - Bytes.hashCode(bytes, offset, length) - } - - override def equals (obj: Any): Boolean = { - obj match { - case b: ByteArrayComparable => - Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length) - case _ => - false - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayWrapper.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayWrapper.scala deleted file mode 100644 index 738fa45a654..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayWrapper.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark - -import java.io.Serializable - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes - -/** - * This is a wrapper over a byte array so it can work as - * a key in a hashMap - * - * @param value The Byte Array value - */ -@InterfaceAudience.Public -class ByteArrayWrapper (var value:Array[Byte]) - extends Comparable[ByteArrayWrapper] with Serializable { - override def compareTo(valueOther: ByteArrayWrapper): Int = { - Bytes.compareTo(value,valueOther.value) - } - override def equals(o2: Any): Boolean = { - o2 match { - case wrapper: ByteArrayWrapper => - Bytes.equals(value, wrapper.value) - case _ => - false - } - } - override def hashCode():Int = { - Bytes.hashCode(value) - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ColumnFamilyQualifierMapKeyWrapper.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ColumnFamilyQualifierMapKeyWrapper.scala deleted file mode 100644 index 3037001b647..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/ColumnFamilyQualifierMapKeyWrapper.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes - -/** - * A wrapper class that will allow both columnFamily and qualifier to - * be the key of a hashMap. Also allow for finding the value in a hashmap - * with out cloning the HBase value from the HBase Cell object - * @param columnFamily ColumnFamily byte array - * @param columnFamilyOffSet Offset of columnFamily value in the array - * @param columnFamilyLength Length of the columnFamily value in the columnFamily array - * @param qualifier Qualifier byte array - * @param qualifierOffSet Offset of qualifier value in the array - * @param qualifierLength Length of the qualifier value with in the array - */ -@InterfaceAudience.Public -class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte], - val columnFamilyOffSet:Int, - val columnFamilyLength:Int, - val qualifier:Array[Byte], - val qualifierOffSet:Int, - val qualifierLength:Int) - extends Serializable{ - - override def equals(other:Any): Boolean = { - val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper] - - Bytes.compareTo(columnFamily, - columnFamilyOffSet, - columnFamilyLength, - otherWrapper.columnFamily, - otherWrapper.columnFamilyOffSet, - otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier, - qualifierOffSet, - qualifierLength, - otherWrapper.qualifier, - otherWrapper.qualifierOffSet, - otherWrapper.qualifierLength) == 0 - } - - override def hashCode():Int = { - Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) + - Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength) - } - - def cloneColumnFamily():Array[Byte] = { - val resultArray = new Array[Byte](columnFamilyLength) - System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength) - resultArray - } - - def cloneQualifier():Array[Byte] = { - val resultArray = new Array[Byte](qualifierLength) - System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength) - resultArray - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DefaultSource.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DefaultSource.scala deleted file mode 100644 index a488dd333a1..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DefaultSource.scala +++ /dev/null @@ -1,1224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
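The wrapper above is what lets cell coordinates act as map keys without copying bytes out of each Cell; a small sketch of that usage:

    import org.apache.hadoop.hbase.spark.ColumnFamilyQualifierMapKeyWrapper
    import org.apache.hadoop.hbase.util.Bytes
    import scala.collection.mutable

    val family = Bytes.toBytes("cf1")
    val qualifier = Bytes.toBytes("a")

    val columnIndex = new mutable.HashMap[ColumnFamilyQualifierMapKeyWrapper, Int]
    columnIndex.put(new ColumnFamilyQualifierMapKeyWrapper(
      family, 0, family.length, qualifier, 0, qualifier.length), 0)

    // A wrapper built later from a Cell's backing array (with that cell's offsets and
    // lengths) hashes and compares equal, so the lookup works without cloning bytes.
    val sameKey = new ColumnFamilyQualifierMapKeyWrapper(
      family, 0, family.length, qualifier, 0, qualifier.length)
    columnIndex(sameKey)   // 0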
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.util -import java.util.concurrent.ConcurrentLinkedQueue - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.mapred.TableOutputFormat -import org.apache.hadoop.hbase.spark.datasources._ -import org.apache.hadoop.hbase.types._ -import org.apache.hadoop.hbase.util.{Bytes, PositionedByteRange, SimplePositionedMutableByteRange} -import org.apache.hadoop.hbase.HBaseConfiguration -import org.apache.hadoop.hbase.HTableDescriptor -import org.apache.hadoop.hbase.HColumnDescriptor -import org.apache.hadoop.hbase.TableName -import org.apache.hadoop.hbase.CellUtil -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.Logging -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.datasources.hbase.{Utils, Field, HBaseTableCatalog} -import org.apache.spark.sql.{DataFrame, SaveMode, Row, SQLContext} -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types._ - -import scala.collection.mutable - -/** - * DefaultSource for integration with Spark's dataframe datasources. - * This class will produce a relationProvider based on input given to it from spark - * - * This class needs to stay in the current package 'org.apache.hadoop.hbase.spark' - * for Spark to match the hbase data source name. - * - * In all this DefaultSource support the following datasource functionality - * - Scan range pruning through filter push down logic based on rowKeys - * - Filter push down logic on HBase Cells - * - Qualifier filtering based on columns used in the SparkSQL statement - * - Type conversions of basic SQL types. All conversions will be - * Through the HBase Bytes object commands. 
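For orientation, this is roughly how the data source described above is driven from Spark SQL; the catalog JSON layout and the HBaseTableCatalog option keys follow that class's conventions and should be read as an illustrative assumption here.

    import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog

    // sqlContext is an existing SQLContext
    val catalog =
      """{
        |  "table":{"namespace":"default", "name":"table1"},
        |  "rowkey":"key",
        |  "columns":{
        |    "col0":{"cf":"rowkey", "col":"key", "type":"string"},
        |    "col1":{"cf":"cf1", "col":"a", "type":"int"}
        |  }
        |}""".stripMargin

    // Read: a filter such as col1 > 1 is a candidate for the pushdown logic below.
    val df = sqlContext.read
      .options(Map(HBaseTableCatalog.tableCatalog -> catalog))
      .format("org.apache.hadoop.hbase.spark")
      .load()
    df.filter(df("col1") > 1).select(df("col0")).show()

    // Write: HBaseTableCatalog.newTable asks createTable() below to pre-split the table.
    df.write
      .options(Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.hadoop.hbase.spark")
      .save()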
- */ -@InterfaceAudience.Private -class DefaultSource extends RelationProvider with CreatableRelationProvider with Logging { - /** - * Is given input from SparkSQL to construct a BaseRelation - * - * @param sqlContext SparkSQL context - * @param parameters Parameters given to us from SparkSQL - * @return A BaseRelation Object - */ - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String]): - BaseRelation = { - new HBaseRelation(parameters, None)(sqlContext) - } - - - override def createRelation( - sqlContext: SQLContext, - mode: SaveMode, - parameters: Map[String, String], - data: DataFrame): BaseRelation = { - val relation = HBaseRelation(parameters, Some(data.schema))(sqlContext) - relation.createTable() - relation.insert(data, false) - relation - } -} - -/** - * Implementation of Spark BaseRelation that will build up our scan logic - * , do the scan pruning, filter push down, and value conversions - * - * @param sqlContext SparkSQL context - */ -@InterfaceAudience.Private -case class HBaseRelation ( - @transient parameters: Map[String, String], - userSpecifiedSchema: Option[StructType] - )(@transient val sqlContext: SQLContext) - extends BaseRelation with PrunedFilteredScan with InsertableRelation with Logging { - val timestamp = parameters.get(HBaseSparkConf.TIMESTAMP).map(_.toLong) - val minTimestamp = parameters.get(HBaseSparkConf.TIMERANGE_START).map(_.toLong) - val maxTimestamp = parameters.get(HBaseSparkConf.TIMERANGE_END).map(_.toLong) - val maxVersions = parameters.get(HBaseSparkConf.MAX_VERSIONS).map(_.toInt) - val encoderClsName = parameters.get(HBaseSparkConf.QUERY_ENCODER).getOrElse(HBaseSparkConf.DEFAULT_QUERY_ENCODER) - - @transient val encoder = JavaBytesEncoder.create(encoderClsName) - - val catalog = HBaseTableCatalog(parameters) - def tableName = catalog.name - val configResources = parameters.getOrElse(HBaseSparkConf.HBASE_CONFIG_LOCATION, "") - val useHBaseContext = parameters.get(HBaseSparkConf.USE_HBASECONTEXT).map(_.toBoolean).getOrElse(HBaseSparkConf.DEFAULT_USE_HBASECONTEXT) - val usePushDownColumnFilter = parameters.get(HBaseSparkConf.PUSHDOWN_COLUMN_FILTER) - .map(_.toBoolean).getOrElse(HBaseSparkConf.DEFAULT_PUSHDOWN_COLUMN_FILTER) - - // The user supplied per table parameter will overwrite global ones in SparkConf - val blockCacheEnable = parameters.get(HBaseSparkConf.QUERY_CACHEBLOCKS).map(_.toBoolean) - .getOrElse( - sqlContext.sparkContext.getConf.getBoolean( - HBaseSparkConf.QUERY_CACHEBLOCKS, HBaseSparkConf.DEFAULT_QUERY_CACHEBLOCKS)) - val cacheSize = parameters.get(HBaseSparkConf.QUERY_CACHEDROWS).map(_.toInt) - .getOrElse( - sqlContext.sparkContext.getConf.getInt( - HBaseSparkConf.QUERY_CACHEDROWS, -1)) - val batchNum = parameters.get(HBaseSparkConf.QUERY_BATCHSIZE).map(_.toInt) - .getOrElse(sqlContext.sparkContext.getConf.getInt( - HBaseSparkConf.QUERY_BATCHSIZE, -1)) - - val bulkGetSize = parameters.get(HBaseSparkConf.BULKGET_SIZE).map(_.toInt) - .getOrElse(sqlContext.sparkContext.getConf.getInt( - HBaseSparkConf.BULKGET_SIZE, HBaseSparkConf.DEFAULT_BULKGET_SIZE)) - - //create or get latest HBaseContext - val hbaseContext:HBaseContext = if (useHBaseContext) { - LatestHBaseContextCache.latest - } else { - val config = HBaseConfiguration.create() - configResources.split(",").foreach( r => config.addResource(r)) - new HBaseContext(sqlContext.sparkContext, config) - } - - val wrappedConf = new SerializableConfiguration(hbaseContext.config) - def hbaseConf = wrappedConf.value - - /** - * Generates a Spark SQL schema 
objeparametersct so Spark SQL knows what is being - * provided by this BaseRelation - * - * @return schema generated from the SCHEMA_COLUMNS_MAPPING_KEY value - */ - override val schema: StructType = userSpecifiedSchema.getOrElse(catalog.toDataType) - - - - def createTable() { - val numReg = parameters.get(HBaseTableCatalog.newTable).map(x => x.toInt).getOrElse(0) - val startKey = Bytes.toBytes( - parameters.get(HBaseTableCatalog.regionStart) - .getOrElse(HBaseTableCatalog.defaultRegionStart)) - val endKey = Bytes.toBytes( - parameters.get(HBaseTableCatalog.regionEnd) - .getOrElse(HBaseTableCatalog.defaultRegionEnd)) - if (numReg > 3) { - val tName = TableName.valueOf(catalog.name) - val cfs = catalog.getColumnFamilies - - val connection = HBaseConnectionCache.getConnection(hbaseConf) - // Initialize hBase table if necessary - val admin = connection.getAdmin - try { - if (!admin.isTableAvailable(tName)) { - val tableDesc = new HTableDescriptor(tName) - cfs.foreach { x => - val cf = new HColumnDescriptor(x.getBytes()) - logDebug(s"add family $x to ${catalog.name}") - tableDesc.addFamily(cf) - } - val splitKeys = Bytes.split(startKey, endKey, numReg); - admin.createTable(tableDesc, splitKeys) - - } - }finally { - admin.close() - connection.close() - } - } else { - logInfo( - s"""${HBaseTableCatalog.newTable} - |is not defined or no larger than 3, skip the create table""".stripMargin) - } - } - - /** - * - * @param data - * @param overwrite - */ - override def insert(data: DataFrame, overwrite: Boolean): Unit = { - val jobConfig: JobConf = new JobConf(hbaseConf, this.getClass) - jobConfig.setOutputFormat(classOf[TableOutputFormat]) - jobConfig.set(TableOutputFormat.OUTPUT_TABLE, catalog.name) - var count = 0 - val rkFields = catalog.getRowKey - val rkIdxedFields = rkFields.map{ case x => - (schema.fieldIndex(x.colName), x) - } - val colsIdxedFields = schema - .fieldNames - .partition( x => rkFields.map(_.colName).contains(x)) - ._2.map(x => (schema.fieldIndex(x), catalog.getField(x))) - val rdd = data.rdd - def convertToPut(row: Row) = { - // construct bytes for row key - val rowBytes = rkIdxedFields.map { case (x, y) => - Utils.toBytes(row(x), y) - } - val rLen = rowBytes.foldLeft(0) { case (x, y) => - x + y.length - } - val rBytes = new Array[Byte](rLen) - var offset = 0 - rowBytes.foreach { x => - System.arraycopy(x, 0, rBytes, offset, x.length) - offset += x.length - } - val put = timestamp.fold(new Put(rBytes))(new Put(rBytes, _)) - - colsIdxedFields.foreach { case (x, y) => - val b = Utils.toBytes(row(x), y) - put.addColumn(Bytes.toBytes(y.cf), Bytes.toBytes(y.col), b) - } - count += 1 - (new ImmutableBytesWritable, put) - } - rdd.map(convertToPut(_)).saveAsHadoopDataset(jobConfig) - } - - def getIndexedProjections(requiredColumns: Array[String]): Seq[(Field, Int)] = { - requiredColumns.map(catalog.sMap.getField(_)).zipWithIndex - } - - - /** - * Takes a HBase Row object and parses all of the fields from it. - * This is independent of which fields were requested from the key - * Because we have all the data it's less complex to parse everything. - * - * @param row the retrieved row from hbase. - * @param keyFields all of the fields in the row key, ORDERED by their order in the row key. 
- */ - def parseRowKey(row: Array[Byte], keyFields: Seq[Field]): Map[Field, Any] = { - keyFields.foldLeft((0, Seq[(Field, Any)]()))((state, field) => { - val idx = state._1 - val parsed = state._2 - if (field.length != -1) { - val value = Utils.hbaseFieldToScalaType(field, row, idx, field.length) - // Return the new index and appended value - (idx + field.length, parsed ++ Seq((field, value))) - } else { - field.dt match { - case StringType => - val pos = row.indexOf(HBaseTableCatalog.delimiter, idx) - if (pos == -1 || pos > row.length) { - // this is at the last dimension - val value = Utils.hbaseFieldToScalaType(field, row, idx, row.length) - (row.length + 1, parsed ++ Seq((field, value))) - } else { - val value = Utils.hbaseFieldToScalaType(field, row, idx, pos - idx) - (pos, parsed ++ Seq((field, value))) - } - // We don't know the length, assume it extends to the end of the rowkey. - case _ => (row.length + 1, parsed ++ Seq((field, Utils.hbaseFieldToScalaType(field, row, idx, row.length)))) - } - } - })._2.toMap - } - - def buildRow(fields: Seq[Field], result: Result): Row = { - val r = result.getRow - val keySeq = parseRowKey(r, catalog.getRowKey) - val valueSeq = fields.filter(!_.isRowKey).map { x => - val kv = result.getColumnLatestCell(Bytes.toBytes(x.cf), Bytes.toBytes(x.col)) - if (kv == null || kv.getValueLength == 0) { - (x, null) - } else { - val v = CellUtil.cloneValue(kv) - (x, x.dt match { - // Here, to avoid arraycopy, return v directly instead of calling hbaseFieldToScalaType - case BinaryType => v - case _ => Utils.hbaseFieldToScalaType(x, v, 0, v.length) - }) - } - }.toMap - val unionedRow = keySeq ++ valueSeq - // Return the row ordered by the requested order - Row.fromSeq(fields.map(unionedRow.get(_).getOrElse(null))) - } - - /** - * Here we are building the functionality to populate the resulting RDD[Row] - * Here is where we will do the following: - * - Filter push down - * - Scan or GetList pruning - * - Executing our scan(s) or/and GetList to generate result - * - * @param requiredColumns The columns that are being requested by the requesting query - * @param filters The filters that are being applied by the requesting query - * @return RDD will all the results from HBase needed for SparkSQL to - * execute the query on - */ - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - - val pushDownTuple = buildPushDownPredicatesResource(filters) - val pushDownRowKeyFilter = pushDownTuple._1 - var pushDownDynamicLogicExpression = pushDownTuple._2 - val valueArray = pushDownTuple._3 - - if (!usePushDownColumnFilter) { - pushDownDynamicLogicExpression = null - } - - logDebug("pushDownRowKeyFilter: " + pushDownRowKeyFilter.ranges) - if (pushDownDynamicLogicExpression != null) { - logDebug("pushDownDynamicLogicExpression: " + - pushDownDynamicLogicExpression.toExpressionString) - } - logDebug("valueArray: " + valueArray.length) - - val requiredQualifierDefinitionList = - new mutable.MutableList[Field] - - requiredColumns.foreach( c => { - val field = catalog.getField(c) - requiredQualifierDefinitionList += field - }) - - //retain the information for unit testing checks - DefaultSourceStaticUtils.populateLatestExecutionRules(pushDownRowKeyFilter, - pushDownDynamicLogicExpression) - - val getList = new util.ArrayList[Get]() - val rddList = new util.ArrayList[RDD[Row]]() - - //add points to getList - pushDownRowKeyFilter.points.foreach(p => { - val get = new Get(p) - requiredQualifierDefinitionList.foreach( d => { - if (d.isRowKey) - 
get.addColumn(d.cfBytes, d.colBytes) - }) - getList.add(get) - }) - - val pushDownFilterJava = if (usePushDownColumnFilter && pushDownDynamicLogicExpression != null) { - Some(new SparkSQLPushDownFilter(pushDownDynamicLogicExpression, - valueArray, requiredQualifierDefinitionList, encoderClsName)) - } else { - None - } - val hRdd = new HBaseTableScanRDD(this, hbaseContext, pushDownFilterJava, requiredQualifierDefinitionList.seq) - pushDownRowKeyFilter.points.foreach(hRdd.addPoint(_)) - pushDownRowKeyFilter.ranges.foreach(hRdd.addRange(_)) - - var resultRDD: RDD[Row] = { - val tmp = hRdd.map{ r => - val indexedFields = getIndexedProjections(requiredColumns).map(_._1) - buildRow(indexedFields, r) - - } - if (tmp.partitions.size > 0) { - tmp - } else { - null - } - } - - if (resultRDD == null) { - val scan = new Scan() - scan.setCacheBlocks(blockCacheEnable) - scan.setBatch(batchNum) - scan.setCaching(cacheSize) - requiredQualifierDefinitionList.foreach( d => - scan.addColumn(d.cfBytes, d.colBytes)) - - val rdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan).map(r => { - val indexedFields = getIndexedProjections(requiredColumns).map(_._1) - buildRow(indexedFields, r._2) - }) - resultRDD=rdd - } - resultRDD - } - - def buildPushDownPredicatesResource(filters: Array[Filter]): - (RowKeyFilter, DynamicLogicExpression, Array[Array[Byte]]) = { - var superRowKeyFilter:RowKeyFilter = null - val queryValueList = new mutable.MutableList[Array[Byte]] - var superDynamicLogicExpression: DynamicLogicExpression = null - - filters.foreach( f => { - val rowKeyFilter = new RowKeyFilter() - val logicExpression = transverseFilterTree(rowKeyFilter, queryValueList, f) - if (superDynamicLogicExpression == null) { - superDynamicLogicExpression = logicExpression - superRowKeyFilter = rowKeyFilter - } else { - superDynamicLogicExpression = - new AndLogicExpression(superDynamicLogicExpression, logicExpression) - superRowKeyFilter.mergeIntersect(rowKeyFilter) - } - - }) - - val queryValueArray = queryValueList.toArray - - if (superRowKeyFilter == null) { - superRowKeyFilter = new RowKeyFilter - } - - (superRowKeyFilter, superDynamicLogicExpression, queryValueArray) - } - - /** - * For some codec, the order may be inconsistent between java primitive - * type and its byte array. We may have to split the predicates on some - * of the java primitive type into multiple predicates. The encoder will take - * care of it and returning the concrete ranges. - * - * For example in naive codec, some of the java primitive types have to be split into multiple - * predicates, and union these predicates together to make the predicates be performed correctly. - * For example, if we have "COLUMN < 2", we will transform it into - * "0 <= COLUMN < 2 OR Integer.MIN_VALUE <= COLUMN <= -1" - */ - - def transverseFilterTree(parentRowKeyFilter:RowKeyFilter, - valueArray:mutable.MutableList[Array[Byte]], - filter:Filter): DynamicLogicExpression = { - filter match { - case EqualTo(attr, value) => - val field = catalog.getField(attr) - if (field != null) { - if (field.isRowKey) { - parentRowKeyFilter.mergeIntersect(new RowKeyFilter( - DefaultSourceStaticUtils.getByteValue(field, - value.toString), null)) - } - val byteValue = - DefaultSourceStaticUtils.getByteValue(field, value.toString) - valueArray += byteValue - } - new EqualLogicExpression(attr, valueArray.length - 1, false) - - /** - * encoder may split the predicates into multiple byte array boundaries. 
- * Each boundaries is mapped into the RowKeyFilter and then is unioned by the reduce - * operation. If the data type is not supported, b will be None, and there is - * no operation happens on the parentRowKeyFilter. - * - * Note that because LessThan is not inclusive, thus the first bound should be exclusive, - * which is controlled by inc. - * - * The other predicates, i.e., GreaterThan/LessThanOrEqual/GreaterThanOrEqual follows - * the similar logic. - */ - case LessThan(attr, value) => - val field = catalog.getField(attr) - if (field != null) { - if (field.isRowKey) { - val b = encoder.ranges(value) - var inc = false - b.map(_.less.map { x => - val r = new RowKeyFilter(null, - new ScanRange(x.upper, inc, x.low, true) - ) - inc = true - r - }).map { x => - x.reduce { (i, j) => - i.mergeUnion(j) - } - }.map(parentRowKeyFilter.mergeIntersect(_)) - } - val byteValue = encoder.encode(field.dt, value) - valueArray += byteValue - } - new LessThanLogicExpression(attr, valueArray.length - 1) - case GreaterThan(attr, value) => - val field = catalog.getField(attr) - if (field != null) { - if (field.isRowKey) { - val b = encoder.ranges(value) - var inc = false - b.map(_.greater.map{x => - val r = new RowKeyFilter(null, - new ScanRange(x.upper, true, x.low, inc)) - inc = true - r - }).map { x => - x.reduce { (i, j) => - i.mergeUnion(j) - } - }.map(parentRowKeyFilter.mergeIntersect(_)) - } - val byteValue = encoder.encode(field.dt, value) - valueArray += byteValue - } - new GreaterThanLogicExpression(attr, valueArray.length - 1) - case LessThanOrEqual(attr, value) => - val field = catalog.getField(attr) - if (field != null) { - if (field.isRowKey) { - val b = encoder.ranges(value) - b.map(_.less.map(x => - new RowKeyFilter(null, - new ScanRange(x.upper, true, x.low, true)))) - .map { x => - x.reduce{ (i, j) => - i.mergeUnion(j) - } - }.map(parentRowKeyFilter.mergeIntersect(_)) - } - val byteValue = encoder.encode(field.dt, value) - valueArray += byteValue - } - new LessThanOrEqualLogicExpression(attr, valueArray.length - 1) - case GreaterThanOrEqual(attr, value) => - val field = catalog.getField(attr) - if (field != null) { - if (field.isRowKey) { - val b = encoder.ranges(value) - b.map(_.greater.map(x => - new RowKeyFilter(null, - new ScanRange(x.upper, true, x.low, true)))) - .map { x => - x.reduce { (i, j) => - i.mergeUnion(j) - } - }.map(parentRowKeyFilter.mergeIntersect(_)) - } - val byteValue = encoder.encode(field.dt, value) - valueArray += byteValue - } - new GreaterThanOrEqualLogicExpression(attr, valueArray.length - 1) - case Or(left, right) => - val leftExpression = transverseFilterTree(parentRowKeyFilter, valueArray, left) - val rightSideRowKeyFilter = new RowKeyFilter - val rightExpression = transverseFilterTree(rightSideRowKeyFilter, valueArray, right) - - parentRowKeyFilter.mergeUnion(rightSideRowKeyFilter) - - new OrLogicExpression(leftExpression, rightExpression) - case And(left, right) => - - val leftExpression = transverseFilterTree(parentRowKeyFilter, valueArray, left) - val rightSideRowKeyFilter = new RowKeyFilter - val rightExpression = transverseFilterTree(rightSideRowKeyFilter, valueArray, right) - parentRowKeyFilter.mergeIntersect(rightSideRowKeyFilter) - - new AndLogicExpression(leftExpression, rightExpression) - case IsNull(attr) => - new IsNullLogicExpression(attr, false) - case IsNotNull(attr) => - new IsNullLogicExpression(attr, true) - case _ => - new PassThroughLogicExpression - } - } -} - -/** - * Construct to contain a single scan ranges information. 
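A concrete illustration of why the naive encoding needs that split: Bytes.toBytes(Int) writes the two's-complement, big-endian form, so negative values begin with 0x80..0xFF and sort after positive values under HBase's unsigned lexicographic comparison.

    import org.apache.hadoop.hbase.util.Bytes

    Bytes.compareTo(Bytes.toBytes(-1), Bytes.toBytes(2)) > 0   // true: -1 encodes as 0xFFFFFFFF and sorts after 2
    Bytes.compareTo(Bytes.toBytes(0), Bytes.toBytes(2)) < 0    // true: 0 sorts before 2 as expected

    // So "COLUMN < 2" cannot be one byte range; it becomes the union of
    // [toBytes(0), toBytes(2)) and [toBytes(Int.MinValue), toBytes(-1)].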
Also - * provide functions to merge with other scan ranges through AND - * or OR operators - * - * @param upperBound Upper bound of scan - * @param isUpperBoundEqualTo Include upper bound value in the results - * @param lowerBound Lower bound of scan - * @param isLowerBoundEqualTo Include lower bound value in the results - */ -@InterfaceAudience.Private -class ScanRange(var upperBound:Array[Byte], var isUpperBoundEqualTo:Boolean, - var lowerBound:Array[Byte], var isLowerBoundEqualTo:Boolean) - extends Serializable { - - /** - * Function to merge another scan object through a AND operation - * - * @param other Other scan object - */ - def mergeIntersect(other:ScanRange): Unit = { - val upperBoundCompare = compareRange(upperBound, other.upperBound) - val lowerBoundCompare = compareRange(lowerBound, other.lowerBound) - - upperBound = if (upperBoundCompare <0) upperBound else other.upperBound - lowerBound = if (lowerBoundCompare >0) lowerBound else other.lowerBound - - isLowerBoundEqualTo = if (lowerBoundCompare == 0) - isLowerBoundEqualTo && other.isLowerBoundEqualTo - else isLowerBoundEqualTo - - isUpperBoundEqualTo = if (upperBoundCompare == 0) - isUpperBoundEqualTo && other.isUpperBoundEqualTo - else isUpperBoundEqualTo - } - - /** - * Function to merge another scan object through a OR operation - * - * @param other Other scan object - */ - def mergeUnion(other:ScanRange): Unit = { - - val upperBoundCompare = compareRange(upperBound, other.upperBound) - val lowerBoundCompare = compareRange(lowerBound, other.lowerBound) - - upperBound = if (upperBoundCompare >0) upperBound else other.upperBound - lowerBound = if (lowerBoundCompare <0) lowerBound else other.lowerBound - - isLowerBoundEqualTo = if (lowerBoundCompare == 0) - isLowerBoundEqualTo || other.isLowerBoundEqualTo - else if (lowerBoundCompare < 0) isLowerBoundEqualTo else other.isLowerBoundEqualTo - - isUpperBoundEqualTo = if (upperBoundCompare == 0) - isUpperBoundEqualTo || other.isUpperBoundEqualTo - else if (upperBoundCompare < 0) other.isUpperBoundEqualTo else isUpperBoundEqualTo - } - - /** - * Common function to see if this scan over laps with another - * - * Reference Visual - * - * A B - * |---------------------------| - * LL--------------LU - * RL--------------RU - * - * A = lowest value is byte[0] - * B = highest value is null - * LL = Left Lower Bound - * LU = Left Upper Bound - * RL = Right Lower Bound - * RU = Right Upper Bound - * - * @param other Other scan object - * @return True is overlap false is not overlap - */ - def getOverLapScanRange(other:ScanRange): ScanRange = { - - var leftRange:ScanRange = null - var rightRange:ScanRange = null - - // First identify the Left range - // Also lower bound can't be null - if (compareRange(lowerBound, other.lowerBound) < 0 || - compareRange(upperBound, other.upperBound) < 0) { - leftRange = this - rightRange = other - } else { - leftRange = other - rightRange = this - } - - if (hasOverlap(leftRange, rightRange)) { - // Find the upper bound and lower bound - if (compareRange(leftRange.upperBound, rightRange.upperBound) >= 0) { - new ScanRange(rightRange.upperBound, rightRange.isUpperBoundEqualTo, - rightRange.lowerBound, rightRange.isLowerBoundEqualTo) - } else { - new ScanRange(leftRange.upperBound, leftRange.isUpperBoundEqualTo, - rightRange.lowerBound, rightRange.isLowerBoundEqualTo) - } - } else { - null - } - } - - /** - * The leftRange.upperBound has to be larger than the rightRange's lowerBound. - * Otherwise, there is no overlap. 
- * - * @param left: The range with the smaller lowBound - * @param right: The range with the larger lowBound - * @return Whether two ranges have overlap. - */ - - def hasOverlap(left: ScanRange, right: ScanRange): Boolean = { - compareRange(left.upperBound, right.lowerBound) >= 0 - } - - /** - * Special compare logic because we can have null values - * for left or right bound - * - * @param left Left byte array - * @param right Right byte array - * @return 0 for equals 1 is left is greater and -1 is right is greater - */ - def compareRange(left:Array[Byte], right:Array[Byte]): Int = { - if (left == null && right == null) 0 - else if (left == null && right != null) 1 - else if (left != null && right == null) -1 - else Bytes.compareTo(left, right) - } - - /** - * - * @return - */ - def containsPoint(point:Array[Byte]): Boolean = { - val lowerCompare = compareRange(point, lowerBound) - val upperCompare = compareRange(point, upperBound) - - ((isLowerBoundEqualTo && lowerCompare >= 0) || - (!isLowerBoundEqualTo && lowerCompare > 0)) && - ((isUpperBoundEqualTo && upperCompare <= 0) || - (!isUpperBoundEqualTo && upperCompare < 0)) - - } - override def toString:String = { - "ScanRange:(upperBound:" + Bytes.toString(upperBound) + - ",isUpperBoundEqualTo:" + isUpperBoundEqualTo + ",lowerBound:" + - Bytes.toString(lowerBound) + ",isLowerBoundEqualTo:" + isLowerBoundEqualTo + ")" - } -} - -/** - * Contains information related to a filters for a given column. - * This can contain many ranges or points. - * - * @param currentPoint the initial point when the filter is created - * @param currentRange the initial scanRange when the filter is created - */ -@InterfaceAudience.Private -class ColumnFilter (currentPoint:Array[Byte] = null, - currentRange:ScanRange = null, - var points:mutable.MutableList[Array[Byte]] = - new mutable.MutableList[Array[Byte]](), - var ranges:mutable.MutableList[ScanRange] = - new mutable.MutableList[ScanRange]() ) extends Serializable { - //Collection of ranges - if (currentRange != null ) ranges.+=(currentRange) - - //Collection of points - if (currentPoint != null) points.+=(currentPoint) - - /** - * This will validate a give value through the filter's points and/or ranges - * the result will be if the value passed the filter - * - * @param value Value to be validated - * @param valueOffSet The offset of the value - * @param valueLength The length of the value - * @return True is the value passes the filter false if not - */ - def validate(value:Array[Byte], valueOffSet:Int, valueLength:Int):Boolean = { - var result = false - - points.foreach( p => { - if (Bytes.equals(p, 0, p.length, value, valueOffSet, valueLength)) { - result = true - } - }) - - ranges.foreach( r => { - val upperBoundPass = r.upperBound == null || - (r.isUpperBoundEqualTo && - Bytes.compareTo(r.upperBound, 0, r.upperBound.length, - value, valueOffSet, valueLength) >= 0) || - (!r.isUpperBoundEqualTo && - Bytes.compareTo(r.upperBound, 0, r.upperBound.length, - value, valueOffSet, valueLength) > 0) - - val lowerBoundPass = r.lowerBound == null || r.lowerBound.length == 0 - (r.isLowerBoundEqualTo && - Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, - value, valueOffSet, valueLength) <= 0) || - (!r.isLowerBoundEqualTo && - Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, - value, valueOffSet, valueLength) < 0) - - result = result || (upperBoundPass && lowerBoundPass) - }) - result - } - - /** - * This will allow us to merge filter logic that is joined to the existing filter - * through a OR operator - 
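A short worked example of the ScanRange merge operations defined above, using string row keys with inclusive bounds:

    import org.apache.hadoop.hbase.spark.ScanRange
    import org.apache.hadoop.hbase.util.Bytes

    // [a, f] and [d, z]; constructor order is (upper, upperInclusive, lower, lowerInclusive).
    val left  = new ScanRange(Bytes.toBytes("f"), true, Bytes.toBytes("a"), true)
    val right = new ScanRange(Bytes.toBytes("z"), true, Bytes.toBytes("d"), true)

    val overlap = left.getOverLapScanRange(right)   // the AND-style intersection [d, f]; null if disjoint
    left.mergeUnion(right)                          // OR: left is widened in place to [a, z]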
* - * @param other Filter to merge - */ - def mergeUnion(other:ColumnFilter): Unit = { - other.points.foreach( p => points += p) - - other.ranges.foreach( otherR => { - var doesOverLap = false - ranges.foreach{ r => - if (r.getOverLapScanRange(otherR) != null) { - r.mergeUnion(otherR) - doesOverLap = true - }} - if (!doesOverLap) ranges.+=(otherR) - }) - } - - /** - * This will allow us to merge filter logic that is joined to the existing filter - * through a AND operator - * - * @param other Filter to merge - */ - def mergeIntersect(other:ColumnFilter): Unit = { - val survivingPoints = new mutable.MutableList[Array[Byte]]() - points.foreach( p => { - other.points.foreach( otherP => { - if (Bytes.equals(p, otherP)) { - survivingPoints.+=(p) - } - }) - }) - points = survivingPoints - - val survivingRanges = new mutable.MutableList[ScanRange]() - - other.ranges.foreach( otherR => { - ranges.foreach( r => { - if (r.getOverLapScanRange(otherR) != null) { - r.mergeIntersect(otherR) - survivingRanges += r - } - }) - }) - ranges = survivingRanges - } - - override def toString:String = { - val strBuilder = new StringBuilder - strBuilder.append("(points:(") - var isFirst = true - points.foreach( p => { - if (isFirst) isFirst = false - else strBuilder.append(",") - strBuilder.append(Bytes.toString(p)) - }) - strBuilder.append("),ranges:") - isFirst = true - ranges.foreach( r => { - if (isFirst) isFirst = false - else strBuilder.append(",") - strBuilder.append(r) - }) - strBuilder.append("))") - strBuilder.toString() - } -} - -/** - * A collection of ColumnFilters indexed by column names. - * - * Also contains merge commends that will consolidate the filters - * per column name - */ -@InterfaceAudience.Private -class ColumnFilterCollection { - val columnFilterMap = new mutable.HashMap[String, ColumnFilter] - - def clear(): Unit = { - columnFilterMap.clear() - } - - /** - * This will allow us to merge filter logic that is joined to the existing filter - * through a OR operator. This will merge a single columns filter - * - * @param column The column to be merged - * @param other The other ColumnFilter object to merge - */ - def mergeUnion(column:String, other:ColumnFilter): Unit = { - val existingFilter = columnFilterMap.get(column) - if (existingFilter.isEmpty) { - columnFilterMap.+=((column, other)) - } else { - existingFilter.get.mergeUnion(other) - } - } - - /** - * This will allow us to merge all filters in the existing collection - * to the filters in the other collection. All merges are done as a result - * of a OR operator - * - * @param other The other Column Filter Collection to be merged - */ - def mergeUnion(other:ColumnFilterCollection): Unit = { - other.columnFilterMap.foreach( e => { - mergeUnion(e._1, e._2) - }) - } - - /** - * This will allow us to merge all filters in the existing collection - * to the filters in the other collection. 
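For example, ORing two single-point column filters simply accumulates the points:

    import org.apache.hadoop.hbase.spark.ColumnFilter
    import org.apache.hadoop.hbase.util.Bytes

    val a = Bytes.toBytes("a")
    val b = Bytes.toBytes("b")

    val filter = new ColumnFilter(a)          // accepts only the value "a"
    filter.mergeUnion(new ColumnFilter(b))    // now accepts "a" or "b"

    filter.validate(b, 0, b.length)           // true
    filter.validate(Bytes.toBytes("c"), 0, 1) // false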
All merges are done as a result - * of a AND operator - * - * @param other The column filter from the other collection - */ - def mergeIntersect(other:ColumnFilterCollection): Unit = { - other.columnFilterMap.foreach( e => { - val existingColumnFilter = columnFilterMap.get(e._1) - if (existingColumnFilter.isEmpty) { - columnFilterMap += e - } else { - existingColumnFilter.get.mergeIntersect(e._2) - } - }) - } - - override def toString:String = { - val strBuilder = new StringBuilder - columnFilterMap.foreach( e => strBuilder.append(e)) - strBuilder.toString() - } -} - -/** - * Status object to store static functions but also to hold last executed - * information that can be used for unit testing. - */ -@InterfaceAudience.Private -object DefaultSourceStaticUtils { - - val rawInteger = new RawInteger - val rawLong = new RawLong - val rawFloat = new RawFloat - val rawDouble = new RawDouble - val rawString = RawString.ASCENDING - - val byteRange = new ThreadLocal[PositionedByteRange] { - override def initialValue(): PositionedByteRange = { - val range = new SimplePositionedMutableByteRange() - range.setOffset(0) - range.setPosition(0) - } - } - - def getFreshByteRange(bytes: Array[Byte]): PositionedByteRange = { - getFreshByteRange(bytes, 0, bytes.length) - } - - def getFreshByteRange(bytes: Array[Byte], offset: Int = 0, length: Int): - PositionedByteRange = { - byteRange.get().set(bytes).setLength(length).setOffset(offset) - } - - //This will contain the last 5 filters and required fields used in buildScan - // These values can be used in unit testing to make sure we are converting - // The Spark SQL input correctly - val lastFiveExecutionRules = - new ConcurrentLinkedQueue[ExecutionRuleForUnitTesting]() - - /** - * This method is to populate the lastFiveExecutionRules for unit test perposes - * This method is not thread safe. 
- * - * @param rowKeyFilter The rowKey Filter logic used in the last query - * @param dynamicLogicExpression The dynamicLogicExpression used in the last query - */ - def populateLatestExecutionRules(rowKeyFilter: RowKeyFilter, - dynamicLogicExpression: DynamicLogicExpression): Unit = { - lastFiveExecutionRules.add(new ExecutionRuleForUnitTesting( - rowKeyFilter, dynamicLogicExpression)) - while (lastFiveExecutionRules.size() > 5) { - lastFiveExecutionRules.poll() - } - } - - /** - * This method will convert the result content from HBase into the - * SQL value type that is requested by the Spark SQL schema definition - * - * @param field The structure of the SparkSQL Column - * @param r The result object from HBase - * @return The converted object type - */ - def getValue(field: Field, - r: Result): Any = { - if (field.isRowKey) { - val row = r.getRow - - field.dt match { - case IntegerType => rawInteger.decode(getFreshByteRange(row)) - case LongType => rawLong.decode(getFreshByteRange(row)) - case FloatType => rawFloat.decode(getFreshByteRange(row)) - case DoubleType => rawDouble.decode(getFreshByteRange(row)) - case StringType => rawString.decode(getFreshByteRange(row)) - case TimestampType => rawLong.decode(getFreshByteRange(row)) - case _ => Bytes.toString(row) - } - } else { - val cellByteValue = - r.getColumnLatestCell(field.cfBytes, field.colBytes) - if (cellByteValue == null) null - else field.dt match { - case IntegerType => rawInteger.decode(getFreshByteRange(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength)) - case LongType => rawLong.decode(getFreshByteRange(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength)) - case FloatType => rawFloat.decode(getFreshByteRange(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength)) - case DoubleType => rawDouble.decode(getFreshByteRange(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength)) - case StringType => Bytes.toString(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength) - case TimestampType => rawLong.decode(getFreshByteRange(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength)) - case _ => Bytes.toString(cellByteValue.getValueArray, - cellByteValue.getValueOffset, cellByteValue.getValueLength) - } - } - } - - /** - * This will convert the value from SparkSQL to be stored into HBase using the - * right byte Type - * - * @param value String value from SparkSQL - * @return Returns the byte array to go into HBase - */ - def getByteValue(field: Field, - value: String): Array[Byte] = { - field.dt match { - case IntegerType => - val result = new Array[Byte](Bytes.SIZEOF_INT) - val localDataRange = getFreshByteRange(result) - rawInteger.encode(localDataRange, value.toInt) - localDataRange.getBytes - case LongType => - val result = new Array[Byte](Bytes.SIZEOF_LONG) - val localDataRange = getFreshByteRange(result) - rawLong.encode(localDataRange, value.toLong) - localDataRange.getBytes - case FloatType => - val result = new Array[Byte](Bytes.SIZEOF_FLOAT) - val localDataRange = getFreshByteRange(result) - rawFloat.encode(localDataRange, value.toFloat) - localDataRange.getBytes - case DoubleType => - val result = new Array[Byte](Bytes.SIZEOF_DOUBLE) - val localDataRange = getFreshByteRange(result) - rawDouble.encode(localDataRange, value.toDouble) - localDataRange.getBytes - case StringType => - Bytes.toBytes(value) - 
case TimestampType => - val result = new Array[Byte](Bytes.SIZEOF_LONG) - val localDataRange = getFreshByteRange(result) - rawLong.encode(localDataRange, value.toLong) - localDataRange.getBytes - - case _ => Bytes.toBytes(value) - } - } -} - -/** - * Contains information related to a filters for a given column. - * This can contain many ranges or points. - * - * @param currentPoint the initial point when the filter is created - * @param currentRange the initial scanRange when the filter is created - */ -@InterfaceAudience.Private -class RowKeyFilter (currentPoint:Array[Byte] = null, - currentRange:ScanRange = - new ScanRange(null, true, new Array[Byte](0), true), - var points:mutable.MutableList[Array[Byte]] = - new mutable.MutableList[Array[Byte]](), - var ranges:mutable.MutableList[ScanRange] = - new mutable.MutableList[ScanRange]() ) extends Serializable { - //Collection of ranges - if (currentRange != null ) ranges.+=(currentRange) - - //Collection of points - if (currentPoint != null) points.+=(currentPoint) - - /** - * This will validate a give value through the filter's points and/or ranges - * the result will be if the value passed the filter - * - * @param value Value to be validated - * @param valueOffSet The offset of the value - * @param valueLength The length of the value - * @return True is the value passes the filter false if not - */ - def validate(value:Array[Byte], valueOffSet:Int, valueLength:Int):Boolean = { - var result = false - - points.foreach( p => { - if (Bytes.equals(p, 0, p.length, value, valueOffSet, valueLength)) { - result = true - } - }) - - ranges.foreach( r => { - val upperBoundPass = r.upperBound == null || - (r.isUpperBoundEqualTo && - Bytes.compareTo(r.upperBound, 0, r.upperBound.length, - value, valueOffSet, valueLength) >= 0) || - (!r.isUpperBoundEqualTo && - Bytes.compareTo(r.upperBound, 0, r.upperBound.length, - value, valueOffSet, valueLength) > 0) - - val lowerBoundPass = r.lowerBound == null || r.lowerBound.length == 0 - (r.isLowerBoundEqualTo && - Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, - value, valueOffSet, valueLength) <= 0) || - (!r.isLowerBoundEqualTo && - Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, - value, valueOffSet, valueLength) < 0) - - result = result || (upperBoundPass && lowerBoundPass) - }) - result - } - - /** - * This will allow us to merge filter logic that is joined to the existing filter - * through a OR operator - * - * @param other Filter to merge - */ - def mergeUnion(other:RowKeyFilter): RowKeyFilter = { - other.points.foreach( p => points += p) - - other.ranges.foreach( otherR => { - var doesOverLap = false - ranges.foreach{ r => - if (r.getOverLapScanRange(otherR) != null) { - r.mergeUnion(otherR) - doesOverLap = true - }} - if (!doesOverLap) ranges.+=(otherR) - }) - this - } - - /** - * This will allow us to merge filter logic that is joined to the existing filter - * through a AND operator - * - * @param other Filter to merge - */ - def mergeIntersect(other:RowKeyFilter): RowKeyFilter = { - val survivingPoints = new mutable.MutableList[Array[Byte]]() - val didntSurviveFirstPassPoints = new mutable.MutableList[Array[Byte]]() - if (points == null || points.length == 0) { - other.points.foreach( otherP => { - didntSurviveFirstPassPoints += otherP - }) - } else { - points.foreach(p => { - if (other.points.length == 0) { - didntSurviveFirstPassPoints += p - } else { - other.points.foreach(otherP => { - if (Bytes.equals(p, otherP)) { - survivingPoints += p - } else { - didntSurviveFirstPassPoints += p 
- } - }) - } - }) - } - - val survivingRanges = new mutable.MutableList[ScanRange]() - - if (ranges.length == 0) { - didntSurviveFirstPassPoints.foreach(p => { - survivingPoints += p - }) - } else { - ranges.foreach(r => { - other.ranges.foreach(otherR => { - val overLapScanRange = r.getOverLapScanRange(otherR) - if (overLapScanRange != null) { - survivingRanges += overLapScanRange - } - }) - didntSurviveFirstPassPoints.foreach(p => { - if (r.containsPoint(p)) { - survivingPoints += p - } - }) - }) - } - points = survivingPoints - ranges = survivingRanges - this - } - - override def toString:String = { - val strBuilder = new StringBuilder - strBuilder.append("(points:(") - var isFirst = true - points.foreach( p => { - if (isFirst) isFirst = false - else strBuilder.append(",") - strBuilder.append(Bytes.toString(p)) - }) - strBuilder.append("),ranges:") - isFirst = true - ranges.foreach( r => { - if (isFirst) isFirst = false - else strBuilder.append(",") - strBuilder.append(r) - }) - strBuilder.append("))") - strBuilder.toString() - } -} - -@InterfaceAudience.Private -class ExecutionRuleForUnitTesting(val rowKeyFilter: RowKeyFilter, - val dynamicLogicExpression: DynamicLogicExpression) diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpression.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpression.scala deleted file mode 100644 index 283838f52b1..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpression.scala +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.util - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.spark.datasources.{BytesEncoder, JavaBytesEncoder} -import org.apache.hadoop.hbase.spark.datasources.JavaBytesEncoder.JavaBytesEncoder -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.sql.datasources.hbase.{Field, Utils} -import org.apache.spark.sql.types._ -/** - * Dynamic logic for SQL push down logic there is an instance for most - * common operations and a pass through for other operations not covered here - * - * Logic can be nested with And or Or operators. 
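For reference, a minimal sketch of how the RowKeyFilter above combines point and range predicates. The row keys are hypothetical, and the ScanRange argument order (upperBound, upperInclusive, lowerBound, lowerInclusive) is assumed from the default value shown in the constructor.

  import org.apache.hadoop.hbase.util.Bytes

  // A point-only filter (no range) OR-merged with an inclusive range filter.
  val pointFilter = new RowKeyFilter(Bytes.toBytes("row005"), null)
  val rangeFilter = new RowKeyFilter(null,
    new ScanRange(Bytes.toBytes("row020"), true, Bytes.toBytes("row010"), true))
  val merged = pointFilter.mergeUnion(rangeFilter)

  val hit = Bytes.toBytes("row015")    // inside [row010, row020]
  val miss = Bytes.toBytes("row001")   // below the range and not the point
  merged.validate(hit, 0, hit.length)      // true
  merged.validate(miss, 0, miss.length)    // false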
- * - * A logic tree can be written out as a string and reconstructed from that string - * - */ -@InterfaceAudience.Private -trait DynamicLogicExpression { - def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean - def toExpressionString: String = { - val strBuilder = new StringBuilder - appendToExpression(strBuilder) - strBuilder.toString() - } - def filterOps: JavaBytesEncoder = JavaBytesEncoder.Unknown - - def appendToExpression(strBuilder:StringBuilder) - - var encoder: BytesEncoder = _ - - def setEncoder(enc: BytesEncoder): DynamicLogicExpression = { - encoder = enc - this - } -} - -@InterfaceAudience.Private -trait CompareTrait { - self: DynamicLogicExpression => - def columnName: String - def valueFromQueryIndex: Int - def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { - val currentRowValue = columnToCurrentRowValueMap.get(columnName) - val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) - currentRowValue != null && - encoder.filter(currentRowValue.bytes, currentRowValue.offset, currentRowValue.length, - valueFromQuery, 0, valueFromQuery.length, filterOps) - } -} - -@InterfaceAudience.Private -class AndLogicExpression (val leftExpression:DynamicLogicExpression, - val rightExpression:DynamicLogicExpression) - extends DynamicLogicExpression{ - override def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { - leftExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) && - rightExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) - } - - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append("( ") - strBuilder.append(leftExpression.toExpressionString) - strBuilder.append(" AND ") - strBuilder.append(rightExpression.toExpressionString) - strBuilder.append(" )") - } -} - -@InterfaceAudience.Private -class OrLogicExpression (val leftExpression:DynamicLogicExpression, - val rightExpression:DynamicLogicExpression) - extends DynamicLogicExpression{ - override def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { - leftExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) || - rightExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) - } - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append("( ") - strBuilder.append(leftExpression.toExpressionString) - strBuilder.append(" OR ") - strBuilder.append(rightExpression.toExpressionString) - strBuilder.append(" )") - } -} - -@InterfaceAudience.Private -class EqualLogicExpression (val columnName:String, - val valueFromQueryIndex:Int, - val isNot:Boolean) extends DynamicLogicExpression{ - override def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { - val currentRowValue = columnToCurrentRowValueMap.get(columnName) - val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) - - currentRowValue != null && - Bytes.equals(valueFromQuery, - 0, valueFromQuery.length, currentRowValue.bytes, - currentRowValue.offset, currentRowValue.length) != isNot - } - override def appendToExpression(strBuilder: StringBuilder): Unit = { - val command = if 
(isNot) "!=" else "==" - strBuilder.append(columnName + " " + command + " " + valueFromQueryIndex) - } -} - -@InterfaceAudience.Private -class IsNullLogicExpression (val columnName:String, - val isNot:Boolean) extends DynamicLogicExpression{ - override def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { - val currentRowValue = columnToCurrentRowValueMap.get(columnName) - - (currentRowValue == null) != isNot - } - override def appendToExpression(strBuilder: StringBuilder): Unit = { - val command = if (isNot) "isNotNull" else "isNull" - strBuilder.append(columnName + " " + command) - } -} - -@InterfaceAudience.Private -class GreaterThanLogicExpression (override val columnName:String, - override val valueFromQueryIndex:Int) - extends DynamicLogicExpression with CompareTrait{ - override val filterOps = JavaBytesEncoder.Greater - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append(columnName + " > " + valueFromQueryIndex) - } -} - -@InterfaceAudience.Private -class GreaterThanOrEqualLogicExpression (override val columnName:String, - override val valueFromQueryIndex:Int) - extends DynamicLogicExpression with CompareTrait{ - override val filterOps = JavaBytesEncoder.GreaterEqual - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append(columnName + " >= " + valueFromQueryIndex) - } -} - -@InterfaceAudience.Private -class LessThanLogicExpression (override val columnName:String, - override val valueFromQueryIndex:Int) - extends DynamicLogicExpression with CompareTrait { - override val filterOps = JavaBytesEncoder.Less - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append(columnName + " < " + valueFromQueryIndex) - } -} - -@InterfaceAudience.Private -class LessThanOrEqualLogicExpression (val columnName:String, - val valueFromQueryIndex:Int) - extends DynamicLogicExpression with CompareTrait{ - override val filterOps = JavaBytesEncoder.LessEqual - override def appendToExpression(strBuilder: StringBuilder): Unit = { - strBuilder.append(columnName + " <= " + valueFromQueryIndex) - } -} - -@InterfaceAudience.Private -class PassThroughLogicExpression() extends DynamicLogicExpression { - override def execute(columnToCurrentRowValueMap: - util.HashMap[String, ByteArrayComparable], - valueFromQueryValueArray: Array[Array[Byte]]): Boolean = true - - override def appendToExpression(strBuilder: StringBuilder): Unit = { - // Fix the offset bug by add dummy to avoid crash the region server. - // because in the DynamicLogicExpressionBuilder.build function, the command is always retrieved from offset + 1 as below - // val command = expressionArray(offSet + 1) - // we have to padding it so that `Pass` is on the right offset. 
- strBuilder.append("dummy Pass -1") - } -} - -@InterfaceAudience.Private -object DynamicLogicExpressionBuilder { - def build(expressionString: String, encoder: BytesEncoder): DynamicLogicExpression = { - - val expressionAndOffset = build(expressionString.split(' '), 0, encoder) - expressionAndOffset._1 - } - - private def build(expressionArray:Array[String], - offSet:Int, encoder: BytesEncoder): (DynamicLogicExpression, Int) = { - val expr = { - if (expressionArray(offSet).equals("(")) { - val left = build(expressionArray, offSet + 1, encoder) - val right = build(expressionArray, left._2 + 1, encoder) - if (expressionArray(left._2).equals("AND")) { - (new AndLogicExpression(left._1, right._1), right._2 + 1) - } else if (expressionArray(left._2).equals("OR")) { - (new OrLogicExpression(left._1, right._1), right._2 + 1) - } else { - throw new Throwable("Unknown gate:" + expressionArray(left._2)) - } - } else { - val command = expressionArray(offSet + 1) - if (command.equals("<")) { - (new LessThanLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt), offSet + 3) - } else if (command.equals("<=")) { - (new LessThanOrEqualLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt), offSet + 3) - } else if (command.equals(">")) { - (new GreaterThanLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt), offSet + 3) - } else if (command.equals(">=")) { - (new GreaterThanOrEqualLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt), offSet + 3) - } else if (command.equals("==")) { - (new EqualLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt, false), offSet + 3) - } else if (command.equals("!=")) { - (new EqualLogicExpression(expressionArray(offSet), - expressionArray(offSet + 2).toInt, true), offSet + 3) - } else if (command.equals("isNull")) { - (new IsNullLogicExpression(expressionArray(offSet), false), offSet + 2) - } else if (command.equals("isNotNull")) { - (new IsNullLogicExpression(expressionArray(offSet), true), offSet + 2) - } else if (command.equals("Pass")) { - (new PassThroughLogicExpression, offSet + 3) - } else { - throw new Throwable("Unknown logic command:" + command) - } - } - } - expr._1.setEncoder(encoder) - expr - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamiliesQualifiersValues.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamiliesQualifiersValues.scala deleted file mode 100644 index 7a651e1f778..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamiliesQualifiersValues.scala +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark - -import java.util - -import org.apache.yetus.audience.InterfaceAudience; - -/** - * This object is a clean way to store and sort all cells that will be bulk - * loaded into a single row - */ -@InterfaceAudience.Public -class FamiliesQualifiersValues extends Serializable { - //Tree maps are used because we need the results to - // be sorted when we read them - val familyMap = new util.TreeMap[ByteArrayWrapper, - util.TreeMap[ByteArrayWrapper, Array[Byte]]]() - - //normally in a row there are more columns then - //column families this wrapper is reused for column - //family look ups - val reusableWrapper = new ByteArrayWrapper(null) - - /** - * Adds a new cell to an existing row - * @param family HBase column family - * @param qualifier HBase column qualifier - * @param value HBase cell value - */ - def += (family: Array[Byte], qualifier: Array[Byte], value: Array[Byte]): Unit = { - - reusableWrapper.value = family - - var qualifierValues = familyMap.get(reusableWrapper) - - if (qualifierValues == null) { - qualifierValues = new util.TreeMap[ByteArrayWrapper, Array[Byte]]() - familyMap.put(new ByteArrayWrapper(family), qualifierValues) - } - - qualifierValues.put(new ByteArrayWrapper(qualifier), value) - } - - /** - * A wrapper for "+=" method above, can be used by Java - * @param family HBase column family - * @param qualifier HBase column qualifier - * @param value HBase cell value - */ - def add(family: Array[Byte], qualifier: Array[Byte], value: Array[Byte]): Unit = { - this += (family, qualifier, value) - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamilyHFileWriteOptions.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamilyHFileWriteOptions.scala deleted file mode 100644 index 9ee9291f0e9..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/FamilyHFileWriteOptions.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
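For reference, a minimal sketch of filling the FamiliesQualifiersValues above for one row, as consumed by the thin-row bulk load path. Family, qualifier and value names are hypothetical.

  import org.apache.hadoop.hbase.util.Bytes

  val row = new FamiliesQualifiersValues
  // Scala callers can use +=; Java callers use the add(...) wrapper.
  row += (Bytes.toBytes("cf"), Bytes.toBytes("q1"), Bytes.toBytes("value1"))
  row += (Bytes.toBytes("cf"), Bytes.toBytes("q2"), Bytes.toBytes("value2"))
  row.add(Bytes.toBytes("cf2"), Bytes.toBytes("q1"), Bytes.toBytes("value3"))
  // familyMap is a TreeMap, so families and qualifiers come back sorted.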
- */ - -package org.apache.hadoop.hbase.spark - -import java.io.Serializable - -import org.apache.yetus.audience.InterfaceAudience; - -/** - * This object will hold optional data for how a given column family's - * writer will work - * - * @param compression String to define the Compression to be used in the HFile - * @param bloomType String to define the bloom type to be used in the HFile - * @param blockSize The block size to be used in the HFile - * @param dataBlockEncoding String to define the data block encoding to be used - * in the HFile - */ -@InterfaceAudience.Public -class FamilyHFileWriteOptions( val compression:String, - val bloomType: String, - val blockSize: Int, - val dataBlockEncoding: String) extends Serializable diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCache.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCache.scala deleted file mode 100644 index 2858da8c518..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCache.scala +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.io.IOException - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory, RegionLocator, Table} -import org.apache.hadoop.hbase.ipc.RpcControllerFactory -import org.apache.hadoop.hbase.security.{User, UserProvider} -import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf -import org.apache.hadoop.hbase.{HConstants, TableName} -import org.apache.spark.Logging - -import scala.collection.mutable - -private[spark] object HBaseConnectionCache extends Logging { - - // A hashmap of Spark-HBase connections. Key is HBaseConnectionKey. 
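For reference, a sketch of the per-family options map that the bulk load APIs accept. The compression, bloom and encoding choices are illustrative only; families without an entry fall back to the module defaults.

  import java.util
  import org.apache.hadoop.hbase.HConstants
  import org.apache.hadoop.hbase.util.Bytes

  val familyOptions = new util.HashMap[Array[Byte], FamilyHFileWriteOptions]()
  familyOptions.put(Bytes.toBytes("cf"),
    new FamilyHFileWriteOptions("GZ", "ROW", HConstants.DEFAULT_BLOCKSIZE, "PREFIX"))
  // Passed as familyHFileWriteOptionsMap to bulkLoad / bulkLoadThinRows.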
- val connectionMap = new mutable.HashMap[HBaseConnectionKey, SmartConnection]() - - val cacheStat = HBaseConnectionCacheStat(0, 0, 0) - - // in milliseconds - private final val DEFAULT_TIME_OUT: Long = HBaseSparkConf.DEFAULT_CONNECTION_CLOSE_DELAY - private var timeout = DEFAULT_TIME_OUT - private var closed: Boolean = false - - var housekeepingThread = new Thread(new Runnable { - override def run() { - while (true) { - try { - Thread.sleep(timeout) - } catch { - case e: InterruptedException => - // setTimeout() and close() may interrupt the sleep and it's safe - // to ignore the exception - } - if (closed) - return - performHousekeeping(false) - } - } - }) - housekeepingThread.setDaemon(true) - housekeepingThread.start() - - def getStat: HBaseConnectionCacheStat = { - connectionMap.synchronized { - cacheStat.numActiveConnections = connectionMap.size - cacheStat.copy() - } - } - - def close(): Unit = { - try { - connectionMap.synchronized { - if (closed) - return - closed = true - housekeepingThread.interrupt() - housekeepingThread = null - HBaseConnectionCache.performHousekeeping(true) - } - } catch { - case e: Exception => logWarning("Error in finalHouseKeeping", e) - } - } - - def performHousekeeping(forceClean: Boolean) = { - val tsNow: Long = System.currentTimeMillis() - connectionMap.synchronized { - connectionMap.foreach { - x => { - if(x._2.refCount < 0) { - logError(s"Bug to be fixed: negative refCount of connection ${x._2}") - } - - if(forceClean || ((x._2.refCount <= 0) && (tsNow - x._2.timestamp > timeout))) { - try{ - x._2.connection.close() - } catch { - case e: IOException => logWarning(s"Fail to close connection ${x._2}", e) - } - connectionMap.remove(x._1) - } - } - } - } - } - - // For testing purpose only - def getConnection(key: HBaseConnectionKey, conn: => Connection): SmartConnection = { - connectionMap.synchronized { - if (closed) - return null - cacheStat.numTotalRequests += 1 - val sc = connectionMap.getOrElseUpdate(key, {cacheStat.numActualConnectionsCreated += 1 - new SmartConnection(conn)}) - sc.refCount += 1 - sc - } - } - - def getConnection(conf: Configuration): SmartConnection = - getConnection(new HBaseConnectionKey(conf), ConnectionFactory.createConnection(conf)) - - // For testing purpose only - def setTimeout(to: Long): Unit = { - connectionMap.synchronized { - if (closed) - return - timeout = to - housekeepingThread.interrupt() - } - } -} - -private[hbase] case class SmartConnection ( - connection: Connection, var refCount: Int = 0, var timestamp: Long = 0) { - def getTable(tableName: TableName): Table = connection.getTable(tableName) - def getRegionLocator(tableName: TableName): RegionLocator = connection.getRegionLocator(tableName) - def isClosed: Boolean = connection.isClosed - def getAdmin: Admin = connection.getAdmin - def close() = { - HBaseConnectionCache.connectionMap.synchronized { - refCount -= 1 - if(refCount <= 0) - timestamp = System.currentTimeMillis() - } - } -} - -/** - * Denotes a unique key to an HBase Connection instance. - * Please refer to 'org.apache.hadoop.hbase.client.HConnectionKey'. - * - * In essence, this class captures the properties in Configuration - * that may be used in the process of establishing a connection. 
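For reference, a minimal sketch of how callers obtain and release cached connections; the table name is hypothetical. Note that close() only decrements the reference count, and the housekeeping thread closes idle connections later.

  import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

  val conf = HBaseConfiguration.create()
  val smartConn = HBaseConnectionCache.getConnection(conf)
  try {
    val table = smartConn.getTable(TableName.valueOf("my_table"))
    // ... use the table ...
    table.close()
  } finally {
    smartConn.close()   // decrements refCount; the underlying Connection stays cached
  }
  val stat = HBaseConnectionCache.getStat   // numTotalRequests, numActualConnectionsCreated, ...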
- * - */ -class HBaseConnectionKey(c: Configuration) extends Logging { - val conf: Configuration = c - val CONNECTION_PROPERTIES: Array[String] = Array[String]( - HConstants.ZOOKEEPER_QUORUM, - HConstants.ZOOKEEPER_ZNODE_PARENT, - HConstants.ZOOKEEPER_CLIENT_PORT, - HConstants.ZOOKEEPER_RECOVERABLE_WAITTIME, - HConstants.HBASE_CLIENT_PAUSE, - HConstants.HBASE_CLIENT_RETRIES_NUMBER, - HConstants.HBASE_RPC_TIMEOUT_KEY, - HConstants.HBASE_META_SCANNER_CACHING, - HConstants.HBASE_CLIENT_INSTANCE_ID, - HConstants.RPC_CODEC_CONF_KEY, - HConstants.USE_META_REPLICAS, - RpcControllerFactory.CUSTOM_CONTROLLER_CONF_KEY) - - var username: String = _ - var m_properties = mutable.HashMap.empty[String, String] - if (conf != null) { - for (property <- CONNECTION_PROPERTIES) { - val value: String = conf.get(property) - if (value != null) { - m_properties.+=((property, value)) - } - } - try { - val provider: UserProvider = UserProvider.instantiate(conf) - val currentUser: User = provider.getCurrent - if (currentUser != null) { - username = currentUser.getName - } - } - catch { - case e: IOException => { - logWarning("Error obtaining current user, skipping username in HBaseConnectionKey", e) - } - } - } - - // make 'properties' immutable - val properties = m_properties.toMap - - override def hashCode: Int = { - val prime: Int = 31 - var result: Int = 1 - if (username != null) { - result = username.hashCode - } - for (property <- CONNECTION_PROPERTIES) { - val value: Option[String] = properties.get(property) - if (value.isDefined) { - result = prime * result + value.hashCode - } - } - result - } - - override def equals(obj: Any): Boolean = { - if (obj == null) return false - if (getClass ne obj.getClass) return false - val that: HBaseConnectionKey = obj.asInstanceOf[HBaseConnectionKey] - if (this.username != null && !(this.username == that.username)) { - return false - } - else if (this.username == null && that.username != null) { - return false - } - if (this.properties == null) { - if (that.properties != null) { - return false - } - } - else { - if (that.properties == null) { - return false - } - var flag: Boolean = true - for (property <- CONNECTION_PROPERTIES) { - val thisValue: Option[String] = this.properties.get(property) - val thatValue: Option[String] = that.properties.get(property) - flag = true - if (thisValue eq thatValue) { - flag = false //continue, so make flag be false - } - if (flag && (thisValue == null || !(thisValue == thatValue))) { - return false - } - } - } - true - } - - override def toString: String = { - "HBaseConnectionKey{" + "properties=" + properties + ", username='" + username + '\'' + '}' - } -} - -/** - * To log the state of 'HBaseConnectionCache' - * - * @param numTotalRequests number of total connection requests to the cache - * @param numActualConnectionsCreated number of actual HBase connections the cache ever created - * @param numActiveConnections number of current alive HBase connections the cache is holding - */ -case class HBaseConnectionCacheStat(var numTotalRequests: Long, - var numActualConnectionsCreated: Long, - var numActiveConnections: Long) - - diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseContext.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseContext.scala deleted file mode 100644 index eb0d6835d69..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseContext.scala +++ /dev/null @@ -1,1115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.net.InetSocketAddress -import java.util -import java.util.UUID -import javax.management.openmbean.KeyAlreadyExistsException - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.fs.HFileSystem -import org.apache.hadoop.hbase._ -import org.apache.hadoop.hbase.io.compress.Compression -import org.apache.hadoop.hbase.io.compress.Compression.Algorithm -import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding -import org.apache.hadoop.hbase.io.hfile.{HFile, CacheConfig, HFileContextBuilder, HFileWriterImpl} -import org.apache.hadoop.hbase.regionserver.{HStore, HStoreFile, StoreFileWriter, BloomType} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.rdd.RDD -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.hadoop.hbase.client._ -import scala.reflect.ClassTag -import org.apache.spark.{Logging, SerializableWritable, SparkContext} -import org.apache.hadoop.hbase.mapreduce.{TableMapReduceUtil, -TableInputFormat, IdentityTableMapper} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.streaming.dstream.DStream -import java.io._ -import org.apache.hadoop.security.UserGroupInformation -import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod -import org.apache.hadoop.fs.{Path, FileAlreadyExistsException, FileSystem} -import scala.collection.mutable - -/** - * HBaseContext is a façade for HBase operations - * like bulk put, get, increment, delete, and scan - * - * HBaseContext will take the responsibilities - * of disseminating the configuration information - * to the working and managing the life cycle of Connections. 
- */ -@InterfaceAudience.Public -class HBaseContext(@transient sc: SparkContext, - @transient val config: Configuration, - val tmpHdfsConfgFile: String = null) - extends Serializable with Logging { - - @transient var credentials = SparkHadoopUtil.get.getCurrentUserCredentials() - @transient var tmpHdfsConfiguration:Configuration = config - @transient var appliedCredentials = false - @transient val job = Job.getInstance(config) - TableMapReduceUtil.initCredentials(job) - val broadcastedConf = sc.broadcast(new SerializableWritable(config)) - val credentialsConf = sc.broadcast(new SerializableWritable(job.getCredentials)) - - LatestHBaseContextCache.latest = this - - if (tmpHdfsConfgFile != null && config != null) { - val fs = FileSystem.newInstance(config) - val tmpPath = new Path(tmpHdfsConfgFile) - if (!fs.exists(tmpPath)) { - val outputStream = fs.create(tmpPath) - config.write(outputStream) - outputStream.close() - } else { - logWarning("tmpHdfsConfigDir " + tmpHdfsConfgFile + " exist!!") - } - } - - /** - * A simple enrichment of the traditional Spark RDD foreachPartition. - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * @param rdd Original RDD with data to iterate over - * @param f Function to be given a iterator to iterate through - * the RDD values and a Connection object to interact - * with HBase - */ - def foreachPartition[T](rdd: RDD[T], - f: (Iterator[T], Connection) => Unit):Unit = { - rdd.foreachPartition( - it => hbaseForeachPartition(broadcastedConf, it, f)) - } - - /** - * A simple enrichment of the traditional Spark Streaming dStream foreach - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * @param dstream Original DStream with data to iterate over - * @param f Function to be given a iterator to iterate through - * the DStream values and a Connection object to - * interact with HBase - */ - def foreachPartition[T](dstream: DStream[T], - f: (Iterator[T], Connection) => Unit):Unit = { - dstream.foreachRDD((rdd, time) => { - foreachPartition(rdd, f) - }) - } - - /** - * A simple enrichment of the traditional Spark RDD mapPartition. - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * @param rdd Original RDD with data to iterate over - * @param mp Function to be given a iterator to iterate through - * the RDD values and a Connection object to interact - * with HBase - * @return Returns a new RDD generated by the user definition - * function just like normal mapPartition - */ - def mapPartitions[T, R: ClassTag](rdd: RDD[T], - mp: (Iterator[T], Connection) => Iterator[R]): RDD[R] = { - - rdd.mapPartitions[R](it => hbaseMapPartition[T, R](broadcastedConf, - it, - mp)) - - } - - /** - * A simple enrichment of the traditional Spark Streaming DStream - * foreachPartition. - * - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. 
All Connection - * management is handled outside this method - * - * Note: Make sure to partition correctly to avoid memory issue when - * getting data from HBase - * - * @param dstream Original DStream with data to iterate over - * @param f Function to be given a iterator to iterate through - * the DStream values and a Connection object to - * interact with HBase - * @return Returns a new DStream generated by the user - * definition function just like normal mapPartition - */ - def streamForeachPartition[T](dstream: DStream[T], - f: (Iterator[T], Connection) => Unit): Unit = { - - dstream.foreachRDD(rdd => this.foreachPartition(rdd, f)) - } - - /** - * A simple enrichment of the traditional Spark Streaming DStream - * mapPartition. - * - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * Note: Make sure to partition correctly to avoid memory issue when - * getting data from HBase - * - * @param dstream Original DStream with data to iterate over - * @param f Function to be given a iterator to iterate through - * the DStream values and a Connection object to - * interact with HBase - * @return Returns a new DStream generated by the user - * definition function just like normal mapPartition - */ - def streamMapPartitions[T, U: ClassTag](dstream: DStream[T], - f: (Iterator[T], Connection) => Iterator[U]): - DStream[U] = { - dstream.mapPartitions(it => hbaseMapPartition[T, U]( - broadcastedConf, - it, - f)) - } - - /** - * A simple abstraction over the HBaseContext.foreachPartition method. - * - * It allow addition support for a user to take RDD - * and generate puts and send them to HBase. - * The complexity of managing the Connection is - * removed from the developer - * - * @param rdd Original RDD with data to iterate over - * @param tableName The name of the table to put into - * @param f Function to convert a value in the RDD to a HBase Put - */ - def bulkPut[T](rdd: RDD[T], tableName: TableName, f: (T) => Put) { - - val tName = tableName.getName - rdd.foreachPartition( - it => hbaseForeachPartition[T]( - broadcastedConf, - it, - (iterator, connection) => { - val m = connection.getBufferedMutator(TableName.valueOf(tName)) - iterator.foreach(T => m.mutate(f(T))) - m.flush() - m.close() - })) - } - - def applyCreds[T] (){ - credentials = SparkHadoopUtil.get.getCurrentUserCredentials() - - logDebug("appliedCredentials:" + appliedCredentials + ",credentials:" + credentials) - - if (!appliedCredentials && credentials != null) { - appliedCredentials = true - - @transient val ugi = UserGroupInformation.getCurrentUser - ugi.addCredentials(credentials) - // specify that this is a proxy user - ugi.setAuthenticationMethod(AuthenticationMethod.PROXY) - - ugi.addCredentials(credentialsConf.value.value) - } - } - - /** - * A simple abstraction over the HBaseContext.streamMapPartition method. - * - * It allow addition support for a user to take a DStream and - * generate puts and send them to HBase. 
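For reference, a minimal sketch of the bulkPut path shown above. The application name, table name and column layout are hypothetical.

  import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
  import org.apache.hadoop.hbase.client.Put
  import org.apache.hadoop.hbase.util.Bytes
  import org.apache.spark.{SparkConf, SparkContext}

  val sc = new SparkContext(new SparkConf().setAppName("bulkPut-sketch"))
  val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())

  val rdd = sc.parallelize(Seq(("row1", "v1"), ("row2", "v2")))
  hbaseContext.bulkPut[(String, String)](rdd, TableName.valueOf("my_table"),
    { case (rowKey, value) =>
      new Put(Bytes.toBytes(rowKey))
        .addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes(value))
    })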
- * - * The complexity of managing the Connection is - * removed from the developer - * - * @param dstream Original DStream with data to iterate over - * @param tableName The name of the table to put into - * @param f Function to convert a value in - * the DStream to a HBase Put - */ - def streamBulkPut[T](dstream: DStream[T], - tableName: TableName, - f: (T) => Put) = { - val tName = tableName.getName - dstream.foreachRDD((rdd, time) => { - bulkPut(rdd, TableName.valueOf(tName), f) - }) - } - - /** - * A simple abstraction over the HBaseContext.foreachPartition method. - * - * It allow addition support for a user to take a RDD and generate delete - * and send them to HBase. The complexity of managing the Connection is - * removed from the developer - * - * @param rdd Original RDD with data to iterate over - * @param tableName The name of the table to delete from - * @param f Function to convert a value in the RDD to a - * HBase Deletes - * @param batchSize The number of delete to batch before sending to HBase - */ - def bulkDelete[T](rdd: RDD[T], tableName: TableName, - f: (T) => Delete, batchSize: Integer) { - bulkMutation(rdd, tableName, f, batchSize) - } - - /** - * A simple abstraction over the HBaseContext.streamBulkMutation method. - * - * It allow addition support for a user to take a DStream and - * generate Delete and send them to HBase. - * - * The complexity of managing the Connection is - * removed from the developer - * - * @param dstream Original DStream with data to iterate over - * @param tableName The name of the table to delete from - * @param f function to convert a value in the DStream to a - * HBase Delete - * @param batchSize The number of deletes to batch before sending to HBase - */ - def streamBulkDelete[T](dstream: DStream[T], - tableName: TableName, - f: (T) => Delete, - batchSize: Integer) = { - streamBulkMutation(dstream, tableName, f, batchSize) - } - - /** - * Under lining function to support all bulk mutations - * - * May be opened up if requested - */ - private def bulkMutation[T](rdd: RDD[T], tableName: TableName, - f: (T) => Mutation, batchSize: Integer) { - - val tName = tableName.getName - rdd.foreachPartition( - it => hbaseForeachPartition[T]( - broadcastedConf, - it, - (iterator, connection) => { - val table = connection.getTable(TableName.valueOf(tName)) - val mutationList = new java.util.ArrayList[Mutation] - iterator.foreach(T => { - mutationList.add(f(T)) - if (mutationList.size >= batchSize) { - table.batch(mutationList, null) - mutationList.clear() - } - }) - if (mutationList.size() > 0) { - table.batch(mutationList, null) - mutationList.clear() - } - table.close() - })) - } - - /** - * Under lining function to support all bulk streaming mutations - * - * May be opened up if requested - */ - private def streamBulkMutation[T](dstream: DStream[T], - tableName: TableName, - f: (T) => Mutation, - batchSize: Integer) = { - val tName = tableName.getName - dstream.foreachRDD((rdd, time) => { - bulkMutation(rdd, TableName.valueOf(tName), f, batchSize) - }) - } - - /** - * A simple abstraction over the HBaseContext.mapPartition method. 
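Similarly, a short sketch of bulkDelete, reusing the SparkContext and HBaseContext from the sketch above; the row keys are hypothetical and batchSize controls how many Deletes are grouped per Table.batch call.

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.client.Delete
  import org.apache.hadoop.hbase.util.Bytes

  val keysToRemove = sc.parallelize(Seq("row1", "row2", "row3"))
  hbaseContext.bulkDelete[String](keysToRemove, TableName.valueOf("my_table"),
    rowKey => new Delete(Bytes.toBytes(rowKey)), 2)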
- * - * It allow addition support for a user to take a RDD and generates a - * new RDD based on Gets and the results they bring back from HBase - * - * @param rdd Original RDD with data to iterate over - * @param tableName The name of the table to get from - * @param makeGet function to convert a value in the RDD to a - * HBase Get - * @param convertResult This will convert the HBase Result object to - * what ever the user wants to put in the resulting - * RDD - * return new RDD that is created by the Get to HBase - */ - def bulkGet[T, U: ClassTag](tableName: TableName, - batchSize: Integer, - rdd: RDD[T], - makeGet: (T) => Get, - convertResult: (Result) => U): RDD[U] = { - - val getMapPartition = new GetMapPartition(tableName, - batchSize, - makeGet, - convertResult) - - rdd.mapPartitions[U](it => - hbaseMapPartition[T, U]( - broadcastedConf, - it, - getMapPartition.run)) - } - - /** - * A simple abstraction over the HBaseContext.streamMap method. - * - * It allow addition support for a user to take a DStream and - * generates a new DStream based on Gets and the results - * they bring back from HBase - * - * @param tableName The name of the table to get from - * @param batchSize The number of Gets to be sent in a single batch - * @param dStream Original DStream with data to iterate over - * @param makeGet Function to convert a value in the DStream to a - * HBase Get - * @param convertResult This will convert the HBase Result object to - * what ever the user wants to put in the resulting - * DStream - * @return A new DStream that is created by the Get to HBase - */ - def streamBulkGet[T, U: ClassTag](tableName: TableName, - batchSize: Integer, - dStream: DStream[T], - makeGet: (T) => Get, - convertResult: (Result) => U): DStream[U] = { - - val getMapPartition = new GetMapPartition(tableName, - batchSize, - makeGet, - convertResult) - - dStream.mapPartitions[U](it => hbaseMapPartition[T, U]( - broadcastedConf, - it, - getMapPartition.run)) - } - - /** - * This function will use the native HBase TableInputFormat with the - * given scan object to generate a new RDD - * - * @param tableName the name of the table to scan - * @param scan the HBase scan object to use to read data from HBase - * @param f function to convert a Result object from HBase into - * what the user wants in the final generated RDD - * @return new RDD with results from scan - */ - def hbaseRDD[U: ClassTag](tableName: TableName, scan: Scan, - f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = { - - val job: Job = Job.getInstance(getConf(broadcastedConf)) - - TableMapReduceUtil.initCredentials(job) - TableMapReduceUtil.initTableMapperJob(tableName, scan, - classOf[IdentityTableMapper], null, null, job) - - val jconf = new JobConf(job.getConfiguration) - SparkHadoopUtil.get.addCredentials(jconf) - new NewHBaseRDD(sc, - classOf[TableInputFormat], - classOf[ImmutableBytesWritable], - classOf[Result], - job.getConfiguration, - this).map(f) - } - - /** - * A overloaded version of HBaseContext hbaseRDD that defines the - * type of the resulting RDD - * - * @param tableName the name of the table to scan - * @param scans the HBase scan object to use to read data from HBase - * @return New RDD with results from scan - * - */ - def hbaseRDD(tableName: TableName, scans: Scan): - RDD[(ImmutableBytesWritable, Result)] = { - - hbaseRDD[(ImmutableBytesWritable, Result)]( - tableName, - scans, - (r: (ImmutableBytesWritable, Result)) => r) - } - - /** - * underlining wrapper all foreach functions in HBaseContext - */ - private def 
hbaseForeachPartition[T](configBroadcast: - Broadcast[SerializableWritable[Configuration]], - it: Iterator[T], - f: (Iterator[T], Connection) => Unit) = { - - val config = getConf(configBroadcast) - - applyCreds - // specify that this is a proxy user - val smartConn = HBaseConnectionCache.getConnection(config) - f(it, smartConn.connection) - smartConn.close() - } - - private def getConf(configBroadcast: Broadcast[SerializableWritable[Configuration]]): - Configuration = { - - if (tmpHdfsConfiguration == null && tmpHdfsConfgFile != null) { - val fs = FileSystem.newInstance(SparkHadoopUtil.get.conf) - val inputStream = fs.open(new Path(tmpHdfsConfgFile)) - tmpHdfsConfiguration = new Configuration(false) - tmpHdfsConfiguration.readFields(inputStream) - inputStream.close() - } - - if (tmpHdfsConfiguration == null) { - try { - tmpHdfsConfiguration = configBroadcast.value.value - } catch { - case ex: Exception => logError("Unable to getConfig from broadcast", ex) - } - } - tmpHdfsConfiguration - } - - /** - * underlining wrapper all mapPartition functions in HBaseContext - * - */ - private def hbaseMapPartition[K, U]( - configBroadcast: - Broadcast[SerializableWritable[Configuration]], - it: Iterator[K], - mp: (Iterator[K], Connection) => - Iterator[U]): Iterator[U] = { - - val config = getConf(configBroadcast) - applyCreds - - val smartConn = HBaseConnectionCache.getConnection(config) - val res = mp(it, smartConn.connection) - smartConn.close() - res - } - - /** - * underlining wrapper all get mapPartition functions in HBaseContext - */ - private class GetMapPartition[T, U](tableName: TableName, - batchSize: Integer, - makeGet: (T) => Get, - convertResult: (Result) => U) - extends Serializable { - - val tName = tableName.getName - - def run(iterator: Iterator[T], connection: Connection): Iterator[U] = { - val table = connection.getTable(TableName.valueOf(tName)) - - val gets = new java.util.ArrayList[Get]() - var res = List[U]() - - while (iterator.hasNext) { - gets.add(makeGet(iterator.next())) - - if (gets.size() == batchSize) { - val results = table.get(gets) - res = res ++ results.map(convertResult) - gets.clear() - } - } - if (gets.size() > 0) { - val results = table.get(gets) - res = res ++ results.map(convertResult) - gets.clear() - } - table.close() - res.iterator - } - } - - /** - * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. - * - * This method is used to keep ClassTags out of the external Java API, as - * the Java compiler cannot produce them automatically. While this - * ClassTag-faking does please the compiler, it can cause problems at runtime - * if the Scala API relies on ClassTags for correctness. - * - * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, - * just worse performance or security issues. - * For instance, an Array of AnyRef can hold any type T, but may lose primitive - * specialization. - */ - private[spark] - def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] - - /** - * Spark Implementation of HBase Bulk load for wide rows or when - * values are not already combined at the time of the map process - * - * This will take the content from an existing RDD then sort and shuffle - * it with respect to region splits. The result of that sort and shuffle - * will be written to HFiles. - * - * After this function is executed the user will have to call - * LoadIncrementalHFiles.doBulkLoad(...) 
to move the files into HBase - * - * Also note this version of bulk load is different from past versions in - * that it includes the qualifier as part of the sort process. The - * reason for this is to be able to support rows will very large number - * of columns. - * - * @param rdd The RDD we are bulk loading from - * @param tableName The HBase table we are loading into - * @param flatMap A flapMap function that will make every - * row in the RDD - * into N cells for the bulk load - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - * @tparam T The Type of values in the original RDD - */ - def bulkLoad[T](rdd:RDD[T], - tableName: TableName, - flatMap: (T) => Iterator[(KeyFamilyQualifier, Array[Byte])], - stagingDir:String, - familyHFileWriteOptionsMap: - util.Map[Array[Byte], FamilyHFileWriteOptions] = - new util.HashMap[Array[Byte], FamilyHFileWriteOptions], - compactionExclude: Boolean = false, - maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE): - Unit = { - val stagingPath = new Path(stagingDir) - val fs = stagingPath.getFileSystem(config) - if (fs.exists(stagingPath)) { - throw new FileAlreadyExistsException("Path " + stagingDir + " already exists") - } - val conn = HBaseConnectionCache.getConnection(config) - val regionLocator = conn.getRegionLocator(tableName) - val startKeys = regionLocator.getStartKeys - if (startKeys.length == 0) { - logInfo("Table " + tableName.toString + " was not found") - } - val defaultCompressionStr = config.get("hfile.compression", - Compression.Algorithm.NONE.getName) - val hfileCompression = HFileWriterImpl - .compressionByName(defaultCompressionStr) - val nowTimeStamp = System.currentTimeMillis() - val tableRawName = tableName.getName - - val familyHFileWriteOptionsMapInternal = - new util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions] - - val entrySetIt = familyHFileWriteOptionsMap.entrySet().iterator() - - while (entrySetIt.hasNext) { - val entry = entrySetIt.next() - familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(entry.getKey), entry.getValue) - } - - val regionSplitPartitioner = - new BulkLoadPartitioner(startKeys) - - //This is where all the magic happens - //Here we are going to do the following things - // 1. FlapMap every row in the RDD into key column value tuples - // 2. Then we are going to repartition sort and shuffle - // 3. Finally we are going to write out our HFiles - rdd.flatMap( r => flatMap(r)). - repartitionAndSortWithinPartitions(regionSplitPartitioner). 
- hbaseForeachPartition(this, (it, conn) => { - - val conf = broadcastedConf.value.value - val fs = FileSystem.get(conf) - val writerMap = new mutable.HashMap[ByteArrayWrapper, WriterLength] - var previousRow:Array[Byte] = HConstants.EMPTY_BYTE_ARRAY - var rollOverRequested = false - val localTableName = TableName.valueOf(tableRawName) - - //Here is where we finally iterate through the data in this partition of the - //RDD that has been sorted and partitioned - it.foreach{ case (keyFamilyQualifier, cellValue:Array[Byte]) => - - val wl = writeValueToHFile(keyFamilyQualifier.rowKey, - keyFamilyQualifier.family, - keyFamilyQualifier.qualifier, - cellValue, - nowTimeStamp, - fs, - conn, - localTableName, - conf, - familyHFileWriteOptionsMapInternal, - hfileCompression, - writerMap, - stagingDir) - - rollOverRequested = rollOverRequested || wl.written > maxSize - - //This will only roll if we have at least one column family file that is - //bigger then maxSize and we have finished a given row key - if (rollOverRequested && Bytes.compareTo(previousRow, keyFamilyQualifier.rowKey) != 0) { - rollWriters(fs, writerMap, - regionSplitPartitioner, - previousRow, - compactionExclude) - rollOverRequested = false - } - - previousRow = keyFamilyQualifier.rowKey - } - //We have finished all the data so lets close up the writers - rollWriters(fs, writerMap, - regionSplitPartitioner, - previousRow, - compactionExclude) - rollOverRequested = false - }) - } - - /** - * Spark Implementation of HBase Bulk load for short rows some where less then - * a 1000 columns. This bulk load should be faster for tables will thinner - * rows then the other spark implementation of bulk load that puts only one - * value into a record going into a shuffle - * - * This will take the content from an existing RDD then sort and shuffle - * it with respect to region splits. The result of that sort and shuffle - * will be written to HFiles. - * - * After this function is executed the user will have to call - * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase - * - * In this implementation, only the rowKey is given to the shuffle as the key - * and all the columns are already linked to the RowKey before the shuffle - * stage. The sorting of the qualifier is done in memory out side of the - * shuffle stage - * - * Also make sure that incoming RDDs only have one record for every row key. 
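For reference, a sketch of driving the wide-row bulkLoad shown above, assuming the KeyFamilyQualifier(rowKey, family, qualifier) constructor from this package and reusing the SparkContext and HBaseContext from the earlier sketch. The staging directory and table name are hypothetical, and the produced HFiles still have to be handed to LoadIncrementalHFiles.doBulkLoad(...) afterwards, as the scaladoc notes.

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.util.Bytes

  val cells = sc.parallelize(Seq(
    ("row1", "cf", "q1", "v1"),
    ("row1", "cf", "q2", "v2"),
    ("row2", "cf", "q1", "v3")))

  hbaseContext.bulkLoad[(String, String, String, String)](cells,
    TableName.valueOf("my_table"),
    { case (rowKey, family, qualifier, value) =>
      val kfq = new KeyFamilyQualifier(
        Bytes.toBytes(rowKey), Bytes.toBytes(family), Bytes.toBytes(qualifier))
      Iterator((kfq, Bytes.toBytes(value)))
    },
    "/tmp/hbase-staging")   // hypothetical staging dir; must not already exist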
- * - * @param rdd The RDD we are bulk loading from - * @param tableName The HBase table we are loading into - * @param mapFunction A function that will convert the RDD records to - * the key value format used for the shuffle to prep - * for writing to the bulk loaded HFiles - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - * @tparam T The Type of values in the original RDD - */ - def bulkLoadThinRows[T](rdd:RDD[T], - tableName: TableName, - mapFunction: (T) => - (ByteArrayWrapper, FamiliesQualifiersValues), - stagingDir:String, - familyHFileWriteOptionsMap: - util.Map[Array[Byte], FamilyHFileWriteOptions] = - new util.HashMap[Array[Byte], FamilyHFileWriteOptions], - compactionExclude: Boolean = false, - maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE): - Unit = { - val stagingPath = new Path(stagingDir) - val fs = stagingPath.getFileSystem(config) - if (fs.exists(stagingPath)) { - throw new FileAlreadyExistsException("Path " + stagingDir + " already exists") - } - val conn = HBaseConnectionCache.getConnection(config) - val regionLocator = conn.getRegionLocator(tableName) - val startKeys = regionLocator.getStartKeys - if (startKeys.length == 0) { - logInfo("Table " + tableName.toString + " was not found") - } - val defaultCompressionStr = config.get("hfile.compression", - Compression.Algorithm.NONE.getName) - val defaultCompression = HFileWriterImpl - .compressionByName(defaultCompressionStr) - val nowTimeStamp = System.currentTimeMillis() - val tableRawName = tableName.getName - - val familyHFileWriteOptionsMapInternal = - new util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions] - - val entrySetIt = familyHFileWriteOptionsMap.entrySet().iterator() - - while (entrySetIt.hasNext) { - val entry = entrySetIt.next() - familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(entry.getKey), entry.getValue) - } - - val regionSplitPartitioner = - new BulkLoadPartitioner(startKeys) - - //This is where all the magic happens - //Here we are going to do the following things - // 1. FlapMap every row in the RDD into key column value tuples - // 2. Then we are going to repartition sort and shuffle - // 3. Finally we are going to write out our HFiles - rdd.map( r => mapFunction(r)). - repartitionAndSortWithinPartitions(regionSplitPartitioner). 
- hbaseForeachPartition(this, (it, conn) => { - - val conf = broadcastedConf.value.value - val fs = FileSystem.get(conf) - val writerMap = new mutable.HashMap[ByteArrayWrapper, WriterLength] - var previousRow:Array[Byte] = HConstants.EMPTY_BYTE_ARRAY - var rollOverRequested = false - val localTableName = TableName.valueOf(tableRawName) - - //Here is where we finally iterate through the data in this partition of the - //RDD that has been sorted and partitioned - it.foreach{ case (rowKey:ByteArrayWrapper, - familiesQualifiersValues:FamiliesQualifiersValues) => - - - if (Bytes.compareTo(previousRow, rowKey.value) == 0) { - throw new KeyAlreadyExistsException("The following key was sent to the " + - "HFile load more then one: " + Bytes.toString(previousRow)) - } - - //The family map is a tree map so the families will be sorted - val familyIt = familiesQualifiersValues.familyMap.entrySet().iterator() - while (familyIt.hasNext) { - val familyEntry = familyIt.next() - - val family = familyEntry.getKey.value - - val qualifierIt = familyEntry.getValue.entrySet().iterator() - - //The qualifier map is a tree map so the families will be sorted - while (qualifierIt.hasNext) { - - val qualifierEntry = qualifierIt.next() - val qualifier = qualifierEntry.getKey - val cellValue = qualifierEntry.getValue - - writeValueToHFile(rowKey.value, - family, - qualifier.value, // qualifier - cellValue, // value - nowTimeStamp, - fs, - conn, - localTableName, - conf, - familyHFileWriteOptionsMapInternal, - defaultCompression, - writerMap, - stagingDir) - - previousRow = rowKey.value - } - - writerMap.values.foreach( wl => { - rollOverRequested = rollOverRequested || wl.written > maxSize - - //This will only roll if we have at least one column family file that is - //bigger then maxSize and we have finished a given row key - if (rollOverRequested) { - rollWriters(fs, writerMap, - regionSplitPartitioner, - previousRow, - compactionExclude) - rollOverRequested = false - } - }) - } - } - - //This will get a writer for the column family - //If there is no writer for a given column family then - //it will get created here. 
- //We have finished all the data so lets close up the writers - rollWriters(fs, writerMap, - regionSplitPartitioner, - previousRow, - compactionExclude) - rollOverRequested = false - }) - } - - /** - * This will return a new HFile writer when requested - * - * @param family column family - * @param conf configuration to connect to HBase - * @param favoredNodes nodes that we would like to write too - * @param fs FileSystem object where we will be writing the HFiles to - * @return WriterLength object - */ - private def getNewHFileWriter(family: Array[Byte], conf: Configuration, - favoredNodes: Array[InetSocketAddress], - fs:FileSystem, - familydir:Path, - familyHFileWriteOptionsMapInternal: - util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions], - defaultCompression:Compression.Algorithm): WriterLength = { - - - var familyOptions = familyHFileWriteOptionsMapInternal.get(new ByteArrayWrapper(family)) - - if (familyOptions == null) { - familyOptions = new FamilyHFileWriteOptions(defaultCompression.toString, - BloomType.NONE.toString, HConstants.DEFAULT_BLOCKSIZE, DataBlockEncoding.NONE.toString) - familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(family), familyOptions) - } - - val tempConf = new Configuration(conf) - tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f) - val contextBuilder = new HFileContextBuilder() - .withCompression(Algorithm.valueOf(familyOptions.compression)) - .withChecksumType(HStore.getChecksumType(conf)) - .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)) - .withBlockSize(familyOptions.blockSize) - - if (HFile.getFormatVersion(conf) >= HFile.MIN_FORMAT_VERSION_WITH_TAGS) { - contextBuilder.withIncludesTags(true) - } - - contextBuilder.withDataBlockEncoding(DataBlockEncoding. - valueOf(familyOptions.dataBlockEncoding)) - val hFileContext = contextBuilder.build() - - //Add a '_' to the file name because this is a unfinished file. A rename will happen - // to remove the '_' when the file is closed. 
- new WriterLength(0, - new StoreFileWriter.Builder(conf, new CacheConfig(tempConf), new HFileSystem(fs)) - .withBloomType(BloomType.valueOf(familyOptions.bloomType)) - .withComparator(CellComparator.getInstance()).withFileContext(hFileContext) - .withFilePath(new Path(familydir, "_" + UUID.randomUUID.toString.replaceAll("-", ""))) - .withFavoredNodes(favoredNodes).build()) - - } - - /** - * Encompasses the logic to write a value to an HFile - * - * @param rowKey The RowKey for the record - * @param family HBase column family for the record - * @param qualifier HBase column qualifier for the record - * @param cellValue HBase cell value - * @param nowTimeStamp The cell time stamp - * @param fs Connection to the FileSystem for the HFile - * @param conn Connection to HBaes - * @param tableName HBase TableName object - * @param conf Configuration to be used when making a new HFile - * @param familyHFileWriteOptionsMapInternal Extra configs for the HFile - * @param hfileCompression The compression codec for the new HFile - * @param writerMap HashMap of existing writers and their offsets - * @param stagingDir The staging directory on the FileSystem to store - * the HFiles - * @return The writer for the given HFile that was writen - * too - */ - private def writeValueToHFile(rowKey: Array[Byte], - family: Array[Byte], - qualifier: Array[Byte], - cellValue:Array[Byte], - nowTimeStamp: Long, - fs: FileSystem, - conn: Connection, - tableName: TableName, - conf: Configuration, - familyHFileWriteOptionsMapInternal: - util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions], - hfileCompression:Compression.Algorithm, - writerMap:mutable.HashMap[ByteArrayWrapper, WriterLength], - stagingDir: String - ): WriterLength = { - - val wl = writerMap.getOrElseUpdate(new ByteArrayWrapper(family), { - val familyDir = new Path(stagingDir, Bytes.toString(family)) - - fs.mkdirs(familyDir) - - val loc:HRegionLocation = { - try { - val locator = - conn.getRegionLocator(tableName) - locator.getRegionLocation(rowKey) - } catch { - case e: Throwable => - logWarning("there's something wrong when locating rowkey: " + - Bytes.toString(rowKey)) - null - } - } - if (null == loc) { - if (log.isTraceEnabled) { - logTrace("failed to get region location, so use default writer: " + - Bytes.toString(rowKey)) - } - getNewHFileWriter(family = family, - conf = conf, - favoredNodes = null, - fs = fs, - familydir = familyDir, - familyHFileWriteOptionsMapInternal, - hfileCompression) - } else { - if (log.isDebugEnabled) { - logDebug("first rowkey: [" + Bytes.toString(rowKey) + "]") - } - val initialIsa = - new InetSocketAddress(loc.getHostname, loc.getPort) - if (initialIsa.isUnresolved) { - if (log.isTraceEnabled) { - logTrace("failed to resolve bind address: " + loc.getHostname + ":" - + loc.getPort + ", so use default writer") - } - getNewHFileWriter(family, - conf, - null, - fs, - familyDir, - familyHFileWriteOptionsMapInternal, - hfileCompression) - } else { - if(log.isDebugEnabled) { - logDebug("use favored nodes writer: " + initialIsa.getHostString) - } - getNewHFileWriter(family, - conf, - Array[InetSocketAddress](initialIsa), - fs, - familyDir, - familyHFileWriteOptionsMapInternal, - hfileCompression) - } - } - }) - - val keyValue =new KeyValue(rowKey, - family, - qualifier, - nowTimeStamp,cellValue) - - wl.writer.append(keyValue) - wl.written += keyValue.getLength - - wl - } - - /** - * This will roll all Writers - * @param fs Hadoop FileSystem object - * @param writerMap HashMap that contains all the writers - * @param 
regionSplitPartitioner The partitioner with knowledge of how the - * Region's are split by row key - * @param previousRow The last row to fill the HFile ending range metadata - * @param compactionExclude The exclude compaction metadata flag for the HFile - */ - private def rollWriters(fs:FileSystem, - writerMap:mutable.HashMap[ByteArrayWrapper, WriterLength], - regionSplitPartitioner: BulkLoadPartitioner, - previousRow: Array[Byte], - compactionExclude: Boolean): Unit = { - writerMap.values.foreach( wl => { - if (wl.writer != null) { - logDebug("Writer=" + wl.writer.getPath + - (if (wl.written == 0) "" else ", wrote=" + wl.written)) - closeHFileWriter(fs, wl.writer, - regionSplitPartitioner, - previousRow, - compactionExclude) - } - }) - writerMap.clear() - - } - - /** - * Function to close an HFile - * @param fs Hadoop FileSystem object - * @param w HFile Writer - * @param regionSplitPartitioner The partitioner with knowledge of how the - * Region's are split by row key - * @param previousRow The last row to fill the HFile ending range metadata - * @param compactionExclude The exclude compaction metadata flag for the HFile - */ - private def closeHFileWriter(fs:FileSystem, - w: StoreFileWriter, - regionSplitPartitioner: BulkLoadPartitioner, - previousRow: Array[Byte], - compactionExclude: Boolean): Unit = { - if (w != null) { - w.appendFileInfo(HStoreFile.BULKLOAD_TIME_KEY, - Bytes.toBytes(System.currentTimeMillis())) - w.appendFileInfo(HStoreFile.BULKLOAD_TASK_KEY, - Bytes.toBytes(regionSplitPartitioner.getPartition(previousRow))) - w.appendFileInfo(HStoreFile.MAJOR_COMPACTION_KEY, - Bytes.toBytes(true)) - w.appendFileInfo(HStoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, - Bytes.toBytes(compactionExclude)) - w.appendTrackedTimestampsToMetadata() - w.close() - - val srcPath = w.getPath - - //In the new path you will see that we are using substring. This is to - // remove the '_' character in front of the HFile name. '_' is a character - // that will tell HBase that this file shouldn't be included in the bulk load - // This feature is to protect for unfinished HFiles being submitted to HBase - val newPath = new Path(w.getPath.getParent, w.getPath.getName.substring(1)) - if (!fs.rename(srcPath, newPath)) { - throw new IOException("Unable to rename '" + srcPath + - "' to " + newPath) - } - } - } - - /** - * This is a wrapper class around StoreFileWriter. The reason for the - * wrapper is to keep the length of the file along side the writer - * - * @param written The writer to be wrapped - * @param writer The number of bytes written to the writer - */ - class WriterLength(var written:Long, val writer:StoreFileWriter) -} - -object LatestHBaseContextCache { - var latest:HBaseContext = null -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctions.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctions.scala deleted file mode 100644 index 4edde442ae5..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctions.scala +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
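As a rough sketch of the staging-file naming convention used by closeHFileWriter above (illustrative only, with made-up paths; not part of the deleted sources): an HFile is written under a '_'-prefixed name so the bulk load ignores it while unfinished, and the rename on close makes it loadable.

  import org.apache.hadoop.fs.Path

  // while open:   /staging/cf1/_3b1f5c...   (leading '_' => skipped by the bulk load)
  // after close:  /staging/cf1/3b1f5c...    (the rename drops the '_')
  val srcPath = new Path("/staging/cf1/_3b1f5c")
  val newPath = new Path(srcPath.getParent, srcPath.getName.substring(1))
  // fs.rename(srcPath, newPath) is what closeHFileWriter performs after closing the writer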
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.TableName -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.spark.streaming.dstream.DStream - -import scala.reflect.ClassTag - -/** - * HBaseDStreamFunctions contains a set of implicit functions that can be - * applied to a Spark DStream so that we can easily interact with HBase - */ -@InterfaceAudience.Public -object HBaseDStreamFunctions { - - /** - * These are implicit methods for a DStream that contains any type of - * data. - * - * @param dStream This is for dStreams of any type - * @tparam T Type T - */ - implicit class GenericHBaseDStreamFunctions[T](val dStream: DStream[T]) { - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * put. This will not return a new Stream. Think of it like a foreach - * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param f The function that will turn the DStream values - * into HBase Put objects. - */ - def hbaseBulkPut(hc: HBaseContext, - tableName: TableName, - f: (T) => Put): Unit = { - hc.streamBulkPut(dStream, tableName, f) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * get. This will return a new DStream. Think about it as a DStream map - * function. In that every DStream value will get a new value out of - * HBase. That new value will populate the newly generated DStream. - * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param batchSize How many gets to execute in a single batch - * @param f The function that will turn the RDD values - * in HBase Get objects - * @param convertResult The function that will convert a HBase - * Result object into a value that will go - * into the resulting DStream - * @tparam R The type of Object that will be coming - * out of the resulting DStream - * @return A resulting DStream with type R objects - */ - def hbaseBulkGet[R: ClassTag](hc: HBaseContext, - tableName: TableName, - batchSize:Int, f: (T) => Get, convertResult: (Result) => R): - DStream[R] = { - hc.streamBulkGet[T, R](tableName, batchSize, dStream, f, convertResult) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * get. This will return a new DStream. Think about it as a DStream map - * function. In that every DStream value will get a new value out of - * HBase. That new value will populate the newly generated DStream. 
- * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param batchSize How many gets to execute in a single batch - * @param f The function that will turn the RDD values - * in HBase Get objects - * @return A resulting DStream with type R objects - */ - def hbaseBulkGet(hc: HBaseContext, - tableName: TableName, batchSize:Int, - f: (T) => Get): DStream[(ImmutableBytesWritable, Result)] = { - hc.streamBulkGet[T, (ImmutableBytesWritable, Result)]( - tableName, batchSize, dStream, f, - result => (new ImmutableBytesWritable(result.getRow), result)) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * Delete. This will not return a new DStream. - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param tableName The tableName that the deletes will be sent to - * @param f The function that will convert the DStream value into - * a HBase Delete Object - * @param batchSize The number of Deletes to be sent in a single batch - */ - def hbaseBulkDelete(hc: HBaseContext, - tableName: TableName, - f:(T) => Delete, batchSize:Int): Unit = { - hc.streamBulkDelete(dStream, tableName, f, batchSize) - } - - /** - * Implicit method that gives easy access to HBaseContext's - * foreachPartition method. This will ack very much like a normal DStream - * foreach method but for the fact that you will now have a HBase connection - * while iterating through the values. - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param f This function will get an iterator for a Partition of an - * DStream along with a connection object to HBase - */ - def hbaseForeachPartition(hc: HBaseContext, - f: (Iterator[T], Connection) => Unit): Unit = { - hc.streamForeachPartition(dStream, f) - } - - /** - * Implicit method that gives easy access to HBaseContext's - * mapPartitions method. This will ask very much like a normal DStream - * map partitions method but for the fact that you will now have a - * HBase connection while iterating through the values - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param f This function will get an iterator for a Partition of an - * DStream along with a connection object to HBase - * @tparam R This is the type of objects that will go into the resulting - * DStream - * @return A resulting DStream of type R - */ - def hbaseMapPartitions[R: ClassTag](hc: HBaseContext, - f: (Iterator[T], Connection) => Iterator[R]): - DStream[R] = { - hc.streamMapPartitions(dStream, f) - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctions.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctions.scala deleted file mode 100644 index 2469c8e9ed2..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctions.scala +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
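A minimal usage sketch of the DStream implicits above (illustrative only; the table, family and qualifier names are made up, and dStream/hbaseContext are assumed to exist already):

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.client.{Get, Put}
  import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
  import org.apache.hadoop.hbase.util.Bytes

  // dStream: DStream[(Array[Byte], Array[Byte])] of (rowKey, value) pairs
  dStream.hbaseBulkPut(hbaseContext, TableName.valueOf("t1"),
    kv => new Put(kv._1).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), kv._2))

  // every row key in a micro-batch is looked up in HBase; the result is a DStream[String]
  val fetched = dStream.hbaseBulkGet[String](hbaseContext, TableName.valueOf("t1"), 100,
    kv => new Get(kv._1),
    result => Bytes.toString(result.value()))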
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.util - -import org.apache.hadoop.hbase.{HConstants, TableName} -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.spark.rdd.RDD - -import scala.reflect.ClassTag - -/** - * HBaseRDDFunctions contains a set of implicit functions that can be - * applied to a Spark RDD so that we can easily interact with HBase - */ -@InterfaceAudience.Public -object HBaseRDDFunctions -{ - - /** - * These are implicit methods for a RDD that contains any type of - * data. - * - * @param rdd This is for rdd of any type - * @tparam T This is any type - */ - implicit class GenericHBaseRDDFunctions[T](val rdd: RDD[T]) { - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * put. This will not return a new RDD. Think of it like a foreach - * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param f The function that will turn the RDD values - * into HBase Put objects. - */ - def hbaseBulkPut(hc: HBaseContext, - tableName: TableName, - f: (T) => Put): Unit = { - hc.bulkPut(rdd, tableName, f) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * get. This will return a new RDD. Think about it as a RDD map - * function. In that every RDD value will get a new value out of - * HBase. That new value will populate the newly generated RDD. - * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param batchSize How many gets to execute in a single batch - * @param f The function that will turn the RDD values - * in HBase Get objects - * @param convertResult The function that will convert a HBase - * Result object into a value that will go - * into the resulting RDD - * @tparam R The type of Object that will be coming - * out of the resulting RDD - * @return A resulting RDD with type R objects - */ - def hbaseBulkGet[R: ClassTag](hc: HBaseContext, - tableName: TableName, batchSize:Int, - f: (T) => Get, convertResult: (Result) => R): RDD[R] = { - hc.bulkGet[T, R](tableName, batchSize, rdd, f, convertResult) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * get. This will return a new RDD. Think about it as a RDD map - * function. In that every RDD value will get a new value out of - * HBase. That new value will populate the newly generated RDD. 
- * - * @param hc The hbaseContext object to identify which - * HBase cluster connection to use - * @param tableName The tableName that the put will be sent to - * @param batchSize How many gets to execute in a single batch - * @param f The function that will turn the RDD values - * in HBase Get objects - * @return A resulting RDD with type R objects - */ - def hbaseBulkGet(hc: HBaseContext, - tableName: TableName, batchSize:Int, - f: (T) => Get): RDD[(ImmutableBytesWritable, Result)] = { - hc.bulkGet[T, (ImmutableBytesWritable, Result)](tableName, - batchSize, rdd, f, - result => if (result != null && result.getRow != null) { - (new ImmutableBytesWritable(result.getRow), result) - } else { - null - }) - } - - /** - * Implicit method that gives easy access to HBaseContext's bulk - * Delete. This will not return a new RDD. - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param tableName The tableName that the deletes will be sent to - * @param f The function that will convert the RDD value into - * a HBase Delete Object - * @param batchSize The number of Deletes to be sent in a single batch - */ - def hbaseBulkDelete(hc: HBaseContext, - tableName: TableName, f:(T) => Delete, batchSize:Int): Unit = { - hc.bulkDelete(rdd, tableName, f, batchSize) - } - - /** - * Implicit method that gives easy access to HBaseContext's - * foreachPartition method. This will ack very much like a normal RDD - * foreach method but for the fact that you will now have a HBase connection - * while iterating through the values. - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param f This function will get an iterator for a Partition of an - * RDD along with a connection object to HBase - */ - def hbaseForeachPartition(hc: HBaseContext, - f: (Iterator[T], Connection) => Unit): Unit = { - hc.foreachPartition(rdd, f) - } - - /** - * Implicit method that gives easy access to HBaseContext's - * mapPartitions method. This will ask very much like a normal RDD - * map partitions method but for the fact that you will now have a - * HBase connection while iterating through the values - * - * @param hc The hbaseContext object to identify which HBase - * cluster connection to use - * @param f This function will get an iterator for a Partition of an - * RDD along with a connection object to HBase - * @tparam R This is the type of objects that will go into the resulting - * RDD - * @return A resulting RDD of type R - */ - def hbaseMapPartitions[R: ClassTag](hc: HBaseContext, - f: (Iterator[T], Connection) => Iterator[R]): - RDD[R] = { - hc.mapPartitions[T,R](rdd, f) - } - - /** - * Spark Implementation of HBase Bulk load for wide rows or when - * values are not already combined at the time of the map process - * - * A Spark Implementation of HBase Bulk load - * - * This will take the content from an existing RDD then sort and shuffle - * it with respect to region splits. The result of that sort and shuffle - * will be written to HFiles. - * - * After this function is executed the user will have to call - * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase - * - * Also note this version of bulk load is different from past versions in - * that it includes the qualifier as part of the sort process. The - * reason for this is to be able to support rows will very large number - * of columns. 
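For example, the RDD implicits above can be used as follows (an illustrative sketch; table, family and qualifier names are made up, and rowKeyRdd/hbaseContext are assumed to exist):

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.client.{Delete, Get}
  import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
  import org.apache.hadoop.hbase.util.Bytes

  // rowKeyRdd: RDD[Array[Byte]] of row keys
  val values = rowKeyRdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf("t1"), 1000,
    rowKey => new Get(rowKey),
    result => Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"))))

  rowKeyRdd.hbaseBulkDelete(hbaseContext, TableName.valueOf("t1"),
    rowKey => new Delete(rowKey), 1000)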
- * - * @param tableName The HBase table we are loading into - * @param flatMap A flapMap function that will make every row in the RDD - * into N cells for the bulk load - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - */ - def hbaseBulkLoad(hc: HBaseContext, - tableName: TableName, - flatMap: (T) => Iterator[(KeyFamilyQualifier, Array[Byte])], - stagingDir:String, - familyHFileWriteOptionsMap: - util.Map[Array[Byte], FamilyHFileWriteOptions] = - new util.HashMap[Array[Byte], FamilyHFileWriteOptions](), - compactionExclude: Boolean = false, - maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE):Unit = { - hc.bulkLoad(rdd, tableName, - flatMap, stagingDir, familyHFileWriteOptionsMap, - compactionExclude, maxSize) - } - - /** - * Implicit method that gives easy access to HBaseContext's - * bulkLoadThinRows method. - * - * Spark Implementation of HBase Bulk load for short rows some where less then - * a 1000 columns. This bulk load should be faster for tables will thinner - * rows then the other spark implementation of bulk load that puts only one - * value into a record going into a shuffle - * - * This will take the content from an existing RDD then sort and shuffle - * it with respect to region splits. The result of that sort and shuffle - * will be written to HFiles. - * - * After this function is executed the user will have to call - * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase - * - * In this implementation only the rowKey is given to the shuffle as the key - * and all the columns are already linked to the RowKey before the shuffle - * stage. The sorting of the qualifier is done in memory out side of the - * shuffle stage - * - * @param tableName The HBase table we are loading into - * @param mapFunction A function that will convert the RDD records to - * the key value format used for the shuffle to prep - * for writing to the bulk loaded HFiles - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - */ - def hbaseBulkLoadThinRows(hc: HBaseContext, - tableName: TableName, - mapFunction: (T) => - (ByteArrayWrapper, FamiliesQualifiersValues), - stagingDir:String, - familyHFileWriteOptionsMap: - util.Map[Array[Byte], FamilyHFileWriteOptions] = - new util.HashMap[Array[Byte], FamilyHFileWriteOptions](), - compactionExclude: Boolean = false, - maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE):Unit = { - hc.bulkLoadThinRows(rdd, tableName, - mapFunction, stagingDir, familyHFileWriteOptionsMap, - compactionExclude, maxSize) - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/JavaHBaseContext.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/JavaHBaseContext.scala deleted file mode 100644 index fe4b65f66c0..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/JavaHBaseContext.scala +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
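A hedged sketch of the wide-row bulk load flow described above: each record is flat-mapped into (KeyFamilyQualifier, value) cells, HFiles land in a staging directory, and the bulk-load tool is then run. The staging path and table are made up, config/connection/hbaseContext are assumed to exist, and the LoadIncrementalHFiles package and signature vary between HBase versions.

  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.spark.KeyFamilyQualifier
  import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
  import org.apache.hadoop.hbase.util.Bytes

  val stagingDir = "/tmp/hbase-staging"
  val tableName = TableName.valueOf("t1")

  // rdd: RDD[(Array[Byte], Array[Byte])] of (rowKey, value)
  rdd.hbaseBulkLoad(hbaseContext, tableName,
    kv => Seq((new KeyFamilyQualifier(kv._1, Bytes.toBytes("cf"), Bytes.toBytes("q")), kv._2)).iterator,
    stagingDir)

  // then move the generated HFiles into the table (class location differs across HBase versions)
  val loader = new org.apache.hadoop.hbase.tool.LoadIncrementalHFiles(config)
  loader.doBulkLoad(new Path(stagingDir), connection.getAdmin,
    connection.getTable(tableName), connection.getRegionLocator(tableName))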
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.util.Map - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hbase.TableName -import org.apache.hadoop.hbase.util.Pair -import org.apache.yetus.audience.InterfaceAudience -import org.apache.hadoop.hbase.client.{Connection, Delete, Get, Put, Result, Scan} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} -import org.apache.spark.api.java.function.{FlatMapFunction, Function, VoidFunction} -import org.apache.spark.streaming.api.java.JavaDStream - -import scala.collection.JavaConversions._ -import scala.reflect.ClassTag - -/** - * This is the Java Wrapper over HBaseContext which is written in - * Scala. This class will be used by developers that want to - * work with Spark or Spark Streaming in Java - * - * @param jsc This is the JavaSparkContext that we will wrap - * @param config This is the config information to out HBase cluster - */ -@InterfaceAudience.Public -class JavaHBaseContext(@transient jsc: JavaSparkContext, - @transient config: Configuration) extends Serializable { - val hbaseContext = new HBaseContext(jsc.sc, config) - - /** - * A simple enrichment of the traditional Spark javaRdd foreachPartition. - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * @param javaRdd Original javaRdd with data to iterate over - * @param f Function to be given a iterator to iterate through - * the RDD values and a Connection object to interact - * with HBase - */ - def foreachPartition[T](javaRdd: JavaRDD[T], - f: VoidFunction[(java.util.Iterator[T], Connection)]) = { - - hbaseContext.foreachPartition(javaRdd.rdd, - (it: Iterator[T], conn: Connection) => { - f.call((it, conn)) - }) - } - - /** - * A simple enrichment of the traditional Spark Streaming dStream foreach - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * @param javaDstream Original DStream with data to iterate over - * @param f Function to be given a iterator to iterate through - * the JavaDStream values and a Connection object to - * interact with HBase - */ - def foreachPartition[T](javaDstream: JavaDStream[T], - f: VoidFunction[(Iterator[T], Connection)]) = { - hbaseContext.foreachPartition(javaDstream.dstream, - (it: Iterator[T], conn: Connection) => f.call(it, conn)) - } - - /** - * A simple enrichment of the traditional Spark JavaRDD mapPartition. 
- * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * Note: Make sure to partition correctly to avoid memory issue when - * getting data from HBase - * - * @param javaRdd Original JavaRdd with data to iterate over - * @param f Function to be given a iterator to iterate through - * the RDD values and a Connection object to interact - * with HBase - * @return Returns a new RDD generated by the user definition - * function just like normal mapPartition - */ - def mapPartitions[T, R](javaRdd: JavaRDD[T], - f: FlatMapFunction[(java.util.Iterator[T], - Connection), R]): JavaRDD[R] = { - - def fn = (it: Iterator[T], conn: Connection) => - asScalaIterator( - f.call((asJavaIterator(it), conn)).iterator() - ) - - JavaRDD.fromRDD(hbaseContext.mapPartitions(javaRdd.rdd, - (iterator: Iterator[T], connection: Connection) => - fn(iterator, connection))(fakeClassTag[R]))(fakeClassTag[R]) - } - - /** - * A simple enrichment of the traditional Spark Streaming JavaDStream - * mapPartition. - * - * This function differs from the original in that it offers the - * developer access to a already connected Connection object - * - * Note: Do not close the Connection object. All Connection - * management is handled outside this method - * - * Note: Make sure to partition correctly to avoid memory issue when - * getting data from HBase - * - * @param javaDstream Original JavaDStream with data to iterate over - * @param mp Function to be given a iterator to iterate through - * the JavaDStream values and a Connection object to - * interact with HBase - * @return Returns a new JavaDStream generated by the user - * definition function just like normal mapPartition - */ - def streamMap[T, U](javaDstream: JavaDStream[T], - mp: Function[(Iterator[T], Connection), Iterator[U]]): - JavaDStream[U] = { - JavaDStream.fromDStream(hbaseContext.streamMapPartitions(javaDstream.dstream, - (it: Iterator[T], conn: Connection) => - mp.call(it, conn))(fakeClassTag[U]))(fakeClassTag[U]) - } - - /** - * A simple abstraction over the HBaseContext.foreachPartition method. - * - * It allow addition support for a user to take JavaRDD - * and generate puts and send them to HBase. - * The complexity of managing the Connection is - * removed from the developer - * - * @param javaRdd Original JavaRDD with data to iterate over - * @param tableName The name of the table to put into - * @param f Function to convert a value in the JavaRDD - * to a HBase Put - */ - def bulkPut[T](javaRdd: JavaRDD[T], - tableName: TableName, - f: Function[(T), Put]) { - - hbaseContext.bulkPut(javaRdd.rdd, tableName, (t: T) => f.call(t)) - } - - /** - * A simple abstraction over the HBaseContext.streamMapPartition method. - * - * It allow addition support for a user to take a JavaDStream and - * generate puts and send them to HBase. 
- * - * The complexity of managing the Connection is - * removed from the developer - * - * @param javaDstream Original DStream with data to iterate over - * @param tableName The name of the table to put into - * @param f Function to convert a value in - * the JavaDStream to a HBase Put - */ - def streamBulkPut[T](javaDstream: JavaDStream[T], - tableName: TableName, - f: Function[T, Put]) = { - hbaseContext.streamBulkPut(javaDstream.dstream, - tableName, - (t: T) => f.call(t)) - } - - /** - * A simple abstraction over the HBaseContext.foreachPartition method. - * - * It allow addition support for a user to take a JavaRDD and - * generate delete and send them to HBase. - * - * The complexity of managing the Connection is - * removed from the developer - * - * @param javaRdd Original JavaRDD with data to iterate over - * @param tableName The name of the table to delete from - * @param f Function to convert a value in the JavaRDD to a - * HBase Deletes - * @param batchSize The number of deletes to batch before sending to HBase - */ - def bulkDelete[T](javaRdd: JavaRDD[T], tableName: TableName, - f: Function[T, Delete], batchSize: Integer) { - hbaseContext.bulkDelete(javaRdd.rdd, tableName, (t: T) => f.call(t), batchSize) - } - - /** - * A simple abstraction over the HBaseContext.streamBulkMutation method. - * - * It allow addition support for a user to take a JavaDStream and - * generate Delete and send them to HBase. - * - * The complexity of managing the Connection is - * removed from the developer - * - * @param javaDStream Original DStream with data to iterate over - * @param tableName The name of the table to delete from - * @param f Function to convert a value in the JavaDStream to a - * HBase Delete - * @param batchSize The number of deletes to be sent at once - */ - def streamBulkDelete[T](javaDStream: JavaDStream[T], - tableName: TableName, - f: Function[T, Delete], - batchSize: Integer) = { - hbaseContext.streamBulkDelete(javaDStream.dstream, tableName, - (t: T) => f.call(t), - batchSize) - } - - /** - * A simple abstraction over the HBaseContext.mapPartition method. - * - * It allow addition support for a user to take a JavaRDD and generates a - * new RDD based on Gets and the results they bring back from HBase - * - * @param tableName The name of the table to get from - * @param batchSize batch size of how many gets to retrieve in a single fetch - * @param javaRdd Original JavaRDD with data to iterate over - * @param makeGet Function to convert a value in the JavaRDD to a - * HBase Get - * @param convertResult This will convert the HBase Result object to - * what ever the user wants to put in the resulting - * JavaRDD - * @return New JavaRDD that is created by the Get to HBase - */ - def bulkGet[T, U](tableName: TableName, - batchSize: Integer, - javaRdd: JavaRDD[T], - makeGet: Function[T, Get], - convertResult: Function[Result, U]): JavaRDD[U] = { - - JavaRDD.fromRDD(hbaseContext.bulkGet[T, U](tableName, - batchSize, - javaRdd.rdd, - (t: T) => makeGet.call(t), - (r: Result) => { - convertResult.call(r) - })(fakeClassTag[U]))(fakeClassTag[U]) - - } - - /** - * A simple abstraction over the HBaseContext.streamMap method. 
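A small sketch of driving the Java wrapper, written here in Scala for consistency with the other examples in this section (illustrative only; sc, config and javaRdd are assumed to exist):

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.client.Put
  import org.apache.hadoop.hbase.spark.JavaHBaseContext
  import org.apache.hadoop.hbase.util.Bytes
  import org.apache.spark.api.java.JavaSparkContext
  import org.apache.spark.api.java.function.Function

  val jsc = new JavaSparkContext(sc)
  val javaHBaseContext = new JavaHBaseContext(jsc, config)

  // javaRdd: JavaRDD[Array[Byte]] of row keys; each row key is also written as the cell value
  javaHBaseContext.bulkPut(javaRdd, TableName.valueOf("t1"),
    new Function[Array[Byte], Put] {
      override def call(rowKey: Array[Byte]): Put =
        new Put(rowKey).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), rowKey)
    })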
- * - * It allow addition support for a user to take a DStream and - * generates a new DStream based on Gets and the results - * they bring back from HBase - * - * @param tableName The name of the table to get from - * @param batchSize The number of gets to be batched together - * @param javaDStream Original DStream with data to iterate over - * @param makeGet Function to convert a value in the JavaDStream to a - * HBase Get - * @param convertResult This will convert the HBase Result object to - * what ever the user wants to put in the resulting - * JavaDStream - * @return New JavaDStream that is created by the Get to HBase - */ - def streamBulkGet[T, U](tableName: TableName, - batchSize: Integer, - javaDStream: JavaDStream[T], - makeGet: Function[T, Get], - convertResult: Function[Result, U]): JavaDStream[U] = { - JavaDStream.fromDStream(hbaseContext.streamBulkGet(tableName, - batchSize, - javaDStream.dstream, - (t: T) => makeGet.call(t), - (r: Result) => convertResult.call(r))(fakeClassTag[U]))(fakeClassTag[U]) - } - - /** - * A simple abstraction over the HBaseContext.bulkLoad method. - * It allow addition support for a user to take a JavaRDD and - * convert into new JavaRDD[Pair] based on MapFunction, - * and HFiles will be generated in stagingDir for bulk load - * - * @param javaRdd The javaRDD we are bulk loading from - * @param tableName The HBase table we are loading into - * @param mapFunc A Function that will convert a value in JavaRDD - * to Pair(KeyFamilyQualifier, Array[Byte]) - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - */ - def bulkLoad[T](javaRdd: JavaRDD[T], - tableName: TableName, - mapFunc : Function[T, Pair[KeyFamilyQualifier, Array[Byte]]], - stagingDir: String, - familyHFileWriteOptionsMap: Map[Array[Byte], FamilyHFileWriteOptions], - compactionExclude: Boolean, - maxSize: Long): - Unit = { - hbaseContext.bulkLoad[Pair[KeyFamilyQualifier, Array[Byte]]](javaRdd.map(mapFunc).rdd, tableName, t => { - val keyFamilyQualifier = t.getFirst - val value = t.getSecond - Seq((keyFamilyQualifier, value)).iterator - }, stagingDir, familyHFileWriteOptionsMap, compactionExclude, maxSize) - } - - /** - * A simple abstraction over the HBaseContext.bulkLoadThinRows method. 
- * It allow addition support for a user to take a JavaRDD and - * convert into new JavaRDD[Pair] based on MapFunction, - * and HFiles will be generated in stagingDir for bulk load - * - * @param javaRdd The javaRDD we are bulk loading from - * @param tableName The HBase table we are loading into - * @param mapFunc A Function that will convert a value in JavaRDD - * to Pair(ByteArrayWrapper, FamiliesQualifiersValues) - * @param stagingDir The location on the FileSystem to bulk load into - * @param familyHFileWriteOptionsMap Options that will define how the HFile for a - * column family is written - * @param compactionExclude Compaction excluded for the HFiles - * @param maxSize Max size for the HFiles before they roll - */ - def bulkLoadThinRows[T](javaRdd: JavaRDD[T], - tableName: TableName, - mapFunc : Function[T, Pair[ByteArrayWrapper, FamiliesQualifiersValues]], - stagingDir: String, - familyHFileWriteOptionsMap: Map[Array[Byte], FamilyHFileWriteOptions], - compactionExclude: Boolean, - maxSize: Long): - Unit = { - hbaseContext.bulkLoadThinRows[Pair[ByteArrayWrapper, FamiliesQualifiersValues]](javaRdd.map(mapFunc).rdd, - tableName, t => { - (t.getFirst, t.getSecond) - }, stagingDir, familyHFileWriteOptionsMap, compactionExclude, maxSize) - } - - /** - * This function will use the native HBase TableInputFormat with the - * given scan object to generate a new JavaRDD - * - * @param tableName The name of the table to scan - * @param scans The HBase scan object to use to read data from HBase - * @param f Function to convert a Result object from HBase into - * What the user wants in the final generated JavaRDD - * @return New JavaRDD with results from scan - */ - def hbaseRDD[U](tableName: TableName, - scans: Scan, - f: Function[(ImmutableBytesWritable, Result), U]): - JavaRDD[U] = { - JavaRDD.fromRDD( - hbaseContext.hbaseRDD[U](tableName, - scans, - (v: (ImmutableBytesWritable, Result)) => - f.call(v._1, v._2))(fakeClassTag[U]))(fakeClassTag[U]) - } - - /** - * A overloaded version of HBaseContext hbaseRDD that define the - * type of the resulting JavaRDD - * - * @param tableName The name of the table to scan - * @param scans The HBase scan object to use to read data from HBase - * @return New JavaRDD with results from scan - */ - def hbaseRDD(tableName: TableName, - scans: Scan): - JavaRDD[(ImmutableBytesWritable, Result)] = { - JavaRDD.fromRDD(hbaseContext.hbaseRDD(tableName, scans)) - } - - /** - * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. - * - * This method is used to keep ClassTags out of the external Java API, as the Java compiler - * cannot produce them automatically. While this ClassTag-faking does please the compiler, - * it can cause problems at runtime if the Scala API relies on ClassTags for correctness. - * - * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, - * just worse performance or security issues. - * For instance, an Array[AnyRef] can hold any type T, - * but may lose primitive - * specialization. 
- */ - private[spark] - def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] - -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/KeyFamilyQualifier.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/KeyFamilyQualifier.scala deleted file mode 100644 index 7fd5a62924a..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/KeyFamilyQualifier.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.io.Serializable - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes - -/** - * This is the key to be used for sorting and shuffling. - * - * We will only partition on the rowKey but we will sort on all three - * - * @param rowKey Record RowKey - * @param family Record ColumnFamily - * @param qualifier Cell Qualifier - */ -@InterfaceAudience.Public -class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) - extends Comparable[KeyFamilyQualifier] with Serializable { - override def compareTo(o: KeyFamilyQualifier): Int = { - var result = Bytes.compareTo(rowKey, o.rowKey) - if (result == 0) { - result = Bytes.compareTo(family, o.family) - if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) - } - result - } - override def toString: String = { - Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/NewHBaseRDD.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/NewHBaseRDD.scala deleted file mode 100644 index 6d0a2d20f89..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/NewHBaseRDD.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.conf.Configuration -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.mapreduce.InputFormat -import org.apache.spark.rdd.NewHadoopRDD -import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} - -@InterfaceAudience.Public -class NewHBaseRDD[K,V](@transient sc : SparkContext, - @transient inputFormatClass: Class[_ <: InputFormat[K, V]], - @transient keyClass: Class[K], - @transient valueClass: Class[V], - @transient conf: Configuration, - val hBaseContext: HBaseContext) extends NewHadoopRDD(sc,inputFormatClass, keyClass, valueClass, conf) { - - override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = { - hBaseContext.applyCreds() - super.compute(theSplit, context) - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/Bound.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/Bound.scala deleted file mode 100644 index 4602ac8b35e..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/Bound.scala +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.spark.hbase._ - -/** - * The Bound represent the boudary for the scan - * - * @param b The byte array of the bound - * @param inc inclusive or not. - */ -@InterfaceAudience.Private -case class Bound(b: Array[Byte], inc: Boolean) -// The non-overlapping ranges we need to scan, if lower is equal to upper, it is a get request - -@InterfaceAudience.Private -case class Range(lower: Option[Bound], upper: Option[Bound]) - -@InterfaceAudience.Private -object Range { - def apply(region: HBaseRegion): Range = { - Range(region.start.map(Bound(_, true)), if (region.end.get.size == 0) { - None - } else { - region.end.map((Bound(_, false))) - }) - } -} - -@InterfaceAudience.Private -object Ranges { - // We assume that - // 1. r.lower.inc is true, and r.upper.inc is false - // 2. 
for each range in rs, its upper.inc is false - def and(r: Range, rs: Seq[Range]): Seq[Range] = { - rs.flatMap{ s => - val lower = s.lower.map { x => - // the scan has lower bound - r.lower.map { y => - // the region has lower bound - if (ord.compare(x.b, y.b) < 0) { - // scan lower bound is smaller than region server lower bound - Some(y) - } else { - // scan low bound is greater or equal to region server lower bound - Some(x) - } - }.getOrElse(Some(x)) - }.getOrElse(r.lower) - - val upper = s.upper.map { x => - // the scan has upper bound - r.upper.map { y => - // the region has upper bound - if (ord.compare(x.b, y.b) >= 0) { - // scan upper bound is larger than server upper bound - // but region server scan stop is exclusive. It is OK here. - Some(y) - } else { - // scan upper bound is less or equal to region server upper bound - Some(x) - } - }.getOrElse(Some(x)) - }.getOrElse(r.upper) - - val c = lower.map { case x => - upper.map { case y => - ord.compare(x.b, y.b) - }.getOrElse(-1) - }.getOrElse(-1) - if (c < 0) { - Some(Range(lower, upper)) - } else { - None - } - }.seq - } -} - -@InterfaceAudience.Private -object Points { - def and(r: Range, ps: Seq[Array[Byte]]): Seq[Array[Byte]] = { - ps.flatMap { p => - if (ord.compare(r.lower.get.b, p) <= 0) { - // if region lower bound is less or equal to the point - if (r.upper.isDefined) { - // if region upper bound is defined - if (ord.compare(r.upper.get.b, p) > 0) { - // if the upper bound is greater than the point (because upper bound is exclusive) - Some(p) - } else { - None - } - } else { - // if the region upper bound is not defined (infinity) - Some(p) - } - } else { - None - } - } - } -} - diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseResources.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseResources.scala deleted file mode 100644 index 0f467a78aa9..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseResources.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.TableName -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.spark.{HBaseConnectionKey, SmartConnection, - HBaseConnectionCache, HBaseRelation} -import scala.language.implicitConversions - -// Resource and ReferencedResources are defined for extensibility, -// e.g., consolidate scan and bulkGet in the future work. 
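As an illustrative sketch of how the connector-private Range/Ranges helpers defined earlier prune work per region (keys below are made up): intersecting a requested scan range with a region's boundaries keeps only the overlap, and empty intersections are dropped.

  import org.apache.hadoop.hbase.spark.datasources.{Bound, Range, Ranges}
  import org.apache.hadoop.hbase.util.Bytes

  // region [b, f) intersected with the requested range [a, d) leaves [b, d)
  val region = Range(Some(Bound(Bytes.toBytes("b"), true)), Some(Bound(Bytes.toBytes("f"), false)))
  val requested = Seq(Range(Some(Bound(Bytes.toBytes("a"), true)), Some(Bound(Bytes.toBytes("d"), false))))
  val pruned = Ranges.and(region, requested)
  // pruned == Seq(Range(Some(Bound("b", inc = true)), Some(Bound("d", inc = false))))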
- -// User has to invoke release explicitly to release the resource, -// and potentially parent resources -@InterfaceAudience.Private -trait Resource { - def release(): Unit -} - -@InterfaceAudience.Private -case class ScanResource(tbr: TableResource, rs: ResultScanner) extends Resource { - def release() { - rs.close() - tbr.release() - } -} - -@InterfaceAudience.Private -case class GetResource(tbr: TableResource, rs: Array[Result]) extends Resource { - def release() { - tbr.release() - } -} - -@InterfaceAudience.Private -trait ReferencedResource { - var count: Int = 0 - def init(): Unit - def destroy(): Unit - def acquire() = synchronized { - try { - count += 1 - if (count == 1) { - init() - } - } catch { - case e: Throwable => - release() - throw e - } - } - - def release() = synchronized { - count -= 1 - if (count == 0) { - destroy() - } - } - - def releaseOnException[T](func: => T): T = { - acquire() - val ret = { - try { - func - } catch { - case e: Throwable => - release() - throw e - } - } - ret - } -} - -@InterfaceAudience.Private -case class TableResource(relation: HBaseRelation) extends ReferencedResource { - var connection: SmartConnection = _ - var table: Table = _ - - override def init(): Unit = { - connection = HBaseConnectionCache.getConnection(relation.hbaseConf) - table = connection.getTable(TableName.valueOf(relation.tableName)) - } - - override def destroy(): Unit = { - if (table != null) { - table.close() - table = null - } - if (connection != null) { - connection.close() - connection = null - } - } - - def getScanner(scan: Scan): ScanResource = releaseOnException { - ScanResource(this, table.getScanner(scan)) - } - - def get(list: java.util.List[org.apache.hadoop.hbase.client.Get]) = releaseOnException { - GetResource(this, table.get(list)) - } -} - -@InterfaceAudience.Private -case class RegionResource(relation: HBaseRelation) extends ReferencedResource { - var connection: SmartConnection = _ - var rl: RegionLocator = _ - val regions = releaseOnException { - val keys = rl.getStartEndKeys - keys.getFirst.zip(keys.getSecond) - .zipWithIndex - .map(x => - HBaseRegion(x._2, - Some(x._1._1), - Some(x._1._2), - Some(rl.getRegionLocation(x._1._1).getHostname))) - } - - override def init(): Unit = { - connection = HBaseConnectionCache.getConnection(relation.hbaseConf) - rl = connection.getRegionLocator(TableName.valueOf(relation.tableName)) - } - - override def destroy(): Unit = { - if (rl != null) { - rl.close() - rl = null - } - if (connection != null) { - connection.close() - connection = null - } - } -} - -@InterfaceAudience.Private -object HBaseResources{ - implicit def ScanResToScan(sr: ScanResource): ResultScanner = { - sr.rs - } - - implicit def GetResToResult(gr: GetResource): Array[Result] = { - gr.rs - } - - implicit def TableResToTable(tr: TableResource): Table = { - tr.table - } - - implicit def RegionResToRegions(rr: RegionResource): Seq[HBaseRegion] = { - rr.regions - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseSparkConf.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseSparkConf.scala deleted file mode 100644 index dc497f949b2..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseSparkConf.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
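A rough sketch of how the reference-counted resources defined above are intended to be used (illustrative only; relation is assumed to be an already-built HBaseRelation, and these classes are internal to the connector):

  import org.apache.hadoop.hbase.client.Scan
  import org.apache.hadoop.hbase.spark.datasources.TableResource
  import scala.collection.JavaConverters._

  val tableResource = TableResource(relation)              // nothing is opened yet
  val scanResource = tableResource.getScanner(new Scan())  // acquire(): connection + table + scanner
  try {
    scanResource.rs.asScala.take(10).foreach(println)
  } finally {
    scanResource.release()  // closes the scanner and, once the count reaches zero, the table and connection
  }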
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import org.apache.yetus.audience.InterfaceAudience; - -/** - * This is the hbase configuration. User can either set them in SparkConf, which - * will take effect globally, or configure it per table, which will overwrite the value - * set in SparkConf. If not set, the default value will take effect. - */ -@InterfaceAudience.Public -object HBaseSparkConf{ - /** Set to false to disable server-side caching of blocks for this scan, - * false by default, since full table scans generate too much BC churn. - */ - val QUERY_CACHEBLOCKS = "hbase.spark.query.cacheblocks" - val DEFAULT_QUERY_CACHEBLOCKS = false - /** The number of rows for caching that will be passed to scan. */ - val QUERY_CACHEDROWS = "hbase.spark.query.cachedrows" - /** Set the maximum number of values to return for each call to next() in scan. */ - val QUERY_BATCHSIZE = "hbase.spark.query.batchsize" - /** The number of BulkGets send to HBase. */ - val BULKGET_SIZE = "hbase.spark.bulkget.size" - val DEFAULT_BULKGET_SIZE = 1000 - /** Set to specify the location of hbase configuration file. */ - val HBASE_CONFIG_LOCATION = "hbase.spark.config.location" - /** Set to specify whether create or use latest cached HBaseContext*/ - val USE_HBASECONTEXT = "hbase.spark.use.hbasecontext" - val DEFAULT_USE_HBASECONTEXT = true - /** Pushdown the filter to data source engine to increase the performance of queries. */ - val PUSHDOWN_COLUMN_FILTER = "hbase.spark.pushdown.columnfilter" - val DEFAULT_PUSHDOWN_COLUMN_FILTER= true - /** Class name of the encoder, which encode data types from Spark to HBase bytes. */ - val QUERY_ENCODER = "hbase.spark.query.encoder" - val DEFAULT_QUERY_ENCODER = classOf[NaiveEncoder].getCanonicalName - /** The timestamp used to filter columns with a specific timestamp. */ - val TIMESTAMP = "hbase.spark.query.timestamp" - /** The starting timestamp used to filter columns with a specific range of versions. */ - val TIMERANGE_START = "hbase.spark.query.timerange.start" - /** The ending timestamp used to filter columns with a specific range of versions. */ - val TIMERANGE_END = "hbase.spark.query.timerange.end" - /** The maximum number of version to return. */ - val MAX_VERSIONS = "hbase.spark.query.maxVersions" - /** Delayed time to close hbase-spark connection when no reference to this connection, in milliseconds. */ - val DEFAULT_CONNECTION_CLOSE_DELAY = 10 * 60 * 1000 -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala deleted file mode 100644 index 1ca1b454f14..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
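For instance (a sketch under the assumption that per-table values are supplied as DataFrame reader options; the catalog/column-mapping options that actually define the table are omitted here):

  import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf
  import org.apache.spark.SparkConf

  // globally: every hbase-spark read in the application inherits these values
  val sparkConf = new SparkConf()
    .set(HBaseSparkConf.QUERY_CACHEDROWS, "1000")
    .set(HBaseSparkConf.PUSHDOWN_COLUMN_FILTER, "true")

  // per table: the same keys passed as reader options overwrite the global values
  val df = sqlContext.read
    .format("org.apache.hadoop.hbase.spark")
    .option(HBaseSparkConf.QUERY_CACHEBLOCKS, "false")
    .option(HBaseSparkConf.BULKGET_SIZE, "500")
    .load()  // table-defining options omitted in this sketch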
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import java.util.ArrayList - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.spark._ -import org.apache.hadoop.hbase.spark.hbase._ -import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ -import org.apache.hadoop.hbase.util.ShutdownHookManager -import org.apache.spark.sql.datasources.hbase.Field -import org.apache.spark.{SparkEnv, TaskContext, Logging, Partition} -import org.apache.spark.rdd.RDD - -import scala.collection.mutable - -@InterfaceAudience.Private -class HBaseTableScanRDD(relation: HBaseRelation, - val hbaseContext: HBaseContext, - @transient val filter: Option[SparkSQLPushDownFilter] = None, - val columns: Seq[Field] = Seq.empty - )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { - private def sparkConf = SparkEnv.get.conf - @transient var ranges = Seq.empty[Range] - @transient var points = Seq.empty[Array[Byte]] - def addPoint(p: Array[Byte]) { - points :+= p - } - - def addRange(r: ScanRange) = { - val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { - Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) - } else { - None - } - val upper = if (r.upperBound != null && r.upperBound.length > 0) { - if (!r.isUpperBoundEqualTo) { - Some(Bound(r.upperBound, false)) - } else { - - // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo - // by default. 
So we need to add a new max byte to the stopRow key - val newArray = new Array[Byte](r.upperBound.length + 1) - System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) - - //New Max Bytes - newArray(r.upperBound.length) = ByteMin - Some(Bound(newArray, false)) - } - } else { - None - } - ranges :+= Range(lower, upper) - } - - override def getPartitions: Array[Partition] = { - val regions = RegionResource(relation) - var idx = 0 - logDebug(s"There are ${regions.size} regions") - val ps = regions.flatMap { x => - val rs = Ranges.and(Range(x), ranges) - val ps = Points.and(Range(x), points) - if (rs.size > 0 || ps.size > 0) { - if(log.isDebugEnabled) { - rs.foreach(x => logDebug(x.toString)) - } - idx += 1 - Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) - } else { - None - } - }.toArray - regions.release() - ShutdownHookManager.affixShutdownHook( new Thread() { - override def run() { - HBaseConnectionCache.close() - } - }, 0) - ps.asInstanceOf[Array[Partition]] - } - - override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[HBaseScanPartition].regions.server.map { - identity - }.toSeq - } - - private def buildGets( - tbr: TableResource, - g: Seq[Array[Byte]], - filter: Option[SparkSQLPushDownFilter], - columns: Seq[Field], - hbaseContext: HBaseContext): Iterator[Result] = { - g.grouped(relation.bulkGetSize).flatMap{ x => - val gets = new ArrayList[Get](x.size) - x.foreach{ y => - val g = new Get(y) - handleTimeSemantics(g) - columns.foreach { d => - if (!d.isRowKey) { - g.addColumn(d.cfBytes, d.colBytes) - } - } - filter.foreach(g.setFilter(_)) - gets.add(g) - } - hbaseContext.applyCreds() - val tmp = tbr.get(gets) - rddResources.addResource(tmp) - toResultIterator(tmp) - } - } - - private def toResultIterator(result: GetResource): Iterator[Result] = { - val iterator = new Iterator[Result] { - var idx = 0 - var cur: Option[Result] = None - override def hasNext: Boolean = { - while(idx < result.length && cur.isEmpty) { - val r = result(idx) - idx += 1 - if (!r.isEmpty) { - cur = Some(r) - } - } - if (cur.isEmpty) { - rddResources.release(result) - } - cur.isDefined - } - override def next(): Result = { - hasNext - val ret = cur.get - cur = None - ret - } - } - iterator - } - - private def buildScan(range: Range, - filter: Option[SparkSQLPushDownFilter], - columns: Seq[Field]): Scan = { - val scan = (range.lower, range.upper) match { - case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) - case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) - case (Some(Bound(a, b)), None) => new Scan(a) - case (None, None) => new Scan() - } - handleTimeSemantics(scan) - - columns.foreach { d => - if (!d.isRowKey) { - scan.addColumn(d.cfBytes, d.colBytes) - } - } - scan.setCacheBlocks(relation.blockCacheEnable) - scan.setBatch(relation.batchNum) - scan.setCaching(relation.cacheSize) - filter.foreach(scan.setFilter(_)) - scan - } - private def toResultIterator(scanner: ScanResource): Iterator[Result] = { - val iterator = new Iterator[Result] { - var cur: Option[Result] = None - override def hasNext: Boolean = { - if (cur.isEmpty) { - val r = scanner.next() - if (r == null) { - rddResources.release(scanner) - } else { - cur = Some(r) - } - } - cur.isDefined - } - override def next(): Result = { - hasNext - val ret = cur.get - cur = None - ret - } - } - iterator - } - - lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) - - private def close() { - rddResources.release() - } - - 
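To make the stop-row handling above concrete (a sketch with made-up keys; ByteMin is assumed to be 0x00 here): because HBase's stopRow is exclusive, an inclusive upper bound is widened by one minimal byte so the bound itself is still returned.

  import org.apache.hadoop.hbase.util.Bytes

  // requested: upperBound = "row5", isUpperBoundEqualTo = true
  val upperBound = Bytes.toBytes("row5")
  val stopRow = new Array[Byte](upperBound.length + 1)
  System.arraycopy(upperBound, 0, stopRow, 0, upperBound.length)
  stopRow(upperBound.length) = 0  // assumed ByteMin; "row5" + 0x00 is the next possible key
  // a scan with this exclusive stopRow still returns "row5" but nothing after it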
override def compute(split: Partition, context: TaskContext): Iterator[Result] = { - val partition = split.asInstanceOf[HBaseScanPartition] - val filter = SerializedFilter.fromSerializedFilter(partition.sf) - val scans = partition.scanRanges - .map(buildScan(_, filter, columns)) - val tableResource = TableResource(relation) - context.addTaskCompletionListener(context => close()) - val points = partition.points - val gIt: Iterator[Result] = { - if (points.isEmpty) { - Iterator.empty: Iterator[Result] - } else { - buildGets(tableResource, points, filter, columns, hbaseContext) - } - } - val rIts = scans.par - .map { scan => - hbaseContext.applyCreds() - val scanner = tableResource.getScanner(scan) - rddResources.addResource(scanner) - scanner - }.map(toResultIterator(_)) - .fold(Iterator.empty: Iterator[Result]){ case (x, y) => - x ++ y - } ++ gIt - ShutdownHookManager.affixShutdownHook( new Thread() { - override def run() { - HBaseConnectionCache.close() - } - }, 0) - rIts - } - - private def handleTimeSemantics(query: Query): Unit = { - // Set timestamp related values if present - (query, relation.timestamp, relation.minTimestamp, relation.maxTimestamp) match { - case (q: Scan, Some(ts), None, None) => q.setTimeStamp(ts) - case (q: Get, Some(ts), None, None) => q.setTimeStamp(ts) - - case (q:Scan, None, Some(minStamp), Some(maxStamp)) => q.setTimeRange(minStamp, maxStamp) - case (q:Get, None, Some(minStamp), Some(maxStamp)) => q.setTimeRange(minStamp, maxStamp) - - case (q, None, None, None) => - - case _ => throw new IllegalArgumentException(s"Invalid combination of query/timestamp/time range provided. " + - s"timeStamp is: ${relation.timestamp.get}, minTimeStamp is: ${relation.minTimestamp.get}, " + - s"maxTimeStamp is: ${relation.maxTimestamp.get}") - } - if (relation.maxVersions.isDefined) { - query match { - case q: Scan => q.setMaxVersions(relation.maxVersions.get) - case q: Get => q.setMaxVersions(relation.maxVersions.get) - case _ => throw new IllegalArgumentException("Invalid query provided with maxVersions") - } - } - } -} - -case class SerializedFilter(b: Option[Array[Byte]]) - -object SerializedFilter { - def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { - SerializedFilter(f.map(_.toByteArray)) - } - - def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { - sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) - } -} - -private[hbase] case class HBaseRegion( - override val index: Int, - val start: Option[HBaseType] = None, - val end: Option[HBaseType] = None, - val server: Option[String] = None) extends Partition - - -private[hbase] case class HBaseScanPartition( - override val index: Int, - val regions: HBaseRegion, - val scanRanges: Seq[Range], - val points: Seq[Array[Byte]], - val sf: SerializedFilter) extends Partition - -case class RDDResources(set: mutable.HashSet[Resource]) { - def addResource(s: Resource) { - set += s - } - def release() { - set.foreach(release(_)) - } - def release(rs: Resource) { - try { - rs.release() - } finally { - set.remove(rs) - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/JavaBytesEncoder.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/JavaBytesEncoder.scala deleted file mode 100644 index 6a5018947f7..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/JavaBytesEncoder.scala +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - 
* contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import org.apache.hadoop.hbase.HBaseInterfaceAudience; -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.yetus.audience.InterfaceStability; -import org.apache.hadoop.hbase.spark.datasources.JavaBytesEncoder.JavaBytesEncoder -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Logging -import org.apache.spark.sql.types._ - -/** - * The ranges for the data type whose size is known. Whether the bound is inclusive - * or exclusive is undefind, and upper to the caller to decide. - * - * @param low: the lower bound of the range. - * @param upper: the upper bound of the range. - */ -@InterfaceAudience.LimitedPrivate(Array(HBaseInterfaceAudience.SPARK)) -@InterfaceStability.Evolving -case class BoundRange(low: Array[Byte],upper: Array[Byte]) - -/** - * The class identifies the ranges for a java primitive type. The caller needs - * to decide the bound is either inclusive or exclusive on its own. - * information - * - * @param less: the set of ranges for LessThan/LessOrEqualThan - * @param greater: the set of ranges for GreaterThan/GreaterThanOrEqualTo - * @param value: the byte array of the original value - */ -@InterfaceAudience.LimitedPrivate(Array(HBaseInterfaceAudience.SPARK)) -@InterfaceStability.Evolving -case class BoundRanges(less: Array[BoundRange], greater: Array[BoundRange], value: Array[Byte]) - -/** - * The trait to support plugin architecture for different encoder/decoder. - * encode is used for serializing the data type to byte array and the filter is - * used to filter out the unnecessary records. - */ -@InterfaceAudience.LimitedPrivate(Array(HBaseInterfaceAudience.SPARK)) -@InterfaceStability.Evolving -trait BytesEncoder { - def encode(dt: DataType, value: Any): Array[Byte] - - /** - * The function performing real filtering operations. The format of filterBytes depends on the - * implementation of the BytesEncoder. - * - * @param input: the current input byte array that needs to be filtered out - * @param offset1: the starting offset of the input byte array. - * @param length1: the length of the input byte array. - * @param filterBytes: the byte array provided by query condition. - * @param offset2: the starting offset in the filterBytes. - * @param length2: the length of the bytes in the filterBytes - * @param ops: The operation of the filter operator. - * @return true: the record satisfies the predicates - * false: the record does not satisfy the predicates. - */ - def filter(input: Array[Byte], offset1: Int, length1: Int, - filterBytes: Array[Byte], offset2: Int, length2: Int, - ops: JavaBytesEncoder): Boolean - - /** - * Currently, it is used for partition pruning. 
- * As for some codec, the order may be inconsistent between java primitive - * type and its byte array. We may have to split the predicates on some - * of the java primitive type into multiple predicates. - * - * For example in naive codec, some of the java primitive types have to be - * split into multiple predicates, and union these predicates together to - * make the predicates be performed correctly. - * For example, if we have "COLUMN < 2", we will transform it into - * "0 <= COLUMN < 2 OR Integer.MIN_VALUE <= COLUMN <= -1" - */ - def ranges(in: Any): Option[BoundRanges] -} - -@InterfaceAudience.LimitedPrivate(Array(HBaseInterfaceAudience.SPARK)) -@InterfaceStability.Evolving -object JavaBytesEncoder extends Enumeration with Logging{ - type JavaBytesEncoder = Value - val Greater, GreaterEqual, Less, LessEqual, Equal, Unknown = Value - - /** - * create the encoder/decoder - * - * @param clsName: the class name of the encoder/decoder class - * @return the instance of the encoder plugin. - */ - def create(clsName: String): BytesEncoder = { - try { - Class.forName(clsName).newInstance.asInstanceOf[BytesEncoder] - } catch { - case _: Throwable => - logWarning(s"$clsName cannot be initiated, falling back to naive encoder") - new NaiveEncoder() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/NaiveEncoder.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/NaiveEncoder.scala deleted file mode 100644 index 61382426f8b..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/NaiveEncoder.scala +++ /dev/null @@ -1,261 +0,0 @@ -package org.apache.hadoop.hbase.spark.datasources -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.spark.datasources.JavaBytesEncoder.JavaBytesEncoder -import org.apache.hadoop.hbase.spark.hbase._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Logging -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - - -/** - * This is the naive non-order preserving encoder/decoder. - * Due to the inconsistency of the order between java primitive types - * and their bytearray. The data type has to be passed in so that the filter - * can work correctly, which is done by wrapping the type into the first byte - * of the serialized array. 
- */ -@InterfaceAudience.Private -class NaiveEncoder extends BytesEncoder with Logging{ - var code = 0 - def nextCode: Byte = { - code += 1 - (code - 1).asInstanceOf[Byte] - } - val BooleanEnc = nextCode - val ShortEnc = nextCode - val IntEnc = nextCode - val LongEnc = nextCode - val FloatEnc = nextCode - val DoubleEnc = nextCode - val StringEnc = nextCode - val BinaryEnc = nextCode - val TimestampEnc = nextCode - val UnknownEnc = nextCode - - - /** - * Evaluate the java primitive type and return the BoundRanges. For one value, it may have - * multiple output ranges because of the inconsistency of order between java primitive type - * and its byte array order. - * - * For short, integer, and long, the order of number is consistent with byte array order - * if two number has the same sign bit. But the negative number is larger than positive - * number in byte array. - * - * For double and float, the order of positive number is consistent with its byte array order. - * But the order of negative number is the reverse order of byte array. Please refer to IEEE-754 - * and https://en.wikipedia.org/wiki/Single-precision_floating-point_format - */ - def ranges(in: Any): Option[BoundRanges] = in match { - case a: Integer => - val b = Bytes.toBytes(a) - if (a >= 0) { - logDebug(s"range is 0 to $a and ${Integer.MIN_VALUE} to -1") - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(0: Int), b), - BoundRange(Bytes.toBytes(Integer.MIN_VALUE), Bytes.toBytes(-1: Int))), - Array(BoundRange(b, Bytes.toBytes(Integer.MAX_VALUE))), b)) - } else { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(Integer.MIN_VALUE), b)), - Array(BoundRange(b, Bytes.toBytes(-1: Integer)), - BoundRange(Bytes.toBytes(0: Int), Bytes.toBytes(Integer.MAX_VALUE))), b)) - } - case a: Long => - val b = Bytes.toBytes(a) - if (a >= 0) { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(0: Long), b), - BoundRange(Bytes.toBytes(Long.MinValue), Bytes.toBytes(-1: Long))), - Array(BoundRange(b, Bytes.toBytes(Long.MaxValue))), b)) - } else { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(Long.MinValue), b)), - Array(BoundRange(b, Bytes.toBytes(-1: Long)), - BoundRange(Bytes.toBytes(0: Long), Bytes.toBytes(Long.MaxValue))), b)) - } - case a: Short => - val b = Bytes.toBytes(a) - if (a >= 0) { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(0: Short), b), - BoundRange(Bytes.toBytes(Short.MinValue), Bytes.toBytes(-1: Short))), - Array(BoundRange(b, Bytes.toBytes(Short.MaxValue))), b)) - } else { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(Short.MinValue), b)), - Array(BoundRange(b, Bytes.toBytes(-1: Short)), - BoundRange(Bytes.toBytes(0: Short), Bytes.toBytes(Short.MaxValue))), b)) - } - case a: Double => - val b = Bytes.toBytes(a) - if (a >= 0.0f) { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(0.0d), b), - BoundRange(Bytes.toBytes(-0.0d), Bytes.toBytes(Double.MinValue))), - Array(BoundRange(b, Bytes.toBytes(Double.MaxValue))), b)) - } else { - Some(BoundRanges( - Array(BoundRange(b, Bytes.toBytes(Double.MinValue))), - Array(BoundRange(Bytes.toBytes(-0.0d), b), - BoundRange(Bytes.toBytes(0.0d), Bytes.toBytes(Double.MaxValue))), b)) - } - case a: Float => - val b = Bytes.toBytes(a) - if (a >= 0.0f) { - Some(BoundRanges( - Array(BoundRange(Bytes.toBytes(0.0f), b), - BoundRange(Bytes.toBytes(-0.0f), Bytes.toBytes(Float.MinValue))), - Array(BoundRange(b, Bytes.toBytes(Float.MaxValue))), b)) - } else { - Some(BoundRanges( - Array(BoundRange(b, Bytes.toBytes(Float.MinValue))), - Array(BoundRange(Bytes.toBytes(-0.0f), 
b), - BoundRange(Bytes.toBytes(0.0f), Bytes.toBytes(Float.MaxValue))), b)) - } - case a: Array[Byte] => - Some(BoundRanges( - Array(BoundRange(bytesMin, a)), - Array(BoundRange(a, bytesMax)), a)) - case a: Byte => - val b = Array(a) - Some(BoundRanges( - Array(BoundRange(bytesMin, b)), - Array(BoundRange(b, bytesMax)), b)) - case a: String => - val b = Bytes.toBytes(a) - Some(BoundRanges( - Array(BoundRange(bytesMin, b)), - Array(BoundRange(b, bytesMax)), b)) - case a: UTF8String => - val b = a.getBytes - Some(BoundRanges( - Array(BoundRange(bytesMin, b)), - Array(BoundRange(b, bytesMax)), b)) - case _ => None - } - - def compare(c: Int, ops: JavaBytesEncoder): Boolean = { - ops match { - case JavaBytesEncoder.Greater => c > 0 - case JavaBytesEncoder.GreaterEqual => c >= 0 - case JavaBytesEncoder.Less => c < 0 - case JavaBytesEncoder.LessEqual => c <= 0 - } - } - - /** - * encode the data type into byte array. Note that it is a naive implementation with the - * data type byte appending to the head of the serialized byte array. - * - * @param dt: The data type of the input - * @param value: the value of the input - * @return the byte array with the first byte indicating the data type. - */ - override def encode(dt: DataType, - value: Any): Array[Byte] = { - dt match { - case BooleanType => - val result = new Array[Byte](Bytes.SIZEOF_BOOLEAN + 1) - result(0) = BooleanEnc - value.asInstanceOf[Boolean] match { - case true => result(1) = -1: Byte - case false => result(1) = 0: Byte - } - result - case ShortType => - val result = new Array[Byte](Bytes.SIZEOF_SHORT + 1) - result(0) = ShortEnc - Bytes.putShort(result, 1, value.asInstanceOf[Short]) - result - case IntegerType => - val result = new Array[Byte](Bytes.SIZEOF_INT + 1) - result(0) = IntEnc - Bytes.putInt(result, 1, value.asInstanceOf[Int]) - result - case LongType|TimestampType => - val result = new Array[Byte](Bytes.SIZEOF_LONG + 1) - result(0) = LongEnc - Bytes.putLong(result, 1, value.asInstanceOf[Long]) - result - case FloatType => - val result = new Array[Byte](Bytes.SIZEOF_FLOAT + 1) - result(0) = FloatEnc - Bytes.putFloat(result, 1, value.asInstanceOf[Float]) - result - case DoubleType => - val result = new Array[Byte](Bytes.SIZEOF_DOUBLE + 1) - result(0) = DoubleEnc - Bytes.putDouble(result, 1, value.asInstanceOf[Double]) - result - case BinaryType => - val v = value.asInstanceOf[Array[Bytes]] - val result = new Array[Byte](v.length + 1) - result(0) = BinaryEnc - System.arraycopy(v, 0, result, 1, v.length) - result - case StringType => - val bytes = Bytes.toBytes(value.asInstanceOf[String]) - val result = new Array[Byte](bytes.length + 1) - result(0) = StringEnc - System.arraycopy(bytes, 0, result, 1, bytes.length) - result - case _ => - val bytes = Bytes.toBytes(value.toString) - val result = new Array[Byte](bytes.length + 1) - result(0) = UnknownEnc - System.arraycopy(bytes, 0, result, 1, bytes.length) - result - } - } - - override def filter(input: Array[Byte], offset1: Int, length1: Int, - filterBytes: Array[Byte], offset2: Int, length2: Int, - ops: JavaBytesEncoder): Boolean = { - filterBytes(offset2) match { - case ShortEnc => - val in = Bytes.toShort(input, offset1) - val value = Bytes.toShort(filterBytes, offset2 + 1) - compare(in.compareTo(value), ops) - case IntEnc => - val in = Bytes.toInt(input, offset1) - val value = Bytes.toInt(filterBytes, offset2 + 1) - compare(in.compareTo(value), ops) - case LongEnc | TimestampEnc => - val in = Bytes.toInt(input, offset1) - val value = Bytes.toInt(filterBytes, offset2 + 1) - 
compare(in.compareTo(value), ops) - case FloatEnc => - val in = Bytes.toFloat(input, offset1) - val value = Bytes.toFloat(filterBytes, offset2 + 1) - compare(in.compareTo(value), ops) - case DoubleEnc => - val in = Bytes.toDouble(input, offset1) - val value = Bytes.toDouble(filterBytes, offset2 + 1) - compare(in.compareTo(value), ops) - case _ => - // for String, Byte, Binary, Boolean and other types - // we can use the order of byte array directly. - compare( - Bytes.compareTo(input, offset1, length1, filterBytes, offset2 + 1, length2 - 1), ops) - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SchemaConverters.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SchemaConverters.scala deleted file mode 100644 index 9eeabc54292..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SchemaConverters.scala +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.io.ByteArrayInputStream -import java.nio.ByteBuffer -import java.sql.Timestamp -import java.util -import java.util.HashMap - -import org.apache.avro.SchemaBuilder.BaseFieldTypeBuilder -import org.apache.avro.SchemaBuilder.BaseTypeBuilder -import org.apache.avro.SchemaBuilder.FieldAssembler -import org.apache.avro.SchemaBuilder.FieldDefault -import org.apache.avro.SchemaBuilder.RecordBuilder -import org.apache.avro.io._ -import org.apache.commons.io.output.ByteArrayOutputStream -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.util.Bytes - -import scala.collection.JavaConversions._ - -import org.apache.avro.{SchemaBuilder, Schema} -import org.apache.avro.Schema.Type._ -import org.apache.avro.generic.GenericData.{Record, Fixed} -import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericData, GenericRecord} -import org.apache.spark.sql.Row -import org.apache.spark.sql.types._ - -import scala.collection.immutable.Map - -@InterfaceAudience.Private -abstract class AvroException(msg: String) extends Exception(msg) - -@InterfaceAudience.Private -case class SchemaConversionException(msg: String) extends AvroException(msg) - -/*** - * On top level, the converters provide three high level interface. - * 1. toSqlType: This function takes an avro schema and returns a sql schema. - * 2. createConverterToSQL: Returns a function that is used to convert avro types to their - * corresponding sparkSQL representations. - * 3. convertTypeToAvro: This function constructs converter function for a given sparkSQL - * datatype. 
This is used in writing Avro records out to disk - */ -@InterfaceAudience.Private -object SchemaConverters { - - case class SchemaType(dataType: DataType, nullable: Boolean) - - /** - * This function takes an avro schema and returns a sql schema. - */ - def toSqlType(avroSchema: Schema): SchemaType = { - avroSchema.getType match { - case INT => SchemaType(IntegerType, nullable = false) - case STRING => SchemaType(StringType, nullable = false) - case BOOLEAN => SchemaType(BooleanType, nullable = false) - case BYTES => SchemaType(BinaryType, nullable = false) - case DOUBLE => SchemaType(DoubleType, nullable = false) - case FLOAT => SchemaType(FloatType, nullable = false) - case LONG => SchemaType(LongType, nullable = false) - case FIXED => SchemaType(BinaryType, nullable = false) - case ENUM => SchemaType(StringType, nullable = false) - - case RECORD => - val fields = avroSchema.getFields.map { f => - val schemaType = toSqlType(f.schema()) - StructField(f.name, schemaType.dataType, schemaType.nullable) - } - - SchemaType(StructType(fields), nullable = false) - - case ARRAY => - val schemaType = toSqlType(avroSchema.getElementType) - SchemaType( - ArrayType(schemaType.dataType, containsNull = schemaType.nullable), - nullable = false) - - case MAP => - val schemaType = toSqlType(avroSchema.getValueType) - SchemaType( - MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable), - nullable = false) - - case UNION => - if (avroSchema.getTypes.exists(_.getType == NULL)) { - // In case of a union with null, eliminate it and make a recursive call - val remainingUnionTypes = avroSchema.getTypes.filterNot(_.getType == NULL) - if (remainingUnionTypes.size == 1) { - toSqlType(remainingUnionTypes.get(0)).copy(nullable = true) - } else { - toSqlType(Schema.createUnion(remainingUnionTypes)).copy(nullable = true) - } - } else avroSchema.getTypes.map(_.getType) match { - case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => - SchemaType(LongType, nullable = false) - case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => - SchemaType(DoubleType, nullable = false) - case other => throw new SchemaConversionException( - s"This mix of union types is not supported: $other") - } - - case other => throw new SchemaConversionException(s"Unsupported type $other") - } - } - - /** - * This function converts sparkSQL StructType into avro schema. This method uses two other - * converter methods in order to do the conversion. - */ - private def convertStructToAvro[T]( - structType: StructType, - schemaBuilder: RecordBuilder[T], - recordNamespace: String): T = { - val fieldsAssembler: FieldAssembler[T] = schemaBuilder.fields() - structType.fields.foreach { field => - val newField = fieldsAssembler.name(field.name).`type`() - - if (field.nullable) { - convertFieldTypeToAvro(field.dataType, newField.nullable(), field.name, recordNamespace) - .noDefault - } else { - convertFieldTypeToAvro(field.dataType, newField, field.name, recordNamespace) - .noDefault - } - } - fieldsAssembler.endRecord() - } - - /** - * Returns a function that is used to convert avro types to their - * corresponding sparkSQL representations. - */ - def createConverterToSQL(schema: Schema): Any => Any = { - schema.getType match { - // Avro strings are in Utf8, so we have to call toString on them - case STRING | ENUM => (item: Any) => if (item == null) null else item.toString - case INT | BOOLEAN | DOUBLE | FLOAT | LONG => identity - // Byte arrays are reused by avro, so we have to make a copy of them. 
- case FIXED => (item: Any) => if (item == null) { - null - } else { - item.asInstanceOf[Fixed].bytes().clone() - } - case BYTES => (item: Any) => if (item == null) { - null - } else { - val bytes = item.asInstanceOf[ByteBuffer] - val javaBytes = new Array[Byte](bytes.remaining) - bytes.get(javaBytes) - javaBytes - } - case RECORD => - val fieldConverters = schema.getFields.map(f => createConverterToSQL(f.schema)) - (item: Any) => if (item == null) { - null - } else { - val record = item.asInstanceOf[GenericRecord] - val converted = new Array[Any](fieldConverters.size) - var idx = 0 - while (idx < fieldConverters.size) { - converted(idx) = fieldConverters.apply(idx)(record.get(idx)) - idx += 1 - } - Row.fromSeq(converted.toSeq) - } - case ARRAY => - val elementConverter = createConverterToSQL(schema.getElementType) - (item: Any) => if (item == null) { - null - } else { - try { - item.asInstanceOf[GenericData.Array[Any]].map(elementConverter) - } catch { - case e: Throwable => - item.asInstanceOf[util.ArrayList[Any]].map(elementConverter) - } - } - case MAP => - val valueConverter = createConverterToSQL(schema.getValueType) - (item: Any) => if (item == null) { - null - } else { - item.asInstanceOf[HashMap[Any, Any]].map(x => (x._1.toString, valueConverter(x._2))).toMap - } - case UNION => - if (schema.getTypes.exists(_.getType == NULL)) { - val remainingUnionTypes = schema.getTypes.filterNot(_.getType == NULL) - if (remainingUnionTypes.size == 1) { - createConverterToSQL(remainingUnionTypes.get(0)) - } else { - createConverterToSQL(Schema.createUnion(remainingUnionTypes)) - } - } else schema.getTypes.map(_.getType) match { - case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => - (item: Any) => { - item match { - case l: Long => l - case i: Int => i.toLong - case null => null - } - } - case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => - (item: Any) => { - item match { - case d: Double => d - case f: Float => f.toDouble - case null => null - } - } - case other => throw new SchemaConversionException( - s"This mix of union types is not supported (see README): $other") - } - case other => throw new SchemaConversionException(s"invalid avro type: $other") - } - } - - /** - * This function is used to convert some sparkSQL type to avro type. Note that this function won't - * be used to construct fields of avro record (convertFieldTypeToAvro is used for that). 
- */ - private def convertTypeToAvro[T]( - dataType: DataType, - schemaBuilder: BaseTypeBuilder[T], - structName: String, - recordNamespace: String): T = { - dataType match { - case ByteType => schemaBuilder.intType() - case ShortType => schemaBuilder.intType() - case IntegerType => schemaBuilder.intType() - case LongType => schemaBuilder.longType() - case FloatType => schemaBuilder.floatType() - case DoubleType => schemaBuilder.doubleType() - case _: DecimalType => schemaBuilder.stringType() - case StringType => schemaBuilder.stringType() - case BinaryType => schemaBuilder.bytesType() - case BooleanType => schemaBuilder.booleanType() - case TimestampType => schemaBuilder.longType() - - case ArrayType(elementType, _) => - val builder = getSchemaBuilder(dataType.asInstanceOf[ArrayType].containsNull) - val elementSchema = convertTypeToAvro(elementType, builder, structName, recordNamespace) - schemaBuilder.array().items(elementSchema) - - case MapType(StringType, valueType, _) => - val builder = getSchemaBuilder(dataType.asInstanceOf[MapType].valueContainsNull) - val valueSchema = convertTypeToAvro(valueType, builder, structName, recordNamespace) - schemaBuilder.map().values(valueSchema) - - case structType: StructType => - convertStructToAvro( - structType, - schemaBuilder.record(structName).namespace(recordNamespace), - recordNamespace) - - case other => throw new IllegalArgumentException(s"Unexpected type $dataType.") - } - } - - /** - * This function is used to construct fields of the avro record, where schema of the field is - * specified by avro representation of dataType. Since builders for record fields are different - * from those for everything else, we have to use a separate method. - */ - private def convertFieldTypeToAvro[T]( - dataType: DataType, - newFieldBuilder: BaseFieldTypeBuilder[T], - structName: String, - recordNamespace: String): FieldDefault[T, _] = { - dataType match { - case ByteType => newFieldBuilder.intType() - case ShortType => newFieldBuilder.intType() - case IntegerType => newFieldBuilder.intType() - case LongType => newFieldBuilder.longType() - case FloatType => newFieldBuilder.floatType() - case DoubleType => newFieldBuilder.doubleType() - case _: DecimalType => newFieldBuilder.stringType() - case StringType => newFieldBuilder.stringType() - case BinaryType => newFieldBuilder.bytesType() - case BooleanType => newFieldBuilder.booleanType() - case TimestampType => newFieldBuilder.longType() - - case ArrayType(elementType, _) => - val builder = getSchemaBuilder(dataType.asInstanceOf[ArrayType].containsNull) - val elementSchema = convertTypeToAvro(elementType, builder, structName, recordNamespace) - newFieldBuilder.array().items(elementSchema) - - case MapType(StringType, valueType, _) => - val builder = getSchemaBuilder(dataType.asInstanceOf[MapType].valueContainsNull) - val valueSchema = convertTypeToAvro(valueType, builder, structName, recordNamespace) - newFieldBuilder.map().values(valueSchema) - - case structType: StructType => - convertStructToAvro( - structType, - newFieldBuilder.record(structName).namespace(recordNamespace), - recordNamespace) - - case other => throw new IllegalArgumentException(s"Unexpected type $dataType.") - } - } - - private def getSchemaBuilder(isNullable: Boolean): BaseTypeBuilder[Schema] = { - if (isNullable) { - SchemaBuilder.builder().nullable() - } else { - SchemaBuilder.builder() - } - } - /** - * This function constructs converter function for a given sparkSQL datatype. 
This is used in - * writing Avro records out to disk - */ - def createConverterToAvro( - dataType: DataType, - structName: String, - recordNamespace: String): (Any) => Any = { - dataType match { - case BinaryType => (item: Any) => item match { - case null => null - case bytes: Array[Byte] => ByteBuffer.wrap(bytes) - } - case ByteType | ShortType | IntegerType | LongType | - FloatType | DoubleType | StringType | BooleanType => identity - case _: DecimalType => (item: Any) => if (item == null) null else item.toString - case TimestampType => (item: Any) => - if (item == null) null else item.asInstanceOf[Timestamp].getTime - case ArrayType(elementType, _) => - val elementConverter = createConverterToAvro(elementType, structName, recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val sourceArray = item.asInstanceOf[Seq[Any]] - val sourceArraySize = sourceArray.size - val targetArray = new util.ArrayList[Any](sourceArraySize) - var idx = 0 - while (idx < sourceArraySize) { - targetArray.add(elementConverter(sourceArray(idx))) - idx += 1 - } - targetArray - } - } - case MapType(StringType, valueType, _) => - val valueConverter = createConverterToAvro(valueType, structName, recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val javaMap = new HashMap[String, Any]() - item.asInstanceOf[Map[String, Any]].foreach { case (key, value) => - javaMap.put(key, valueConverter(value)) - } - javaMap - } - } - case structType: StructType => - val builder = SchemaBuilder.record(structName).namespace(recordNamespace) - val schema: Schema = SchemaConverters.convertStructToAvro( - structType, builder, recordNamespace) - val fieldConverters = structType.fields.map(field => - createConverterToAvro(field.dataType, field.name, recordNamespace)) - (item: Any) => { - if (item == null) { - null - } else { - val record = new Record(schema) - val convertersIterator = fieldConverters.iterator - val fieldNamesIterator = dataType.asInstanceOf[StructType].fieldNames.iterator - val rowIterator = item.asInstanceOf[Row].toSeq.iterator - - while (convertersIterator.hasNext) { - val converter = convertersIterator.next() - record.put(fieldNamesIterator.next(), converter(rowIterator.next())) - } - record - } - } - } - } -} - -@InterfaceAudience.Private -object AvroSerdes { - // We only handle top level is record or primary type now - def serialize(input: Any, schema: Schema): Array[Byte]= { - schema.getType match { - case BOOLEAN => Bytes.toBytes(input.asInstanceOf[Boolean]) - case BYTES | FIXED=> input.asInstanceOf[Array[Byte]] - case DOUBLE => Bytes.toBytes(input.asInstanceOf[Double]) - case FLOAT => Bytes.toBytes(input.asInstanceOf[Float]) - case INT => Bytes.toBytes(input.asInstanceOf[Int]) - case LONG => Bytes.toBytes(input.asInstanceOf[Long]) - case STRING => Bytes.toBytes(input.asInstanceOf[String]) - case RECORD => - val gr = input.asInstanceOf[GenericRecord] - val writer2 = new GenericDatumWriter[GenericRecord](schema) - val bao2 = new ByteArrayOutputStream() - val encoder2: BinaryEncoder = EncoderFactory.get().directBinaryEncoder(bao2, null) - writer2.write(gr, encoder2) - bao2.toByteArray() - case _ => throw new Exception(s"unsupported data type ${schema.getType}") //TODO - } - } - - def deserialize(input: Array[Byte], schema: Schema): GenericRecord = { - val reader2: DatumReader[GenericRecord] = new GenericDatumReader[GenericRecord](schema) - val bai2 = new ByteArrayInputStream(input) - val decoder2: BinaryDecoder = DecoderFactory.get().directBinaryDecoder(bai2, null) - 
val gr2: GenericRecord = reader2.read(null, decoder2) - gr2 - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerDes.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerDes.scala deleted file mode 100644 index 98cc8719c90..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerDes.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.datasources - -import java.io.ByteArrayInputStream - -import org.apache.avro.Schema -import org.apache.avro.Schema.Type._ -import org.apache.avro.generic.GenericDatumReader -import org.apache.avro.generic.GenericDatumWriter -import org.apache.avro.generic.GenericRecord -import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord} -import org.apache.avro.io._ -import org.apache.commons.io.output.ByteArrayOutputStream -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.sql.types._ - -// TODO: This is not really used in code. -trait SerDes { - def serialize(value: Any): Array[Byte] - def deserialize(bytes: Array[Byte], start: Int, end: Int): Any -} - -// TODO: This is not really used in code. -class DoubleSerDes extends SerDes { - override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double]) - override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = { - Bytes.toDouble(bytes, start) - } -} - - diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerializableConfiguration.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerializableConfiguration.scala deleted file mode 100644 index 0e2b6f4c6d0..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerializableConfiguration.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.datasources - -import java.io.{IOException, ObjectInputStream, ObjectOutputStream} - -import org.apache.hadoop.conf.Configuration -import org.apache.yetus.audience.InterfaceAudience; - -import scala.util.control.NonFatal - -@InterfaceAudience.Private -class SerializableConfiguration(@transient var value: Configuration) extends Serializable { - private def writeObject(out: ObjectOutputStream): Unit = tryOrIOException { - out.defaultWriteObject() - value.write(out) - } - - private def readObject(in: ObjectInputStream): Unit = tryOrIOException { - value = new Configuration(false) - value.readFields(in) - } - - def tryOrIOException(block: => Unit) { - try { - block - } catch { - case e: IOException => throw e - case NonFatal(t) => throw new IOException(t) - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/package.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/package.scala deleted file mode 100644 index ce7b55a7a47..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/datasources/package.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.util.Bytes - -import scala.math.Ordering - -package object hbase { - type HBaseType = Array[Byte] - def bytesMin = new Array[Byte](0) - def bytesMax = null - val ByteMax = -1.asInstanceOf[Byte] - val ByteMin = 0.asInstanceOf[Byte] - val ord: Ordering[HBaseType] = new Ordering[HBaseType] { - def compare(x: Array[Byte], y: Array[Byte]): Int = { - return Bytes.compareTo(x, y) - } - } - //Do not use BinaryType.ordering - implicit val order: Ordering[HBaseType] = ord - -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/AvroSource.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/AvroSource.scala deleted file mode 100644 index c09e99d906d..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/AvroSource.scala +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.datasources - -import org.apache.avro.Schema -import org.apache.avro.generic.GenericData -import org.apache.hadoop.hbase.spark.AvroSerdes -import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.{SparkConf, SparkContext} - -/** - * @param col0 Column #0, Type is String - * @param col1 Column #1, Type is Array[Byte] - */ -case class AvroHBaseRecord(col0: String, - col1: Array[Byte]) - -object AvroHBaseRecord { - val schemaString = - s"""{"namespace": "example.avro", - | "type": "record", "name": "User", - | "fields": [ - | {"name": "name", "type": "string"}, - | {"name": "favorite_number", "type": ["int", "null"]}, - | {"name": "favorite_color", "type": ["string", "null"]}, - | {"name": "favorite_array", "type": {"type": "array", "items": "string"}}, - | {"name": "favorite_map", "type": {"type": "map", "values": "int"}} - | ] }""".stripMargin - - val avroSchema: Schema = { - val p = new Schema.Parser - p.parse(schemaString) - } - - def apply(i: Int): AvroHBaseRecord = { - - val user = new GenericData.Record(avroSchema); - user.put("name", s"name${"%03d".format(i)}") - user.put("favorite_number", i) - user.put("favorite_color", s"color${"%03d".format(i)}") - val favoriteArray = new GenericData.Array[String](2, avroSchema.getField("favorite_array").schema()) - favoriteArray.add(s"number${i}") - favoriteArray.add(s"number${i+1}") - user.put("favorite_array", favoriteArray) - import collection.JavaConverters._ - val favoriteMap = Map[String, Int](("key1" -> i), ("key2" -> (i+1))).asJava - user.put("favorite_map", favoriteMap) - val avroByte = AvroSerdes.serialize(user, avroSchema) - AvroHBaseRecord(s"name${"%03d".format(i)}", avroByte) - } -} - -object AvroSource { - def catalog = s"""{ - |"table":{"namespace":"default", "name":"ExampleAvrotable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, - |"col1":{"cf":"cf1", "col":"col1", "type":"binary"} - |} - |}""".stripMargin - - def avroCatalog = s"""{ - |"table":{"namespace":"default", "name":"ExampleAvrotable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, - |"col1":{"cf":"cf1", "col":"col1", "avro":"avroSchema"} - |} - |}""".stripMargin - - def avroCatalogInsert = s"""{ - |"table":{"namespace":"default", "name":"ExampleAvrotableInsert"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, - |"col1":{"cf":"cf1", "col":"col1", "avro":"avroSchema"} - |} - |}""".stripMargin - - def main(args: Array[String]) { - val sparkConf = new SparkConf().setAppName("AvroSourceExample") - val sc = new SparkContext(sparkConf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - - def withCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map("avroSchema" -> AvroHBaseRecord.schemaString, HBaseTableCatalog.tableCatalog -> avroCatalog)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - val data = (0 to 255).map { i => - AvroHBaseRecord(i) - 
} - - sc.parallelize(data).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - - val df = withCatalog(catalog) - df.show() - df.printSchema() - df.registerTempTable("ExampleAvrotable") - val c = sqlContext.sql("select count(1) from ExampleAvrotable") - c.show() - - val filtered = df.select($"col0", $"col1.favorite_array").where($"col0" === "name001") - filtered.show() - val collected = filtered.collect() - if (collected(0).getSeq[String](1)(0) != "number1") { - throw new UserCustomizedSampleException("value invalid") - } - if (collected(0).getSeq[String](1)(1) != "number2") { - throw new UserCustomizedSampleException("value invalid") - } - - df.write.options( - Map("avroSchema"->AvroHBaseRecord.schemaString, HBaseTableCatalog.tableCatalog->avroCatalogInsert, - HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - val newDF = withCatalog(avroCatalogInsert) - newDF.show() - newDF.printSchema() - if(newDF.count() != 256) { - throw new UserCustomizedSampleException("value invalid") - } - - df.filter($"col1.name" === "name005" || $"col1.name" <= "name005") - .select("col0", "col1.favorite_color", "col1.favorite_number") - .show() - - df.filter($"col1.name" <= "name005" || $"col1.name".contains("name007")) - .select("col0", "col1.favorite_color", "col1.favorite_number") - .show() - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/DataType.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/DataType.scala deleted file mode 100644 index 96c6d6e4f92..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/DataType.scala +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.example.datasources - -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog - -class UserCustomizedSampleException(message: String = null, cause: Throwable = null) extends - RuntimeException(UserCustomizedSampleException.message(message, cause), cause) - -object UserCustomizedSampleException { - def message(message: String, cause: Throwable) = - if (message != null) message - else if (cause != null) cause.toString() - else null -} - -case class IntKeyRecord( - col0: Integer, - col1: Boolean, - col2: Double, - col3: Float, - col4: Int, - col5: Long, - col6: Short, - col7: String, - col8: Byte) - -object IntKeyRecord { - def apply(i: Int): IntKeyRecord = { - IntKeyRecord(if (i % 2 == 0) i else -i, - i % 2 == 0, - i.toDouble, - i.toFloat, - i, - i.toLong, - i.toShort, - s"String$i extra", - i.toByte) - } -} - -object DataType { - val cat = s"""{ - |"table":{"namespace":"default", "name":"DataTypeExampleTable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"int"}, - |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, - |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, - |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, - |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, - |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, - |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, - |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, - |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} - |} - |}""".stripMargin - - def main(args: Array[String]){ - val sparkConf = new SparkConf().setAppName("DataTypeExample") - val sc = new SparkContext(sparkConf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - - def withCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map(HBaseTableCatalog.tableCatalog->cat)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - // test populate table - val data = (0 until 32).map { i => - IntKeyRecord(i) - } - sc.parallelize(data).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - - // test less than 0 - val df = withCatalog(cat) - val s = df.filter($"col0" < 0) - s.show() - if(s.count() != 16){ - throw new UserCustomizedSampleException("value invalid") - } - - //test less or equal than -10. The number of results is 11 - val num1 = df.filter($"col0" <= -10) - num1.show() - val c1 = num1.count() - println(s"test result count should be 11: $c1") - - //test less or equal than -9. The number of results is 12 - val num2 = df.filter($"col0" <= -9) - num2.show() - val c2 = num2.count() - println(s"test result count should be 12: $c2") - - //test greater or equal than -9". The number of results is 21 - val num3 = df.filter($"col0" >= -9) - num3.show() - val c3 = num3.count() - println(s"test result count should be 21: $c3") - - //test greater or equal than 0. The number of results is 16 - val num4 = df.filter($"col0" >= 0) - num4.show() - val c4 = num4.count() - println(s"test result count should be 16: $c4") - - //test greater than 10. The number of results is 10 - val num5 = df.filter($"col0" > 10) - num5.show() - val c5 = num5.count() - println(s"test result count should be 10: $c5") - - // test "and". 
The number of results is 11 - val num6 = df.filter($"col0" > -10 && $"col0" <= 10) - num6.show() - val c6 = num6.count() - println(s"test result count should be 11: $c6") - - //test "or". The number of results is 21 - val num7 = df.filter($"col0" <= -10 || $"col0" > 10) - num7.show() - val c7 = num7.count() - println(s"test result count should be 21: $c7") - - //test "all". The number of results is 32 - val num8 = df.filter($"col0" >= -100) - num8.show() - val c8 = num8.count() - println(s"test result count should be 32: $c8") - - //test "full query" - val df1 = withCatalog(cat) - df1.show() - val c_df = df1.count() - println(s"df count should be 32: $c_df") - if(c_df != 32){ - throw new UserCustomizedSampleException("value invalid") - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/HBaseSource.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/HBaseSource.scala deleted file mode 100644 index 056c071d5dd..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/datasources/HBaseSource.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.example.datasources - -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog - -case class HBaseRecord( - col0: String, - col1: Boolean, - col2: Double, - col3: Float, - col4: Int, - col5: Long, - col6: Short, - col7: String, - col8: Byte) - -object HBaseRecord { - def apply(i: Int): HBaseRecord = { - val s = s"""row${"%03d".format(i)}""" - HBaseRecord(s, - i % 2 == 0, - i.toDouble, - i.toFloat, - i, - i.toLong, - i.toShort, - s"String$i extra", - i.toByte) - } -} - -object HBaseSource { - val cat = s"""{ - |"table":{"namespace":"default", "name":"HBaseSourceExampleTable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, - |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, - |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, - |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, - |"col4":{"cf":"cf4", "col":"col4", "type":"int"}, - |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, - |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, - |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, - |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} - |} - |}""".stripMargin - - def main(args: Array[String]) { - val sparkConf = new SparkConf().setAppName("HBaseSourceExample") - val sc = new SparkContext(sparkConf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - - def withCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map(HBaseTableCatalog.tableCatalog->cat)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - val data = (0 to 255).map { i => - HBaseRecord(i) - } - - sc.parallelize(data).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - - val df = withCatalog(cat) - df.show() - df.filter($"col0" <= "row005") - .select($"col0", $"col1").show - df.filter($"col0" === "row005" || $"col0" <= "row005") - .select($"col0", $"col1").show - df.filter($"col0" > "row250") - .select($"col0", $"col1").show - df.registerTempTable("table1") - val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'") - c.show() - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkDeleteExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkDeleteExample.scala deleted file mode 100644 index 46135a59483..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkDeleteExample.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Delete -import org.apache.spark.SparkConf - -/** - * This is a simple example of deleting records in HBase - * with the bulkDelete function. - */ -object HBaseBulkDeleteExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseBulkDeleteExample {tableName} missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) - val sc = new SparkContext(sparkConf) - try { - //[Array[Byte]] - val rdd = sc.parallelize(Array( - Bytes.toBytes("1"), - Bytes.toBytes("2"), - Bytes.toBytes("3"), - Bytes.toBytes("4"), - Bytes.toBytes("5") - )) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - hbaseContext.bulkDelete[Array[Byte]](rdd, - TableName.valueOf(tableName), - putRecord => new Delete(putRecord), - 4) - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkGetExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkGetExample.scala deleted file mode 100644 index 1bdc90ddc6d..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkGetExample.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Get -import org.apache.hadoop.hbase.client.Result -import org.apache.spark.SparkConf - -/** - * This is a simple example of getting records from HBase - * with the bulkGet function. 
- */ -object HBaseBulkGetExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseBulkGetExample {tableName} missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) - val sc = new SparkContext(sparkConf) - - try { - - //[(Array[Byte])] - val rdd = sc.parallelize(Array( - Bytes.toBytes("1"), - Bytes.toBytes("2"), - Bytes.toBytes("3"), - Bytes.toBytes("4"), - Bytes.toBytes("5"), - Bytes.toBytes("6"), - Bytes.toBytes("7"))) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - val getRdd = hbaseContext.bulkGet[Array[Byte], String]( - TableName.valueOf(tableName), - 2, - rdd, - record => { - System.out.println("making Get") - new Get(record) - }, - (result: Result) => { - - val it = result.listCells().iterator() - val b = new StringBuilder - - b.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next() - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - b.toString() - }) - - getRdd.collect().foreach(v => println(v)) - - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExample.scala deleted file mode 100644 index 063f2c2d6bd..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExample.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Put -import org.apache.spark.SparkConf - -/** - * This is a simple example of putting records in HBase - * with the bulkPut function. 
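 * A minimal sketch of the core call, assuming an RDD of
 * (rowKey, Array[(family, qualifier, value)]) tuples as built in the example below:
 *
 *   hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](
 *     rdd,
 *     TableName.valueOf(tableName),
 *     putRecord => {
 *       val put = new Put(putRecord._1)                     // row key
 *       putRecord._2.foreach(v => put.addColumn(v._1, v._2, v._3))
 *       put
 *     })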
- */ -object HBaseBulkPutExample { - def main(args: Array[String]) { - if (args.length < 2) { - println("HBaseBulkPutExample {tableName} {columnFamily} are missing an arguments") - return - } - - val tableName = args(0) - val columnFamily = args(1) - - val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + - tableName + " " + columnFamily) - val sc = new SparkContext(sparkConf) - - try { - //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), - (Bytes.toBytes("5"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) - )) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => - put.addColumn(putValue._1, putValue._2, putValue._3)) - put - }); - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExampleFromFile.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExampleFromFile.scala deleted file mode 100644 index 37a0358653f..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExampleFromFile.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Put -import org.apache.hadoop.mapred.TextInputFormat -import org.apache.hadoop.io.LongWritable -import org.apache.hadoop.io.Text -import org.apache.spark.SparkConf - -/** - * This is a simple example of putting records in HBase - * with the bulkPut function. 
In this example we are - * getting the put information from a file - */ -object HBaseBulkPutExampleFromFile { - def main(args: Array[String]) { - if (args.length < 3) { - println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing an argument") - return - } - - val tableName = args(0) - val columnFamily = args(1) - val inputFile = args(2) - - val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " + - tableName + " " + columnFamily + " " + inputFile) - val sc = new SparkContext(sparkConf) - - try { - var rdd = sc.hadoopFile( - inputFile, - classOf[TextInputFormat], - classOf[LongWritable], - classOf[Text]).map(v => { - System.out.println("reading-" + v._2.toString) - v._2.toString - }) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - hbaseContext.bulkPut[String](rdd, - TableName.valueOf(tableName), - (putRecord) => { - System.out.println("hbase-" + putRecord) - val put = new Put(Bytes.toBytes("Value- " + putRecord)) - put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"), - Bytes.toBytes(putRecord.length())) - put - }); - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutTimestampExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutTimestampExample.scala deleted file mode 100644 index fa782166d7c..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutTimestampExample.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Put -import org.apache.spark.SparkConf - -/** - * This is a simple example of putting records in HBase - * with the bulkPut function. 
In this example we are - * also setting the timestamp in the put - */ -object HBaseBulkPutTimestampExample { - def main(args: Array[String]) { - if (args.length < 2) { - System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily} are missing an argument") - return - } - - val tableName = args(0) - val columnFamily = args(1) - - val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " + - tableName + " " + columnFamily) - val sc = new SparkContext(sparkConf) - - try { - - val rdd = sc.parallelize(Array( - (Bytes.toBytes("6"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), - (Bytes.toBytes("7"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), - (Bytes.toBytes("8"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), - (Bytes.toBytes("9"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), - (Bytes.toBytes("10"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))))) - - val conf = HBaseConfiguration.create() - - val timeStamp = System.currentTimeMillis() - - val hbaseContext = new HBaseContext(sc, conf) - hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, - timeStamp, putValue._3)) - put - }) - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseDistributedScanExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseDistributedScanExample.scala deleted file mode 100644 index bb2e79d08b4..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseDistributedScanExample.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Scan -import org.apache.spark.SparkConf -/** - * This is a simple example of scanning records from HBase - * with the hbaseRDD function in Distributed fashion. 
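 * A minimal sketch of the core call; each element of the resulting RDD pairs a
 * row key (ImmutableBytesWritable) with its Result:
 *
 *   val scan = new Scan()
 *   scan.setCaching(100)
 *   val scanRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)
 *   scanRdd.foreach(r => println(Bytes.toString(r._1.get())))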
- */ -object HBaseDistributedScanExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseDistributedScanExample {tableName} missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName ) - val sc = new SparkContext(sparkConf) - - try { - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - val scan = new Scan() - scan.setCaching(100) - - val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan) - - getRdd.foreach(v => println(Bytes.toString(v._1.get()))) - - println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length); - } finally { - sc.stop() - } - } - -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseStreamingBulkPutExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseStreamingBulkPutExample.scala deleted file mode 100644 index 8ac93efe481..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseStreamingBulkPutExample.scala +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.example.hbasecontext - -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.spark.SparkContext -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.client.Put -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.Seconds -import org.apache.spark.SparkConf - -/** - * This is a simple example of BulkPut with Spark Streaming - */ -object HBaseStreamingBulkPutExample { - def main(args: Array[String]) { - if (args.length < 4) { - println("HBaseStreamingBulkPutExample " + - "{host} {port} {tableName} {columnFamily} are missing an argument") - return - } - - val host = args(0) - val port = args(1) - val tableName = args(2) - val columnFamily = args(3) - - val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " + - tableName + " " + columnFamily) - val sc = new SparkContext(sparkConf) - try { - val ssc = new StreamingContext(sc, Seconds(1)) - - val lines = ssc.socketTextStream(host, port.toInt) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - hbaseContext.streamBulkPut[String](lines, - TableName.valueOf(tableName), - (putRecord) => { - if (putRecord.length() > 0) { - val put = new Put(Bytes.toBytes(putRecord)) - put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar")) - put - } else { - null - } - }) - ssc.start() - ssc.awaitTerminationOrTimeout(60000) - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkDeleteExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkDeleteExample.scala deleted file mode 100644 index 83d3f9e3013..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkDeleteExample.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.rdd - -import org.apache.hadoop.hbase.client.Delete -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.hadoop.hbase.util.Bytes - -import org.apache.spark.{SparkContext, SparkConf} - -/** - * This is a simple example of deleting records in HBase - * with the bulkDelete function. 
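 * A minimal sketch of the core call, using the implicit HBaseRDDFunctions on an
 * RDD of row keys (names as in the example below):
 *
 *   rdd.hbaseBulkDelete(hbaseContext,
 *     TableName.valueOf(tableName),
 *     rowKey => new Delete(rowKey),   // build a Delete for each row key
 *     4)                              // batch size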
- */ -object HBaseBulkDeleteExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseBulkDeleteExample {tableName} are missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) - val sc = new SparkContext(sparkConf) - try { - //[Array[Byte]] - val rdd = sc.parallelize(Array( - Bytes.toBytes("1"), - Bytes.toBytes("2"), - Bytes.toBytes("3"), - Bytes.toBytes("4"), - Bytes.toBytes("5") - )) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName), - putRecord => new Delete(putRecord), - 4) - - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkGetExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkGetExample.scala deleted file mode 100644 index eedabc3a6c1..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkGetExample.scala +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark.example.rdd - -import org.apache.hadoop.hbase.client.{Result, Get} -import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.spark.{SparkContext, SparkConf} - -/** - * This is a simple example of getting records from HBase - * with the bulkGet function. 
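 * A minimal sketch of the core call; the Result converter here simply returns
 * the row key as a String:
 *
 *   val getRdd = rdd.hbaseBulkGet[String](hbaseContext,
 *     TableName.valueOf(tableName),
 *     2,                                // how many Gets to batch per request
 *     record => new Get(record),
 *     (result: Result) => Bytes.toString(result.getRow))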
- */ -object HBaseBulkGetExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseBulkGetExample {tableName} is missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) - val sc = new SparkContext(sparkConf) - - try { - - //[(Array[Byte])] - val rdd = sc.parallelize(Array( - Bytes.toBytes("1"), - Bytes.toBytes("2"), - Bytes.toBytes("3"), - Bytes.toBytes("4"), - Bytes.toBytes("5"), - Bytes.toBytes("6"), - Bytes.toBytes("7"))) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2, - record => { - System.out.println("making Get") - new Get(record) - }, - (result: Result) => { - - val it = result.listCells().iterator() - val b = new StringBuilder - - b.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next() - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - b.toString() - }) - - getRdd.collect().foreach(v => println(v)) - - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkPutExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkPutExample.scala deleted file mode 100644 index 28711b8878d..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkPutExample.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark.example.rdd - -import org.apache.hadoop.hbase.client.Put -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} -import org.apache.spark.{SparkConf, SparkContext} - -/** - * This is a simple example of putting records in HBase - * with the bulkPut function. 
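 * A minimal sketch of the core call on an RDD of
 * (rowKey, Array[(family, qualifier, value)]) tuples, as built in the example below:
 *
 *   rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
 *     putRecord => {
 *       val put = new Put(putRecord._1)
 *       putRecord._2.foreach(v => put.addColumn(v._1, v._2, v._3))
 *       put
 *     })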
- */ -object HBaseBulkPutExample { - def main(args: Array[String]) { - if (args.length < 2) { - println("HBaseBulkPutExample {tableName} {columnFamily} are missing an arguments") - return - } - - val tableName = args(0) - val columnFamily = args(1) - - val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + - tableName + " " + columnFamily) - val sc = new SparkContext(sparkConf) - - try { - //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), - (Bytes.toBytes("5"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) - )) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, - putValue._3)) - put - }) - - } finally { - sc.stop() - } - } - } diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseForeachPartitionExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseForeachPartitionExample.scala deleted file mode 100644 index 8dfefc26184..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseForeachPartitionExample.scala +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.example.rdd - -import org.apache.hadoop.hbase.client.Put -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.{SparkContext, SparkConf} - -/** - * This is a simple example of using the foreachPartition - * method with a HBase connection - */ -object HBaseForeachPartitionExample { - def main(args: Array[String]) { - if (args.length < 2) { - println("HBaseForeachPartitionExample {tableName} {columnFamily} are missing an arguments") - return - } - - val tableName = args(0) - val columnFamily = args(1) - - val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " + - tableName + " " + columnFamily) - val sc = new SparkContext(sparkConf) - - try { - //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), - (Bytes.toBytes("5"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) - )) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - - rdd.hbaseForeachPartition(hbaseContext, - (it, connection) => { - val m = connection.getBufferedMutator(TableName.valueOf(tableName)) - - it.foreach(r => { - val put = new Put(r._1) - r._2.foreach((putValue) => - put.addColumn(putValue._1, putValue._2, putValue._3)) - m.mutate(put) - }) - m.flush() - m.close() - }) - - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseMapPartitionExample.scala b/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseMapPartitionExample.scala deleted file mode 100644 index 0d0b314b7eb..00000000000 --- a/hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseMapPartitionExample.scala +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark.example.rdd - -import org.apache.hadoop.hbase.client.Get -import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} -import org.apache.hadoop.hbase.spark.HBaseContext -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.{SparkContext, SparkConf} - -/** - * This is a simple example of using the mapPartitions - * method with a HBase connection - */ -object HBaseMapPartitionExample { - def main(args: Array[String]) { - if (args.length < 1) { - println("HBaseMapPartitionExample {tableName} is missing an argument") - return - } - - val tableName = args(0) - - val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName) - val sc = new SparkContext(sparkConf) - - try { - - //[(Array[Byte])] - val rdd = sc.parallelize(Array( - Bytes.toBytes("1"), - Bytes.toBytes("2"), - Bytes.toBytes("3"), - Bytes.toBytes("4"), - Bytes.toBytes("5"), - Bytes.toBytes("6"), - Bytes.toBytes("7"))) - - val conf = HBaseConfiguration.create() - - val hbaseContext = new HBaseContext(sc, conf) - - val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => { - val table = connection.getTable(TableName.valueOf(tableName)) - it.map{r => - //batching would be faster. This is just an example - val result = table.get(new Get(r)) - - val it = result.listCells().iterator() - val b = new StringBuilder - - b.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next() - val q = Bytes.toString(cell.getQualifierArray) - if (q.equals("counter")) { - b.append("(" + q + "," + Bytes.toLong(cell.getValueArray) + ")") - } else { - b.append("(" + q + "," + Bytes.toString(cell.getValueArray) + ")") - } - } - b.toString() - } - }) - - getRdd.collect().foreach(v => println(v)) - - } finally { - sc.stop() - } - } -} diff --git a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/DataTypeParserWrapper.scala b/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/DataTypeParserWrapper.scala deleted file mode 100644 index 3df23f958ee..00000000000 --- a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/DataTypeParserWrapper.scala +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.datasources.hbase - -import org.apache.spark.sql.catalyst.SqlLexical -import org.apache.spark.sql.catalyst.util.DataTypeParser -import org.apache.spark.sql.types.DataType - -// TODO: Only used in test suite. 
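// A minimal usage sketch (illustrative): DataTypeParserWrapper.parse("double")
// is expected to return DoubleType, delegating to Spark's DataTypeParser.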
-object DataTypeParserWrapper { - lazy val dataTypeParser = new DataTypeParser { - override val lexical = new SqlLexical - } - - def parse(dataTypeString: String): DataType = dataTypeParser.toDataType(dataTypeString) -} diff --git a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/HBaseTableCatalog.scala b/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/HBaseTableCatalog.scala deleted file mode 100644 index 65a3bc70e29..00000000000 --- a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/HBaseTableCatalog.scala +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.datasources.hbase - -import org.apache.avro.Schema -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.hadoop.hbase.spark.SchemaConverters -import org.apache.hadoop.hbase.spark.datasources._ -import org.apache.hadoop.hbase.spark.hbase._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.util.DataTypeParser -import org.apache.spark.sql.types._ -import org.json4s.jackson.JsonMethods._ - -import scala.collection.mutable - -// Due the access issue defined in spark, we have to locate the file in this package. 
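// For illustration, a catalog column entry such as
//   "col1":{"cf":"cf1", "col":"col1", "type":"boolean"}
// is turned by HBaseTableCatalog.apply into the Field case class defined below,
// roughly Field("col1", "cf1", "col1", sType = Some("boolean")), with the
// remaining parameters left at their defaults.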
-// The definition of each column cell, which may be composite type -// TODO: add avro support -@InterfaceAudience.Private -case class Field( - colName: String, - cf: String, - col: String, - sType: Option[String] = None, - avroSchema: Option[String] = None, - serdes: Option[SerDes]= None, - len: Int = -1) extends Logging { - override def toString = s"$colName $cf $col" - val isRowKey = cf == HBaseTableCatalog.rowKey - var start: Int = _ - def schema: Option[Schema] = avroSchema.map { x => - logDebug(s"avro: $x") - val p = new Schema.Parser - p.parse(x) - } - - lazy val exeSchema = schema - - // converter from avro to catalyst structure - lazy val avroToCatalyst: Option[Any => Any] = { - schema.map(SchemaConverters.createConverterToSQL(_)) - } - - // converter from catalyst to avro - lazy val catalystToAvro: (Any) => Any ={ - SchemaConverters.createConverterToAvro(dt, colName, "recordNamespace") - } - - def cfBytes: Array[Byte] = { - if (isRowKey) { - Bytes.toBytes("") - } else { - Bytes.toBytes(cf) - } - } - def colBytes: Array[Byte] = { - if (isRowKey) { - Bytes.toBytes("key") - } else { - Bytes.toBytes(col) - } - } - - val dt = { - sType.map(DataTypeParser.parse(_)).getOrElse{ - schema.map{ x=> - SchemaConverters.toSqlType(x).dataType - }.get - } - } - - var length: Int = { - if (len == -1) { - dt match { - case BinaryType | StringType => -1 - case BooleanType => Bytes.SIZEOF_BOOLEAN - case ByteType => 1 - case DoubleType => Bytes.SIZEOF_DOUBLE - case FloatType => Bytes.SIZEOF_FLOAT - case IntegerType => Bytes.SIZEOF_INT - case LongType => Bytes.SIZEOF_LONG - case ShortType => Bytes.SIZEOF_SHORT - case _ => -1 - } - } else { - len - } - - } - - override def equals(other: Any): Boolean = other match { - case that: Field => - colName == that.colName && cf == that.cf && col == that.col - case _ => false - } -} - -// The row key definition, with each key refer to the col defined in Field, e.g., -// key1:key2:key3 -@InterfaceAudience.Private -case class RowKey(k: String) { - val keys = k.split(":") - var fields: Seq[Field] = _ - var varLength = false - def length = { - if (varLength) { - -1 - } else { - fields.foldLeft(0){case (x, y) => - x + y.length - } - } - } -} -// The map between the column presented to Spark and the HBase field -@InterfaceAudience.Private -case class SchemaMap(map: mutable.HashMap[String, Field]) { - def toFields = map.map { case (name, field) => - StructField(name, field.dt) - }.toSeq - - def fields = map.values - - def getField(name: String) = map(name) -} - - -// The definition of HBase and Relation relation schema -@InterfaceAudience.Private -case class HBaseTableCatalog( - namespace: String, - name: String, - row: RowKey, - sMap: SchemaMap, - @transient params: Map[String, String]) extends Logging { - def toDataType = StructType(sMap.toFields) - def getField(name: String) = sMap.getField(name) - def getRowKey: Seq[Field] = row.fields - def getPrimaryKey= row.keys(0) - def getColumnFamilies = { - sMap.fields.map(_.cf).filter(_ != HBaseTableCatalog.rowKey).toSeq.distinct - } - - def get(key: String) = params.get(key) - - // Setup the start and length for each dimension of row key at runtime. 
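// For example, with a composite row key "key1:key2" where key1 is a string and
// key2 an int: key1's length is found by scanning for the 0-byte delimiter
// (falling back to the end of the row key), and key2 then starts immediately
// after it with the fixed length of an int (4 bytes).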
- def dynSetupRowKey(rowKey: Array[Byte]) { - logDebug(s"length: ${rowKey.length}") - if(row.varLength) { - var start = 0 - row.fields.foreach { f => - logDebug(s"start: $start") - f.start = start - f.length = { - // If the length is not defined - if (f.length == -1) { - f.dt match { - case StringType => - var pos = rowKey.indexOf(HBaseTableCatalog.delimiter, start) - if (pos == -1 || pos > rowKey.length) { - // this is at the last dimension - pos = rowKey.length - } - pos - start - // We don't know the length, assume it extend to the end of the rowkey. - case _ => rowKey.length - start - } - } else { - f.length - } - } - start += f.length - } - } - } - - def initRowKey = { - val fields = sMap.fields.filter(_.cf == HBaseTableCatalog.rowKey) - row.fields = row.keys.flatMap(n => fields.find(_.col == n)) - // The length is determined at run time if it is string or binary and the length is undefined. - if (row.fields.filter(_.length == -1).isEmpty) { - var start = 0 - row.fields.foreach { f => - f.start = start - start += f.length - } - } else { - row.varLength = true - } - } - initRowKey -} - -@InterfaceAudience.Public -object HBaseTableCatalog { - // If defined and larger than 3, a new table will be created with the nubmer of region specified. - val newTable = "newtable" - // The json string specifying hbase catalog information - val regionStart = "regionStart" - val defaultRegionStart = "aaaaaaa" - val regionEnd = "regionEnd" - val defaultRegionEnd = "zzzzzzz" - val tableCatalog = "catalog" - // The row key with format key1:key2 specifying table row key - val rowKey = "rowkey" - // The key for hbase table whose value specify namespace and table name - val table = "table" - // The namespace of hbase table - val nameSpace = "namespace" - // The name of hbase table - val tableName = "name" - // The name of columns in hbase catalog - val columns = "columns" - val cf = "cf" - val col = "col" - val `type` = "type" - // the name of avro schema json string - val avro = "avro" - val delimiter: Byte = 0 - val serdes = "serdes" - val length = "length" - - /** - * User provide table schema definition - * {"tablename":"name", "rowkey":"key1:key2", - * "columns":{"col1":{"cf":"cf1", "col":"col1", "type":"type1"}, - * "col2":{"cf":"cf2", "col":"col2", "type":"type2"}}} - * Note that any col in the rowKey, there has to be one corresponding col defined in columns - */ - def apply(params: Map[String, String]): HBaseTableCatalog = { - val parameters = convert(params) - // println(jString) - val jString = parameters(tableCatalog) - val map = parse(jString).values.asInstanceOf[Map[String, _]] - val tableMeta = map.get(table).get.asInstanceOf[Map[String, _]] - val nSpace = tableMeta.get(nameSpace).getOrElse("default").asInstanceOf[String] - val tName = tableMeta.get(tableName).get.asInstanceOf[String] - val cIter = map.get(columns).get.asInstanceOf[Map[String, Map[String, String]]].toIterator - val schemaMap = mutable.HashMap.empty[String, Field] - cIter.foreach { case (name, column) => - val sd = { - column.get(serdes).asInstanceOf[Option[String]].map(n => - Class.forName(n).newInstance().asInstanceOf[SerDes] - ) - } - val len = column.get(length).map(_.toInt).getOrElse(-1) - val sAvro = column.get(avro).map(parameters(_)) - val f = Field(name, column.getOrElse(cf, rowKey), - column.get(col).get, - column.get(`type`), - sAvro, sd, len) - schemaMap.+=((name, f)) - } - val rKey = RowKey(map.get(rowKey).get.asInstanceOf[String]) - HBaseTableCatalog(nSpace, tName, rKey, SchemaMap(schemaMap), parameters) - } - - val 
TABLE_KEY: String = "hbase.table" - val SCHEMA_COLUMNS_MAPPING_KEY: String = "hbase.columns.mapping" - - /* for backward compatibility. Convert the old definition to new json based definition formated as below - val catalog = s"""{ - |"table":{"namespace":"default", "name":"htable"}, - |"rowkey":"key1:key2", - |"columns":{ - |"col1":{"cf":"rowkey", "col":"key1", "type":"string"}, - |"col2":{"cf":"rowkey", "col":"key2", "type":"double"}, - |"col3":{"cf":"cf1", "col":"col2", "type":"binary"}, - |"col4":{"cf":"cf1", "col":"col3", "type":"timestamp"}, - |"col5":{"cf":"cf1", "col":"col4", "type":"double", "serdes":"${classOf[DoubleSerDes].getName}"}, - |"col6":{"cf":"cf1", "col":"col5", "type":"$map"}, - |"col7":{"cf":"cf1", "col":"col6", "type":"$array"}, - |"col8":{"cf":"cf1", "col":"col7", "type":"$arrayMap"} - |} - |}""".stripMargin - */ - @deprecated("Please use new json format to define HBaseCatalog") - // TODO: There is no need to deprecate since this is the first release. - def convert(parameters: Map[String, String]): Map[String, String] = { - val tableName = parameters.get(TABLE_KEY).getOrElse(null) - // if the hbase.table is not defined, we assume it is json format already. - if (tableName == null) return parameters - val schemaMappingString = parameters.getOrElse(SCHEMA_COLUMNS_MAPPING_KEY, "") - import scala.collection.JavaConverters._ - val schemaMap = generateSchemaMappingMap(schemaMappingString).asScala.map(_._2.asInstanceOf[SchemaQualifierDefinition]) - - val rowkey = schemaMap.filter { - _.columnFamily == "rowkey" - }.map(_.columnName) - val cols = schemaMap.map { x => - s""""${x.columnName}":{"cf":"${x.columnFamily}", "col":"${x.qualifier}", "type":"${x.colType}"}""".stripMargin - } - val jsonCatalog = - s"""{ - |"table":{"namespace":"default", "name":"${tableName}"}, - |"rowkey":"${rowkey.mkString(":")}", - |"columns":{ - |${cols.mkString(",")} - |} - |} - """.stripMargin - parameters ++ Map(HBaseTableCatalog.tableCatalog->jsonCatalog) - } - - /** - * Reads the SCHEMA_COLUMNS_MAPPING_KEY and converts it to a map of - * SchemaQualifierDefinitions with the original sql column name as the key - * - * @param schemaMappingString The schema mapping string from the SparkSQL map - * @return A map of definitions keyed by the SparkSQL column name - */ - @InterfaceAudience.Private - def generateSchemaMappingMap(schemaMappingString:String): - java.util.HashMap[String, SchemaQualifierDefinition] = { - println(schemaMappingString) - try { - val columnDefinitions = schemaMappingString.split(',') - val resultingMap = new java.util.HashMap[String, SchemaQualifierDefinition]() - columnDefinitions.map(cd => { - val parts = cd.trim.split(' ') - - //Make sure we get three parts - // - if (parts.length == 3) { - val hbaseDefinitionParts = if (parts(2).charAt(0) == ':') { - Array[String]("rowkey", parts(0)) - } else { - parts(2).split(':') - } - resultingMap.put(parts(0), new SchemaQualifierDefinition(parts(0), - parts(1), hbaseDefinitionParts(0), hbaseDefinitionParts(1))) - } else { - throw new IllegalArgumentException("Invalid value for schema mapping '" + cd + - "' should be ' :' " + - "for columns and ' :' for rowKeys") - } - }) - resultingMap - } catch { - case e:Exception => throw - new IllegalArgumentException("Invalid value for " + SCHEMA_COLUMNS_MAPPING_KEY + - " '" + - schemaMappingString + "'", e ) - } - } -} - -/** - * Construct to contains column data that spend SparkSQL and HBase - * - * @param columnName SparkSQL column name - * @param colType SparkSQL column type - * @param 
columnFamily HBase column family - * @param qualifier HBase qualifier name - */ -@InterfaceAudience.Private -case class SchemaQualifierDefinition(columnName:String, - colType:String, - columnFamily:String, - qualifier:String) diff --git a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/Utils.scala b/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/Utils.scala deleted file mode 100644 index 36b8bbf68e6..00000000000 --- a/hbase-spark/src/main/scala/org/apache/spark/sql/datasources/hbase/Utils.scala +++ /dev/null @@ -1,100 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.datasources.hbase - -import org.apache.hadoop.hbase.spark.AvroSerdes -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.sql.execution.SparkSqlSerializer -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - -import org.apache.yetus.audience.InterfaceAudience; - -@InterfaceAudience.Private -object Utils { - - - /** - * Parses the hbase field to it's corresponding - * scala type which can then be put into a Spark GenericRow - * which is then automatically converted by Spark. 
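 * For example, an IntegerType field decodes the four bytes at the given offset
 * via Bytes.toInt(src, offset), while a StringType field is wrapped as a
 * UTF8String over the slice (offset, offset + length).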
- */ - def hbaseFieldToScalaType( - f: Field, - src: Array[Byte], - offset: Int, - length: Int): Any = { - if (f.exeSchema.isDefined) { - // If we have avro schema defined, use it to get record, and then convert them to catalyst data type - val m = AvroSerdes.deserialize(src, f.exeSchema.get) - val n = f.avroToCatalyst.map(_(m)) - n.get - } else { - // Fall back to atomic type - f.dt match { - case BooleanType => toBoolean(src, offset) - case ByteType => src(offset) - case DoubleType => Bytes.toDouble(src, offset) - case FloatType => Bytes.toFloat(src, offset) - case IntegerType => Bytes.toInt(src, offset) - case LongType|TimestampType => Bytes.toLong(src, offset) - case ShortType => Bytes.toShort(src, offset) - case StringType => toUTF8String(src, offset, length) - case BinaryType => - val newArray = new Array[Byte](length) - System.arraycopy(src, offset, newArray, 0, length) - newArray - // TODO: add more data type support - case _ => SparkSqlSerializer.deserialize[Any](src) - } - } - } - - // convert input to data type - def toBytes(input: Any, field: Field): Array[Byte] = { - if (field.schema.isDefined) { - // Here we assume the top level type is structType - val record = field.catalystToAvro(input) - AvroSerdes.serialize(record, field.schema.get) - } else { - input match { - case data: Boolean => Bytes.toBytes(data) - case data: Byte => Array(data) - case data: Array[Byte] => data - case data: Double => Bytes.toBytes(data) - case data: Float => Bytes.toBytes(data) - case data: Int => Bytes.toBytes(data) - case data: Long => Bytes.toBytes(data) - case data: Short => Bytes.toBytes(data) - case data: UTF8String => data.getBytes - case data: String => Bytes.toBytes(data) - // TODO: add more data type support - case _ => throw new Exception(s"unsupported data type ${field.dt}") - } - } - } - - def toBoolean(input: Array[Byte], offset: Int): Boolean = { - input(offset) != 0 - } - - def toUTF8String(input: Array[Byte], offset: Int, length: Int): UTF8String = { - UTF8String.fromBytes(input.slice(offset, offset + length)) - } -} diff --git a/hbase-spark/src/test/java/org/apache/hadoop/hbase/spark/TestJavaHBaseContext.java b/hbase-spark/src/test/java/org/apache/hadoop/hbase/spark/TestJavaHBaseContext.java deleted file mode 100644 index e383b5e0bfb..00000000000 --- a/hbase-spark/src/test/java/org/apache/hadoop/hbase/spark/TestJavaHBaseContext.java +++ /dev/null @@ -1,520 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark; - -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.HashMap; -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.Cell; -import org.apache.hadoop.hbase.CellUtil; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Admin; -import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.ConnectionFactory; -import org.apache.hadoop.hbase.client.Delete; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.Scan; -import org.apache.hadoop.hbase.client.Table; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles; -import org.apache.hadoop.hbase.spark.example.hbasecontext.JavaHBaseBulkDeleteExample; -import org.apache.hadoop.hbase.testclassification.MediumTests; -import org.apache.hadoop.hbase.testclassification.MiscTests; -import org.apache.hadoop.hbase.util.Bytes; - -import org.apache.hadoop.hbase.util.Pair; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -import scala.Tuple2; -import org.apache.hadoop.hbase.shaded.com.google.common.io.Files; - -@Category({MiscTests.class, MediumTests.class}) -public class TestJavaHBaseContext implements Serializable { - private transient JavaSparkContext jsc; - HBaseTestingUtility htu; - protected static final Log LOG = LogFactory.getLog(TestJavaHBaseContext.class); - - - - byte[] tableName = Bytes.toBytes("t1"); - byte[] columnFamily = Bytes.toBytes("c"); - byte[] columnFamily1 = Bytes.toBytes("d"); - String columnFamilyStr = Bytes.toString(columnFamily); - String columnFamilyStr1 = Bytes.toString(columnFamily1); - - - @Before - public void setUp() { - jsc = new JavaSparkContext("local", "JavaHBaseContextSuite"); - - File tempDir = Files.createTempDir(); - tempDir.deleteOnExit(); - - htu = new HBaseTestingUtility(); - try { - LOG.info("cleaning up test dir"); - - htu.cleanupTestDir(); - - LOG.info("starting minicluster"); - - htu.startMiniZKCluster(); - htu.startMiniHBaseCluster(1, 1); - - LOG.info(" - minicluster started"); - - try { - htu.deleteTable(TableName.valueOf(tableName)); - } catch (Exception e) { - LOG.info(" - no table " + Bytes.toString(tableName) + " found"); - } - - LOG.info(" - creating table " + Bytes.toString(tableName)); - htu.createTable(TableName.valueOf(tableName), - new byte[][]{columnFamily, columnFamily1}); - LOG.info(" - created table"); - } catch (Exception e1) { - throw new RuntimeException(e1); - } - } - - @After - public void tearDown() { - try { - htu.deleteTable(TableName.valueOf(tableName)); - LOG.info("shuting down minicluster"); - htu.shutdownMiniHBaseCluster(); - htu.shutdownMiniZKCluster(); - LOG.info(" - minicluster shut down"); - htu.cleanupTestDir(); - } catch (Exception e) { - throw new RuntimeException(e); - } - 
jsc.stop(); - jsc = null; - } - - @Test - public void testBulkPut() throws IOException { - - List list = new ArrayList<>(5); - list.add("1," + columnFamilyStr + ",a,1"); - list.add("2," + columnFamilyStr + ",a,2"); - list.add("3," + columnFamilyStr + ",a,3"); - list.add("4," + columnFamilyStr + ",a,4"); - list.add("5," + columnFamilyStr + ",a,5"); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = htu.getConfiguration(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - Connection conn = ConnectionFactory.createConnection(conf); - Table table = conn.getTable(TableName.valueOf(tableName)); - - try { - List deletes = new ArrayList<>(5); - for (int i = 1; i < 6; i++) { - deletes.add(new Delete(Bytes.toBytes(Integer.toString(i)))); - } - table.delete(deletes); - } finally { - table.close(); - } - - hbaseContext.bulkPut(rdd, - TableName.valueOf(tableName), - new PutFunction()); - - table = conn.getTable(TableName.valueOf(tableName)); - - try { - Result result1 = table.get(new Get(Bytes.toBytes("1"))); - Assert.assertNotNull("Row 1 should had been deleted", result1.getRow()); - - Result result2 = table.get(new Get(Bytes.toBytes("2"))); - Assert.assertNotNull("Row 2 should had been deleted", result2.getRow()); - - Result result3 = table.get(new Get(Bytes.toBytes("3"))); - Assert.assertNotNull("Row 3 should had been deleted", result3.getRow()); - - Result result4 = table.get(new Get(Bytes.toBytes("4"))); - Assert.assertNotNull("Row 4 should had been deleted", result4.getRow()); - - Result result5 = table.get(new Get(Bytes.toBytes("5"))); - Assert.assertNotNull("Row 5 should had been deleted", result5.getRow()); - } finally { - table.close(); - conn.close(); - } - } - - public static class PutFunction implements Function { - - private static final long serialVersionUID = 1L; - - public Put call(String v) throws Exception { - String[] cells = v.split(","); - Put put = new Put(Bytes.toBytes(cells[0])); - - put.addColumn(Bytes.toBytes(cells[1]), Bytes.toBytes(cells[2]), - Bytes.toBytes(cells[3])); - return put; - } - } - - @Test - public void testBulkDelete() throws IOException { - List list = new ArrayList<>(3); - list.add(Bytes.toBytes("1")); - list.add(Bytes.toBytes("2")); - list.add(Bytes.toBytes("3")); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = htu.getConfiguration(); - - populateTableWithMockData(conf, TableName.valueOf(tableName)); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.bulkDelete(rdd, TableName.valueOf(tableName), - new JavaHBaseBulkDeleteExample.DeleteFunction(), 2); - - - - try ( - Connection conn = ConnectionFactory.createConnection(conf); - Table table = conn.getTable(TableName.valueOf(tableName)) - ){ - Result result1 = table.get(new Get(Bytes.toBytes("1"))); - Assert.assertNull("Row 1 should had been deleted", result1.getRow()); - - Result result2 = table.get(new Get(Bytes.toBytes("2"))); - Assert.assertNull("Row 2 should had been deleted", result2.getRow()); - - Result result3 = table.get(new Get(Bytes.toBytes("3"))); - Assert.assertNull("Row 3 should had been deleted", result3.getRow()); - - Result result4 = table.get(new Get(Bytes.toBytes("4"))); - Assert.assertNotNull("Row 4 should had been deleted", result4.getRow()); - - Result result5 = table.get(new Get(Bytes.toBytes("5"))); - Assert.assertNotNull("Row 5 should had been deleted", result5.getRow()); - } - } - - @Test - public void testDistributedScan() throws IOException { - Configuration conf = htu.getConfiguration(); - - 
populateTableWithMockData(conf, TableName.valueOf(tableName)); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - Scan scan = new Scan(); - scan.setCaching(100); - - JavaRDD javaRdd = - hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan) - .map(new ScanConvertFunction()); - - List results = javaRdd.collect(); - - Assert.assertEquals(results.size(), 5); - } - - private static class ScanConvertFunction implements - Function, String> { - @Override - public String call(Tuple2 v1) throws Exception { - return Bytes.toString(v1._1().copyBytes()); - } - } - - @Test - public void testBulkGet() throws IOException { - List list = new ArrayList<>(5); - list.add(Bytes.toBytes("1")); - list.add(Bytes.toBytes("2")); - list.add(Bytes.toBytes("3")); - list.add(Bytes.toBytes("4")); - list.add(Bytes.toBytes("5")); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = htu.getConfiguration(); - - populateTableWithMockData(conf, TableName.valueOf(tableName)); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - final JavaRDD stringJavaRDD = - hbaseContext.bulkGet(TableName.valueOf(tableName), 2, rdd, - new GetFunction(), - new ResultFunction()); - - Assert.assertEquals(stringJavaRDD.count(), 5); - } - - @Test - public void testBulkLoad() throws Exception { - - Path output = htu.getDataTestDir("testBulkLoad"); - // Add cell as String: "row,falmily,qualifier,value" - List list= new ArrayList(); - // row1 - list.add("1," + columnFamilyStr + ",b,1"); - // row3 - list.add("3," + columnFamilyStr + ",a,2"); - list.add("3," + columnFamilyStr + ",b,1"); - list.add("3," + columnFamilyStr1 + ",a,1"); - //row2 - list.add("2," + columnFamilyStr + ",a,3"); - list.add("2," + columnFamilyStr + ",b,3"); - - JavaRDD rdd = jsc.parallelize(list); - - Configuration conf = htu.getConfiguration(); - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - - - hbaseContext.bulkLoad(rdd, TableName.valueOf(tableName), new BulkLoadFunction(), output.toUri().getPath(), - new HashMap(), false, HConstants.DEFAULT_MAX_FILE_SIZE); - - try (Connection conn = ConnectionFactory.createConnection(conf); Admin admin = conn.getAdmin()) { - Table table = conn.getTable(TableName.valueOf(tableName)); - // Do bulk load - LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf); - load.doBulkLoad(output, admin, table, conn.getRegionLocator(TableName.valueOf(tableName))); - - - - // Check row1 - List cell1 = table.get(new Get(Bytes.toBytes("1"))).listCells(); - Assert.assertEquals(cell1.size(), 1); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell1.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell1.get(0))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell1.get(0))), "1"); - - // Check row3 - List cell3 = table.get(new Get(Bytes.toBytes("3"))).listCells(); - Assert.assertEquals(cell3.size(), 3); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(0))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(0))), "2"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(1))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(1))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(1))), "1"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(2))), columnFamilyStr1); - 
Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(2))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(2))), "1"); - - // Check row2 - List cell2 = table.get(new Get(Bytes.toBytes("2"))).listCells(); - Assert.assertEquals(cell2.size(), 2); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell2.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell2.get(0))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell2.get(0))), "3"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell2.get(1))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell2.get(1))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell2.get(1))), "3"); - } - } - - @Test - public void testBulkLoadThinRows() throws Exception { - Path output = htu.getDataTestDir("testBulkLoadThinRows"); - // because of the limitation of scala bulkLoadThinRows API - // we need to provide data as - List> list= new ArrayList>(); - // row1 - List list1 = new ArrayList(); - list1.add("1," + columnFamilyStr + ",b,1"); - list.add(list1); - // row3 - List list3 = new ArrayList(); - list3.add("3," + columnFamilyStr + ",a,2"); - list3.add("3," + columnFamilyStr + ",b,1"); - list3.add("3," + columnFamilyStr1 + ",a,1"); - list.add(list3); - //row2 - List list2 = new ArrayList(); - list2.add("2," + columnFamilyStr + ",a,3"); - list2.add("2," + columnFamilyStr + ",b,3"); - list.add(list2); - - JavaRDD> rdd = jsc.parallelize(list); - - Configuration conf = htu.getConfiguration(); - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.bulkLoadThinRows(rdd, TableName.valueOf(tableName), new BulkLoadThinRowsFunction(), output.toString(), - new HashMap(), false, HConstants.DEFAULT_MAX_FILE_SIZE); - - - try (Connection conn = ConnectionFactory.createConnection(conf); Admin admin = conn.getAdmin()) { - Table table = conn.getTable(TableName.valueOf(tableName)); - // Do bulk load - LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf); - load.doBulkLoad(output, admin, table, conn.getRegionLocator(TableName.valueOf(tableName))); - - // Check row1 - List cell1 = table.get(new Get(Bytes.toBytes("1"))).listCells(); - Assert.assertEquals(cell1.size(), 1); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell1.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell1.get(0))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell1.get(0))), "1"); - - // Check row3 - List cell3 = table.get(new Get(Bytes.toBytes("3"))).listCells(); - Assert.assertEquals(cell3.size(), 3); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(0))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(0))), "2"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(1))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(1))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(1))), "1"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell3.get(2))), columnFamilyStr1); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell3.get(2))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell3.get(2))), "1"); - - // Check row2 - List cell2 = table.get(new Get(Bytes.toBytes("2"))).listCells(); - 
Assert.assertEquals(cell2.size(), 2); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell2.get(0))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell2.get(0))), "a"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell2.get(0))), "3"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneFamily(cell2.get(1))), columnFamilyStr); - Assert.assertEquals(Bytes.toString(CellUtil.cloneQualifier(cell2.get(1))), "b"); - Assert.assertEquals(Bytes.toString(CellUtil.cloneValue(cell2.get(1))), "3"); - } - - } - public static class BulkLoadFunction implements Function> { - - @Override public Pair call(String v1) throws Exception { - if (v1 == null) - return null; - String[] strs = v1.split(","); - if(strs.length != 4) - return null; - KeyFamilyQualifier kfq = new KeyFamilyQualifier(Bytes.toBytes(strs[0]), Bytes.toBytes(strs[1]), - Bytes.toBytes(strs[2])); - return new Pair(kfq, Bytes.toBytes(strs[3])); - } - } - - public static class BulkLoadThinRowsFunction implements Function, Pair> { - - @Override public Pair call(List list) throws Exception { - if (list == null) - return null; - ByteArrayWrapper rowKey = null; - FamiliesQualifiersValues fqv = new FamiliesQualifiersValues(); - for (String cell : list) { - String[] strs = cell.split(","); - if (rowKey == null) { - rowKey = new ByteArrayWrapper(Bytes.toBytes(strs[0])); - } - fqv.add(Bytes.toBytes(strs[1]), Bytes.toBytes(strs[2]), Bytes.toBytes(strs[3])); - } - return new Pair(rowKey, fqv); - } - } - - public static class GetFunction implements Function { - - private static final long serialVersionUID = 1L; - - public Get call(byte[] v) throws Exception { - return new Get(v); - } - } - - public static class ResultFunction implements Function { - - private static final long serialVersionUID = 1L; - - public String call(Result result) throws Exception { - Iterator it = result.listCells().iterator(); - StringBuilder b = new StringBuilder(); - - b.append(Bytes.toString(result.getRow())).append(":"); - - while (it.hasNext()) { - Cell cell = it.next(); - String q = Bytes.toString(CellUtil.cloneQualifier(cell)); - if ("counter".equals(q)) { - b.append("(") - .append(q) - .append(",") - .append(Bytes.toLong(CellUtil.cloneValue(cell))) - .append(")"); - } else { - b.append("(") - .append(q) - .append(",") - .append(Bytes.toString(CellUtil.cloneValue(cell))) - .append(")"); - } - } - return b.toString(); - } - } - - private void populateTableWithMockData(Configuration conf, TableName tableName) - throws IOException { - try ( - Connection conn = ConnectionFactory.createConnection(conf); - Table table = conn.getTable(tableName)) { - - List puts = new ArrayList<>(5); - - for (int i = 1; i < 6; i++) { - Put put = new Put(Bytes.toBytes(Integer.toString(i))); - put.addColumn(columnFamily, columnFamily, columnFamily); - puts.add(put); - } - table.put(puts); - } - } - -} diff --git a/hbase-spark/src/test/resources/hbase-site.xml b/hbase-spark/src/test/resources/hbase-site.xml deleted file mode 100644 index b3fb0d90c50..00000000000 --- a/hbase-spark/src/test/resources/hbase-site.xml +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - hbase.regionserver.msginterval - 1000 - Interval between messages from the RegionServer to HMaster - in milliseconds. Default is 15. Set this value low if you want unit - tests to be responsive. - - - - hbase.defaults.for.version.skip - true - - - hbase.server.thread.wakefrequency - 1000 - Time to sleep in between searches for work (in milliseconds). 
- Used as sleep interval by service threads such as hbase:meta scanner and log roller. - - - - hbase.master.event.waiting.time - 50 - Time to sleep between checks to see if a table event took place. - - - - hbase.regionserver.handler.count - 5 - - - hbase.regionserver.metahandler.count - 5 - - - hbase.ipc.server.read.threadpool.size - 3 - - - hbase.master.info.port - -1 - The port for the hbase master web UI - Set to -1 if you do not want the info server to run. - - - - hbase.master.port - 0 - Always have masters and regionservers come up on port '0' so we don't clash over - default ports. - - - - hbase.regionserver.port - 0 - Always have masters and regionservers come up on port '0' so we don't clash over - default ports. - - - - hbase.ipc.client.fallback-to-simple-auth-allowed - true - - - - hbase.regionserver.info.port - -1 - The port for the hbase regionserver web UI - Set to -1 if you do not want the info server to run. - - - - hbase.regionserver.info.port.auto - true - Info server auto port bind. Enables automatic port - search if hbase.regionserver.info.port is already in use. - Enabled for testing to run multiple tests on one machine. - - - - hbase.regionserver.safemode - false - - Turn on/off safe mode in region server. Always on for production, always off - for tests. - - - - hbase.hregion.max.filesize - 67108864 - - Maximum desired file size for an HRegion. If filesize exceeds - value + (value / 2), the HRegion is split in two. Default: 256M. - - Keep the maximum filesize small so we split more often in tests. - - - - hadoop.log.dir - ${user.dir}/../logs - - - hbase.zookeeper.property.clientPort - 21818 - Property from ZooKeeper's config zoo.cfg. - The port at which the clients will connect. - - - - hbase.defaults.for.version.skip - true - - Set to true to skip the 'hbase.defaults.for.version'. - Setting this to true can be useful in contexts other than - the other side of a maven generation; i.e. running in an - ide. You'll want to set this boolean to true to avoid - seeing the RuntimeException complaint: "hbase-default.xml file - seems to be for and old version of HBase (@@@VERSION@@@), this - version is X.X.X-SNAPSHOT" - - - - hbase.table.sanity.checks - false - Skip sanity checks in tests - - - - hbase.procedure.fail.on.corruption - true - - Enable replay sanity checks on procedure tests. - - - diff --git a/hbase-spark/src/test/resources/log4j.properties b/hbase-spark/src/test/resources/log4j.properties deleted file mode 100644 index cd3b8e9d8c7..00000000000 --- a/hbase-spark/src/test/resources/log4j.properties +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Define some default values that can be overridden by system properties -hbase.root.logger=INFO,FA -hbase.log.dir=. 
-hbase.log.file=hbase.log - -# Define the root logger to the system property "hbase.root.logger". -log4j.rootLogger=${hbase.root.logger} - -# Logging Threshold -log4j.threshold=ALL - -# -# Daily Rolling File Appender -# -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender -log4j.appender.DRFA.File=${hbase.log.dir}/${hbase.log.file} - -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd - -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout -# Debugging Pattern format -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %C{2}(%L): %m%n - - -# -# console -# Add "console" to rootlogger above if you want to use this -# -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %C{2}(%L): %m%n - -#File Appender -log4j.appender.FA=org.apache.log4j.FileAppender -log4j.appender.FA.append=false -log4j.appender.FA.file=target/log-output.txt -log4j.appender.FA.layout=org.apache.log4j.PatternLayout -log4j.appender.FA.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %C{2}(%L): %m%n -log4j.appender.FA.Threshold = INFO - -# Custom Logging levels - -#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG - -log4j.logger.org.apache.hadoop=WARN -log4j.logger.org.apache.zookeeper=ERROR -log4j.logger.org.apache.hadoop.hbase=DEBUG - -#These settings are workarounds against spurious logs from the minicluster. -#See HBASE-4709 -log4j.logger.org.apache.hadoop.metrics2.impl.MetricsConfig=WARN -log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSinkAdapter=WARN -log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=WARN -log4j.logger.org.apache.hadoop.metrics2.util.MBeans=WARN -# Enable this to get detailed connection error/retry logging. -# log4j.logger.org.apache.hadoop.hbase.client.ConnectionImplementation=TRACE diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/BulkLoadSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/BulkLoadSuite.scala deleted file mode 100644 index a42732788d5..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/BulkLoadSuite.scala +++ /dev/null @@ -1,956 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.hbase.client.{Get, ConnectionFactory} -import org.apache.hadoop.hbase.io.hfile.{CacheConfig, HFile} -import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles -import org.apache.hadoop.hbase.{HConstants, CellUtil, HBaseTestingUtility, TableName} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.spark.{SparkContext, Logging} -import org.junit.rules.TemporaryFolder -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -class BulkLoadSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - @transient var sc: SparkContext = null - var TEST_UTIL = new HBaseTestingUtility - - val tableName = "t1" - val columnFamily1 = "f1" - val columnFamily2 = "f2" - val testFolder = new TemporaryFolder() - - - override def beforeAll() { - TEST_UTIL.startMiniCluster() - logInfo(" - minicluster started") - - try { - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - } catch { - case e: Exception => - logInfo(" - no table " + tableName + " found") - } - - logInfo(" - created table") - - val envMap = Map[String,String](("Xmx", "512m")) - - sc = new SparkContext("local", "test", null, Nil, envMap) - } - - override def afterAll() { - logInfo("shuting down minicluster") - TEST_UTIL.shutdownMiniCluster() - logInfo(" - minicluster shut down") - TEST_UTIL.cleanupTestDir() - sc.stop() - } - - test("Wide Row Bulk Load: Test multi family and multi column tests " + - "with all default HFile Configs.") { - val config = TEST_UTIL.getConfiguration - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2))) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. 
There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo2.a"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("a"), Bytes.toBytes("foo2.b"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.c"))), - (Bytes.toBytes("5"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))) - - - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - hbaseContext.bulkLoad[(Array[Byte], (Array[Byte], Array[Byte], Array[Byte]))](rdd, - TableName.valueOf(tableName), - t => { - val rowKey = t._1 - val family:Array[Byte] = t._2._1 - val qualifier = t._2._2 - val value:Array[Byte] = t._2._3 - - val keyFamilyQualifier= new KeyFamilyQualifier(rowKey, family, qualifier) - - Seq((keyFamilyQualifier, value)).iterator - }, - stagingFolder.getPath) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 2) - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), conn.getAdmin, table, - conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f2")) - 
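The wide-row test above boils down to a two-step pattern: HBaseContext.bulkLoad writes HBase-sorted HFiles, one directory per column family, into a staging path, and LoadIncrementalHFiles then hands those files to the region servers. A trimmed sketch of just that flow is shown here; it reuses the call shapes visible in the deleted test, while the object and parameter names (WideRowBulkLoadSketch, cells, stagingDir) are illustrative and the assertions and cleanup are omitted.

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.client.ConnectionFactory
  import org.apache.hadoop.hbase.spark.{HBaseContext, KeyFamilyQualifier}
  import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
  import org.apache.spark.SparkContext
  import org.apache.spark.rdd.RDD

  object WideRowBulkLoadSketch {
    // cells: (rowKey, (family, qualifier, value)); stagingDir should not exist yet.
    def run(sc: SparkContext, config: Configuration, table: TableName,
            cells: RDD[(Array[Byte], (Array[Byte], Array[Byte], Array[Byte]))],
            stagingDir: String): Unit = {
      val hbaseContext = new HBaseContext(sc, config)

      // Step 1: write sorted HFiles for every (row, family, qualifier, value) cell.
      hbaseContext.bulkLoad[(Array[Byte], (Array[Byte], Array[Byte], Array[Byte]))](
        cells, table,
        t => Seq((new KeyFamilyQualifier(t._1, t._2._1, t._2._2), t._2._3)).iterator,
        stagingDir)

      // Step 2: move the staged HFiles into the table's regions.
      val conn = ConnectionFactory.createConnection(config)
      try {
        val load = new LoadIncrementalHFiles(config)
        load.doBulkLoad(new Path(stagingDir), conn.getAdmin,
          conn.getTable(table), conn.getRegionLocator(table))
      } finally {
        conn.close()
      }
    }
  }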
assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("b")) - - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - - } - } - - test("Wide Row Bulk Load: Test HBase client: Test Roll Over and " + - "using an implicit call to bulk load") { - val config = TEST_UTIL.getConfiguration - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2))) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. 
There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo2.b"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.a"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("c"), Bytes.toBytes("foo2.c"))), - (Bytes.toBytes("5"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))) - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - rdd.hbaseBulkLoad(hbaseContext, - TableName.valueOf(tableName), - t => { - val rowKey = t._1 - val family:Array[Byte] = t._2._1 - val qualifier = t._2._2 - val value = t._2._3 - - val keyFamilyQualifier= new KeyFamilyQualifier(rowKey, family, qualifier) - - Seq((keyFamilyQualifier, value)).iterator - }, - stagingFolder.getPath, - new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions], - compactionExclude = false, - 20) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 1) - - assert(fs.listStatus(new Path(stagingFolder.getPath+ "/f1")).length == 5) - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), - conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("b")) - 
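The roll-over test above uses the implicit form of the same API: importing HBaseRDDFunctions._ decorates the RDD with hbaseBulkLoad, so no explicit call through hbaseContext is needed, and a deliberately tiny maximum HFile size forces the writer to roll over into several files per family. A short sketch of that variant follows; ImplicitBulkLoadSketch and its parameters are illustrative names, and the call shape mirrors the deleted test.

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
  import org.apache.hadoop.hbase.spark.{FamilyHFileWriteOptions, HBaseContext, KeyFamilyQualifier}
  import org.apache.spark.rdd.RDD

  object ImplicitBulkLoadSketch {
    def run(hbaseContext: HBaseContext, table: TableName,
            cells: RDD[(Array[Byte], (Array[Byte], Array[Byte], Array[Byte]))],
            stagingDir: String): Unit = {
      // Same flatMap as the explicit form, invoked through the implicit RDD decoration.
      cells.hbaseBulkLoad(hbaseContext, table,
        t => Seq((new KeyFamilyQualifier(t._1, t._2._1, t._2._2), t._2._3)).iterator,
        stagingDir,
        new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions],
        compactionExclude = false,
        20) // tiny max HFile size, so each family rolls over into several HFiles
    }
  }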
assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("c")) - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - } - } - - test("Wide Row Bulk Load: Test multi family and multi column tests" + - " with one column family with custom configs plus multi region") { - val config = TEST_UTIL.getConfiguration - - val splitKeys:Array[Array[Byte]] = new Array[Array[Byte]](2) - splitKeys(0) = Bytes.toBytes("2") - splitKeys(1) = Bytes.toBytes("4") - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2)), - splitKeys) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. 
There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo2.a"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("a"), Bytes.toBytes("foo2.b"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.c"))), - (Bytes.toBytes("5"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - (Bytes.toBytes("4"), - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - (Bytes.toBytes("2"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))) - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - val familyHBaseWriterOptions = new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions] - - val f1Options = new FamilyHFileWriteOptions("GZ", "ROW", 128, - "PREFIX") - - familyHBaseWriterOptions.put(Bytes.toBytes(columnFamily1), f1Options) - - hbaseContext.bulkLoad[(Array[Byte], (Array[Byte], Array[Byte], Array[Byte]))](rdd, - TableName.valueOf(tableName), - t => { - val rowKey = t._1 - val family:Array[Byte] = t._2._1 - val qualifier = t._2._2 - val value = t._2._3 - - val keyFamilyQualifier= new KeyFamilyQualifier(rowKey, family, qualifier) - - Seq((keyFamilyQualifier, value)).iterator - }, - stagingFolder.getPath, - familyHBaseWriterOptions, - compactionExclude = false, - HConstants.DEFAULT_MAX_FILE_SIZE) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 2) - - val f1FileList = fs.listStatus(new Path(stagingFolder.getPath +"/f1")) - for ( i <- 0 until f1FileList.length) { - val reader = HFile.createReader(fs, f1FileList(i).getPath, - new CacheConfig(config), true, config) - assert(reader.getCompressionAlgorithm.getName.equals("gz")) - assert(reader.getDataBlockEncoding.name().equals("PREFIX")) - } - - assert( 3 == f1FileList.length) - - val f2FileList = fs.listStatus(new Path(stagingFolder.getPath +"/f2")) - for ( i <- 0 until f2FileList.length) { - val reader = HFile.createReader(fs, f2FileList(i).getPath, - new CacheConfig(config), true, config) - assert(reader.getCompressionAlgorithm.getName.equals("none")) - assert(reader.getDataBlockEncoding.name().equals("NONE")) - } - - assert( 2 == f2FileList.length) - - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), - conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - 
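The custom-config test above is mostly about FamilyHFileWriteOptions: per-column-family compression, bloom filter type, block size and data-block encoding applied while the HFiles are written, with families missing from the map falling back to no compression and NONE encoding (which is what the f2 assertions check). A condensed sketch of just the options map is given below; the constructor argument order is taken from the test itself, and the object name FamilyOptionsSketch is illustrative.

  import java.util.{HashMap => JHashMap}
  import org.apache.hadoop.hbase.spark.FamilyHFileWriteOptions
  import org.apache.hadoop.hbase.util.Bytes

  object FamilyOptionsSketch {
    // One entry per family that needs non-default HFile settings; the map is then
    // passed as the family-options argument of bulkLoad/bulkLoadThinRows
    // (the java.util.HashMap parameter in the calls above).
    def options(): JHashMap[Array[Byte], FamilyHFileWriteOptions] = {
      val map = new JHashMap[Array[Byte], FamilyHFileWriteOptions]
      // (compression, bloom filter type, block size, data-block encoding)
      map.put(Bytes.toBytes("f1"), new FamilyHFileWriteOptions("GZ", "ROW", 128, "PREFIX"))
      map
    }
  }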
assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("b")) - - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - - } - } - - test("Test partitioner") { - - var splitKeys:Array[Array[Byte]] = new Array[Array[Byte]](3) - splitKeys(0) = Bytes.toBytes("") - splitKeys(1) = Bytes.toBytes("3") - splitKeys(2) = Bytes.toBytes("7") - - var partitioner = new BulkLoadPartitioner(splitKeys) - - assert(0 == partitioner.getPartition(Bytes.toBytes(""))) - assert(0 == partitioner.getPartition(Bytes.toBytes("1"))) - assert(0 == partitioner.getPartition(Bytes.toBytes("2"))) - assert(1 == partitioner.getPartition(Bytes.toBytes("3"))) - assert(1 == partitioner.getPartition(Bytes.toBytes("4"))) - assert(1 == partitioner.getPartition(Bytes.toBytes("6"))) - assert(2 == partitioner.getPartition(Bytes.toBytes("7"))) - assert(2 == partitioner.getPartition(Bytes.toBytes("8"))) - - - splitKeys = new Array[Array[Byte]](1) - splitKeys(0) = Bytes.toBytes("") - - partitioner = new BulkLoadPartitioner(splitKeys) - - assert(0 == partitioner.getPartition(Bytes.toBytes(""))) - assert(0 == partitioner.getPartition(Bytes.toBytes("1"))) - assert(0 == partitioner.getPartition(Bytes.toBytes("2"))) - assert(0 == partitioner.getPartition(Bytes.toBytes("3"))) - assert(0 == partitioner.getPartition(Bytes.toBytes("4"))) - assert(0 
== partitioner.getPartition(Bytes.toBytes("6"))) - assert(0 == partitioner.getPartition(Bytes.toBytes("7"))) - - splitKeys = new Array[Array[Byte]](7) - splitKeys(0) = Bytes.toBytes("") - splitKeys(1) = Bytes.toBytes("02") - splitKeys(2) = Bytes.toBytes("04") - splitKeys(3) = Bytes.toBytes("06") - splitKeys(4) = Bytes.toBytes("08") - splitKeys(5) = Bytes.toBytes("10") - splitKeys(6) = Bytes.toBytes("12") - - partitioner = new BulkLoadPartitioner(splitKeys) - - assert(0 == partitioner.getPartition(Bytes.toBytes(""))) - assert(0 == partitioner.getPartition(Bytes.toBytes("01"))) - assert(1 == partitioner.getPartition(Bytes.toBytes("02"))) - assert(1 == partitioner.getPartition(Bytes.toBytes("03"))) - assert(2 == partitioner.getPartition(Bytes.toBytes("04"))) - assert(2 == partitioner.getPartition(Bytes.toBytes("05"))) - assert(3 == partitioner.getPartition(Bytes.toBytes("06"))) - assert(3 == partitioner.getPartition(Bytes.toBytes("07"))) - assert(4 == partitioner.getPartition(Bytes.toBytes("08"))) - assert(4 == partitioner.getPartition(Bytes.toBytes("09"))) - assert(5 == partitioner.getPartition(Bytes.toBytes("10"))) - assert(5 == partitioner.getPartition(Bytes.toBytes("11"))) - assert(6 == partitioner.getPartition(Bytes.toBytes("12"))) - assert(6 == partitioner.getPartition(Bytes.toBytes("13"))) - } - - test("Thin Row Bulk Load: Test multi family and multi column tests " + - "with all default HFile Configs") { - val config = TEST_UTIL.getConfiguration - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2))) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - ("1", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - ("3", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo2.a"))), - ("3", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("a"), Bytes.toBytes("foo2.b"))), - ("3", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.c"))), - ("5", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - ("4", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - ("4", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))). 
- groupByKey() - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - hbaseContext.bulkLoadThinRows[(String, Iterable[(Array[Byte], Array[Byte], Array[Byte])])](rdd, - TableName.valueOf(tableName), - t => { - val rowKey = Bytes.toBytes(t._1) - - val familyQualifiersValues = new FamiliesQualifiersValues - t._2.foreach(f => { - val family:Array[Byte] = f._1 - val qualifier = f._2 - val value:Array[Byte] = f._3 - - familyQualifiersValues +=(family, qualifier, value) - }) - (new ByteArrayWrapper(rowKey), familyQualifiersValues) - }, - stagingFolder.getPath) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 2) - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), conn.getAdmin, table, - conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("b")) - - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - 
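bulkLoadThinRows, exercised above, differs from bulkLoad in the shape of the map output: the caller groups all cells of a row up front and returns a single (ByteArrayWrapper, FamiliesQualifiersValues) pair per row instead of one record per cell. A trimmed sketch of just that mapping follows, reusing the call shape from the deleted test; ThinRowBulkLoadSketch and its parameters are illustrative names.

  import org.apache.hadoop.hbase.TableName
  import org.apache.hadoop.hbase.spark.{ByteArrayWrapper, FamiliesQualifiersValues, HBaseContext}
  import org.apache.hadoop.hbase.util.Bytes
  import org.apache.spark.rdd.RDD

  object ThinRowBulkLoadSketch {
    // rows: (rowKey, every cell of that row as (family, qualifier, value)), already
    // grouped, for example the result of a groupByKey() as in the test above.
    def run(hbaseContext: HBaseContext, table: TableName,
            rows: RDD[(String, Iterable[(Array[Byte], Array[Byte], Array[Byte])])],
            stagingDir: String): Unit = {
      hbaseContext.bulkLoadThinRows[(String, Iterable[(Array[Byte], Array[Byte], Array[Byte])])](
        rows, table,
        t => {
          val familyQualifiersValues = new FamiliesQualifiersValues
          // Fold every cell of the row into one FamiliesQualifiersValues container.
          t._2.foreach(cell => familyQualifiersValues += (cell._1, cell._2, cell._3))
          (new ByteArrayWrapper(Bytes.toBytes(t._1)), familyQualifiersValues)
        },
        stagingDir)
    }
  }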
assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - - } - } - - test("Thin Row Bulk Load: Test HBase client: Test Roll Over and " + - "using an implicit call to bulk load") { - val config = TEST_UTIL.getConfiguration - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2))) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - ("1", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - ("3", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo2.b"))), - ("3", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.a"))), - ("3", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("c"), Bytes.toBytes("foo2.c"))), - ("5", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - ("4", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - ("4", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))). 
- groupByKey() - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - rdd.hbaseBulkLoadThinRows(hbaseContext, - TableName.valueOf(tableName), - t => { - val rowKey = t._1 - - val familyQualifiersValues = new FamiliesQualifiersValues - t._2.foreach(f => { - val family:Array[Byte] = f._1 - val qualifier = f._2 - val value:Array[Byte] = f._3 - - familyQualifiersValues +=(family, qualifier, value) - }) - (new ByteArrayWrapper(Bytes.toBytes(rowKey)), familyQualifiersValues) - }, - stagingFolder.getPath, - new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions], - compactionExclude = false, - 20) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 1) - - assert(fs.listStatus(new Path(stagingFolder.getPath+ "/f1")).length == 5) - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), - conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("b")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("c")) - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - 
assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - } - } - - test("Thin Row Bulk Load: Test multi family and multi column tests" + - " with one column family with custom configs plus multi region") { - val config = TEST_UTIL.getConfiguration - - val splitKeys:Array[Array[Byte]] = new Array[Array[Byte]](2) - splitKeys(0) = Bytes.toBytes("2") - splitKeys(1) = Bytes.toBytes("4") - - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), - Array(Bytes.toBytes(columnFamily1), Bytes.toBytes(columnFamily2)), - splitKeys) - - //There are a number of tests in here. - // 1. Row keys are not in order - // 2. Qualifiers are not in order - // 3. Column Families are not in order - // 4. There are tests for records in one column family and some in two column families - // 5. There are records will a single qualifier and some with two - val rdd = sc.parallelize(Array( - ("1", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - ("3", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo2.a"))), - ("3", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("a"), Bytes.toBytes("foo2.b"))), - ("3", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo2.c"))), - ("5", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo3"))), - ("4", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo.1"))), - ("4", - (Bytes.toBytes(columnFamily2), Bytes.toBytes("b"), Bytes.toBytes("foo.2"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("bar.1"))), - ("2", - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("bar.2"))))). 
- groupByKey() - - val hbaseContext = new HBaseContext(sc, config) - - testFolder.create() - val stagingFolder = testFolder.newFolder() - - val familyHBaseWriterOptions = new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions] - - val f1Options = new FamilyHFileWriteOptions("GZ", "ROW", 128, - "PREFIX") - - familyHBaseWriterOptions.put(Bytes.toBytes(columnFamily1), f1Options) - - hbaseContext.bulkLoadThinRows[(String, Iterable[(Array[Byte], Array[Byte], Array[Byte])])](rdd, - TableName.valueOf(tableName), - t => { - val rowKey = t._1 - - val familyQualifiersValues = new FamiliesQualifiersValues - t._2.foreach(f => { - val family:Array[Byte] = f._1 - val qualifier = f._2 - val value:Array[Byte] = f._3 - - familyQualifiersValues +=(family, qualifier, value) - }) - (new ByteArrayWrapper(Bytes.toBytes(rowKey)), familyQualifiersValues) - }, - stagingFolder.getPath, - familyHBaseWriterOptions, - compactionExclude = false, - HConstants.DEFAULT_MAX_FILE_SIZE) - - val fs = FileSystem.get(config) - assert(fs.listStatus(new Path(stagingFolder.getPath)).length == 2) - - val f1FileList = fs.listStatus(new Path(stagingFolder.getPath +"/f1")) - for ( i <- 0 until f1FileList.length) { - val reader = HFile.createReader(fs, f1FileList(i).getPath, - new CacheConfig(config), true, config) - assert(reader.getCompressionAlgorithm.getName.equals("gz")) - assert(reader.getDataBlockEncoding.name().equals("PREFIX")) - } - - assert( 3 == f1FileList.length) - - val f2FileList = fs.listStatus(new Path(stagingFolder.getPath +"/f2")) - for ( i <- 0 until f2FileList.length) { - val reader = HFile.createReader(fs, f2FileList(i).getPath, - new CacheConfig(config), true, config) - assert(reader.getCompressionAlgorithm.getName.equals("none")) - assert(reader.getDataBlockEncoding.name().equals("NONE")) - } - - assert( 2 == f2FileList.length) - - - val conn = ConnectionFactory.createConnection(config) - - val load = new LoadIncrementalHFiles(config) - val table = conn.getTable(TableName.valueOf(tableName)) - try { - load.doBulkLoad(new Path(stagingFolder.getPath), - conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName))) - - val cells5 = table.get(new Get(Bytes.toBytes("5"))).listCells() - assert(cells5.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells5.get(0))).equals("foo3")) - assert(Bytes.toString(CellUtil.cloneFamily(cells5.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells5.get(0))).equals("a")) - - val cells4 = table.get(new Get(Bytes.toBytes("4"))).listCells() - assert(cells4.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(0))).equals("foo.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells4.get(1))).equals("foo.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells4.get(1))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells4.get(1))).equals("b")) - - val cells3 = table.get(new Get(Bytes.toBytes("3"))).listCells() - assert(cells3.size == 3) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(0))).equals("foo2.c")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(1))).equals("foo2.b")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(1))).equals("f2")) - 
assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(1))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells3.get(2))).equals("foo2.a")) - assert(Bytes.toString(CellUtil.cloneFamily(cells3.get(2))).equals("f2")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells3.get(2))).equals("b")) - - - val cells2 = table.get(new Get(Bytes.toBytes("2"))).listCells() - assert(cells2.size == 2) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(0))).equals("bar.1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(0))).equals("a")) - assert(Bytes.toString(CellUtil.cloneValue(cells2.get(1))).equals("bar.2")) - assert(Bytes.toString(CellUtil.cloneFamily(cells2.get(1))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells2.get(1))).equals("b")) - - val cells1 = table.get(new Get(Bytes.toBytes("1"))).listCells() - assert(cells1.size == 1) - assert(Bytes.toString(CellUtil.cloneValue(cells1.get(0))).equals("foo1")) - assert(Bytes.toString(CellUtil.cloneFamily(cells1.get(0))).equals("f1")) - assert(Bytes.toString(CellUtil.cloneQualifier(cells1.get(0))).equals("a")) - - } finally { - table.close() - val admin = ConnectionFactory.createConnection(config).getAdmin - try { - admin.disableTable(TableName.valueOf(tableName)) - admin.deleteTable(TableName.valueOf(tableName)) - } finally { - admin.close() - } - fs.delete(new Path(stagingFolder.getPath), true) - - testFolder.delete() - - } - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DefaultSourceSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DefaultSourceSuite.scala deleted file mode 100644 index 3bce0415633..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DefaultSourceSuite.scala +++ /dev/null @@ -1,1040 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.avro.Schema -import org.apache.avro.generic.GenericData -import org.apache.hadoop.hbase.client.{ConnectionFactory, Put} -import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} -import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.{Logging, SparkConf, SparkContext} -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -case class HBaseRecord( - col0: String, - col1: Boolean, - col2: Double, - col3: Float, - col4: Int, - col5: Long, - col6: Short, - col7: String, - col8: Byte) - -object HBaseRecord { - def apply(i: Int, t: String): HBaseRecord = { - val s = s"""row${"%03d".format(i)}""" - HBaseRecord(s, - i % 2 == 0, - i.toDouble, - i.toFloat, - i, - i.toLong, - i.toShort, - s"String$i: $t", - i.toByte) - } -} - - -case class AvroHBaseKeyRecord(col0: Array[Byte], - col1: Array[Byte]) - -object AvroHBaseKeyRecord { - val schemaString = - s"""{"namespace": "example.avro", - | "type": "record", "name": "User", - | "fields": [ {"name": "name", "type": "string"}, - | {"name": "favorite_number", "type": ["int", "null"]}, - | {"name": "favorite_color", "type": ["string", "null"]} ] }""".stripMargin - - val avroSchema: Schema = { - val p = new Schema.Parser - p.parse(schemaString) - } - - def apply(i: Int): AvroHBaseKeyRecord = { - val user = new GenericData.Record(avroSchema); - user.put("name", s"name${"%03d".format(i)}") - user.put("favorite_number", i) - user.put("favorite_color", s"color${"%03d".format(i)}") - val avroByte = AvroSerdes.serialize(user, avroSchema) - AvroHBaseKeyRecord(avroByte, avroByte) - } -} - -class DefaultSourceSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - @transient var sc: SparkContext = null - var TEST_UTIL: HBaseTestingUtility = new HBaseTestingUtility - - val t1TableName = "t1" - val t2TableName = "t2" - val columnFamily = "c" - - var sqlContext:SQLContext = null - var df:DataFrame = null - - override def beforeAll() { - - TEST_UTIL.startMiniCluster - - logInfo(" - minicluster started") - try - TEST_UTIL.deleteTable(TableName.valueOf(t1TableName)) - catch { - case e: Exception => logInfo(" - no table " + t1TableName + " found") - } - try - TEST_UTIL.deleteTable(TableName.valueOf(t2TableName)) - catch { - case e: Exception => logInfo(" - no table " + t2TableName + " found") - } - logInfo(" - creating table " + t1TableName) - TEST_UTIL.createTable(TableName.valueOf(t1TableName), Bytes.toBytes(columnFamily)) - logInfo(" - created table") - logInfo(" - creating table " + t2TableName) - TEST_UTIL.createTable(TableName.valueOf(t2TableName), Bytes.toBytes(columnFamily)) - logInfo(" - created table") - val sparkConf = new SparkConf - sparkConf.set(HBaseSparkConf.QUERY_CACHEBLOCKS, "true") - sparkConf.set(HBaseSparkConf.QUERY_BATCHSIZE, "100") - sparkConf.set(HBaseSparkConf.QUERY_CACHEDROWS, "100") - - sc = new SparkContext("local", "test", sparkConf) - - val connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration) - try { - val t1Table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), 
Bytes.toBytes("1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(1)) - t1Table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(4)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("FOO")) - t1Table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8)) - t1Table.put(put) - put = new Put(Bytes.toBytes("get4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("10")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(10)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("BAR")) - t1Table.put(put) - put = new Put(Bytes.toBytes("get5")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo5")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8)) - t1Table.put(put) - } finally { - t1Table.close() - } - - val t2Table = connection.getTable(TableName.valueOf("t2")) - - try { - var put = new Put(Bytes.toBytes(1)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(1)) - t2Table.put(put) - put = new Put(Bytes.toBytes(2)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(4)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("FOO")) - t2Table.put(put) - put = new Put(Bytes.toBytes(3)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8)) - t2Table.put(put) - put = new Put(Bytes.toBytes(4)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("10")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(10)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("BAR")) - t2Table.put(put) - put = new Put(Bytes.toBytes(5)) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo5")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8)) - t2Table.put(put) - } finally { - t2Table.close() - } - } finally { - connection.close() - } - - def hbaseTable1Catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - 
|"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"} - |} - |}""".stripMargin - - new HBaseContext(sc, TEST_UTIL.getConfiguration) - sqlContext = new SQLContext(sc) - - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->hbaseTable1Catalog)) - - df.registerTempTable("hbaseTable1") - - def hbaseTable2Catalog = s"""{ - |"table":{"namespace":"default", "name":"t2"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"int"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"} - |} - |}""".stripMargin - - - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->hbaseTable2Catalog)) - - df.registerTempTable("hbaseTable2") - } - - override def afterAll() { - TEST_UTIL.deleteTable(TableName.valueOf(t1TableName)) - logInfo("shuting down minicluster") - TEST_UTIL.shutdownMiniCluster() - - sc.stop() - } - - override def beforeEach(): Unit = { - DefaultSourceStaticUtils.lastFiveExecutionRules.clear() - } - - - /** - * A example of query three fields and also only using rowkey points for the filter - */ - test("Test rowKey point only rowKey query") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "(KEY_FIELD = 'get1' or KEY_FIELD = 'get2' or KEY_FIELD = 'get3')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 3) - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( ( KEY_FIELD == 0 OR KEY_FIELD == 1 ) OR KEY_FIELD == 2 )")) - - assert(executionRules.rowKeyFilter.points.size == 3) - assert(executionRules.rowKeyFilter.ranges.size == 0) - } - - /** - * A example of query three fields and also only using cell points for the filter - */ - test("Test cell point only rowKey query") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "(B_FIELD = '4' or B_FIELD = '10' or A_FIELD = 'foo1')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 3) - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( ( B_FIELD == 0 OR B_FIELD == 1 ) OR A_FIELD == 2 )")) - } - - /** - * A example of a OR merge between to ranges the result is one range - * Also an example of less then and greater then - */ - test("Test two range rowKey query") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "( KEY_FIELD < 'get2' or KEY_FIELD > 'get3')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 3) - - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( KEY_FIELD < 0 OR KEY_FIELD > 1 )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 2) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes(""))) - assert(Bytes.equals(scanRange1.upperBound,Bytes.toBytes("get2"))) - assert(scanRange1.isLowerBoundEqualTo) - assert(!scanRange1.isUpperBoundEqualTo) - - val scanRange2 = executionRules.rowKeyFilter.ranges.get(1).get - assert(Bytes.equals(scanRange2.lowerBound,Bytes.toBytes("get3"))) - assert(scanRange2.upperBound == null) - assert(!scanRange2.isLowerBoundEqualTo) - assert(scanRange2.isUpperBoundEqualTo) - } - - /** - * A example of a OR merge between to ranges the result is one range - * Also an example of less then and greater then - * - * This example makes sure the code works for a int rowKey - */ - test("Test two range rowKey query where the rowKey is Int and there is a range over lap") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable2 " + - "WHERE " + - "( KEY_FIELD < 4 or KEY_FIELD > 2)").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( KEY_FIELD < 0 OR KEY_FIELD > 1 )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 2) - assert(results.length == 5) - } - - /** - * A example of a OR merge between to ranges the result is two ranges - * Also an example of less then and greater then - * - * This example makes sure the code works for a int rowKey - */ - test("Test two range rowKey query where the rowKey is Int and the ranges don't over lap") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable2 " + - "WHERE " + - "( KEY_FIELD < 2 or KEY_FIELD > 4)").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( KEY_FIELD < 0 OR KEY_FIELD > 1 )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - - assert(executionRules.rowKeyFilter.ranges.size == 3) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.upperBound, Bytes.toBytes(2))) - assert(scanRange1.isLowerBoundEqualTo) - assert(!scanRange1.isUpperBoundEqualTo) - - val scanRange2 = executionRules.rowKeyFilter.ranges.get(1).get - assert(scanRange2.isUpperBoundEqualTo) - - assert(results.length == 2) - } - - /** - * A example of a AND merge between to ranges the result is one range - * Also an example of less then and equal to and greater then and equal to - */ - test("Test one combined range rowKey query") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "(KEY_FIELD <= 'get3' and KEY_FIELD >= 'get2')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 2) - - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( KEY_FIELD <= 0 AND KEY_FIELD >= 1 )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 1) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes("get2"))) - assert(Bytes.equals(scanRange1.upperBound, Bytes.toBytes("get3"))) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - - } - - /** - * Do a select with no filters - */ - test("Test select only query") { - - val results = df.select("KEY_FIELD").take(10) - assert(results.length == 5) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(executionRules.dynamicLogicExpression == null) - - } - - /** - * A complex query with one point and one range for both the - * rowKey and the a column - */ - test("Test SQL point and range combo") { - val results = sqlContext.sql("SELECT KEY_FIELD FROM hbaseTable1 " + - "WHERE " + - "(KEY_FIELD = 'get1' and B_FIELD < '3') or " + - "(KEY_FIELD >= 'get3' and B_FIELD = '8')").take(5) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( ( KEY_FIELD == 0 AND B_FIELD < 1 ) OR " + - "( KEY_FIELD >= 2 AND B_FIELD == 3 ) )")) - - assert(executionRules.rowKeyFilter.points.size == 1) - assert(executionRules.rowKeyFilter.ranges.size == 1) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes("get3"))) - assert(scanRange1.upperBound == null) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - - - assert(results.length == 3) - } - - /** - * A complex query with two complex ranges that doesn't merge into one - */ - test("Test two complete range non merge rowKey query") { - - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable2 " + - "WHERE " + - "( KEY_FIELD >= 1 and KEY_FIELD <= 2) or" + - "( KEY_FIELD > 3 and KEY_FIELD <= 5)").take(10) - - - assert(results.length == 4) - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( ( KEY_FIELD >= 0 AND KEY_FIELD <= 1 ) OR " + - "( KEY_FIELD > 2 AND KEY_FIELD <= 3 ) )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 2) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes(1))) - assert(Bytes.equals(scanRange1.upperBound, Bytes.toBytes(2))) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - - val scanRange2 = executionRules.rowKeyFilter.ranges.get(1).get - assert(Bytes.equals(scanRange2.lowerBound,Bytes.toBytes(3))) - assert(Bytes.equals(scanRange2.upperBound, Bytes.toBytes(5))) - assert(!scanRange2.isLowerBoundEqualTo) - assert(scanRange2.isUpperBoundEqualTo) - - } - - /** - * A complex query with two complex ranges that does merge into one - */ - test("Test two complete range merge rowKey query") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "( KEY_FIELD >= 'get1' and KEY_FIELD <= 'get2') or" + - "( KEY_FIELD > 'get3' and KEY_FIELD <= 'get5')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 4) - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( ( KEY_FIELD >= 0 AND KEY_FIELD <= 1 ) OR " + - "( KEY_FIELD > 2 AND KEY_FIELD <= 3 ) )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 2) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes("get1"))) - assert(Bytes.equals(scanRange1.upperBound, Bytes.toBytes("get2"))) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - - val scanRange2 = executionRules.rowKeyFilter.ranges.get(1).get - assert(Bytes.equals(scanRange2.lowerBound, Bytes.toBytes("get3"))) - assert(Bytes.equals(scanRange2.upperBound, Bytes.toBytes("get5"))) - assert(!scanRange2.isLowerBoundEqualTo) - assert(scanRange2.isUpperBoundEqualTo) - } - - test("Test OR logic with a one RowKey and One column") { - - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "( KEY_FIELD >= 'get1' or A_FIELD <= 'foo2') or" + - "( KEY_FIELD > 'get3' or B_FIELD <= '4')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 5) - - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( ( KEY_FIELD >= 0 OR A_FIELD <= 1 ) OR " + - "( KEY_FIELD > 2 OR B_FIELD <= 3 ) )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 1) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - //This is the main test for 14406 - //Because the key is joined through a or with a qualifier - //There is no filter on the rowKey - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes(""))) - assert(scanRange1.upperBound == null) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - } - - test("Test OR logic with a two columns") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "( B_FIELD > '4' or A_FIELD <= 'foo2') or" + - "( A_FIELD > 'foo2' or B_FIELD < '4')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 5) - - assert(executionRules.dynamicLogicExpression.toExpressionString. - equals("( ( B_FIELD > 0 OR A_FIELD <= 1 ) OR " + - "( A_FIELD > 2 OR B_FIELD < 3 ) )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 1) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes(""))) - assert(scanRange1.upperBound == null) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - - } - - test("Test single RowKey Or Column logic") { - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseTable1 " + - "WHERE " + - "( KEY_FIELD >= 'get4' or A_FIELD <= 'foo2' )").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 4) - - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( KEY_FIELD >= 0 OR A_FIELD <= 1 )")) - - assert(executionRules.rowKeyFilter.points.size == 0) - assert(executionRules.rowKeyFilter.ranges.size == 1) - - val scanRange1 = executionRules.rowKeyFilter.ranges.get(0).get - assert(Bytes.equals(scanRange1.lowerBound,Bytes.toBytes(""))) - assert(scanRange1.upperBound == null) - assert(scanRange1.isLowerBoundEqualTo) - assert(scanRange1.isUpperBoundEqualTo) - } - - test("Test table that doesn't exist") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1NotThere"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"c", "type":"string"} - |} - |}""".stripMargin - - intercept[Exception] { - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - df.registerTempTable("hbaseNonExistingTmp") - - sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseNonExistingTmp " + - "WHERE " + - "( KEY_FIELD >= 'get1' and KEY_FIELD <= 'get3') or" + - "( KEY_FIELD > 'get3' and KEY_FIELD <= 'get5')").count() - } - DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - } - - - test("Test table with column that doesn't exist") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}, - |"C_FIELD":{"cf":"c", "col":"c", "type":"string"} - |} - |}""".stripMargin - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - df.registerTempTable("hbaseFactColumnTmp") - - val result = sqlContext.sql("SELECT KEY_FIELD, " + - "B_FIELD, A_FIELD FROM hbaseFactColumnTmp") - - assert(result.count() == 5) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression == null) - - } - - test("Test table with INT column") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}, - |"I_FIELD":{"cf":"c", "col":"i", "type":"int"} - |} - |}""".stripMargin - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - df.registerTempTable("hbaseIntTmp") - - val result = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, I_FIELD FROM hbaseIntTmp"+ - " where I_FIELD > 4 and I_FIELD < 10") - - val localResult = result.take(5) - - assert(localResult.length == 2) - assert(localResult(0).getInt(2) == 8) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression.toExpressionString. 
- equals("( I_FIELD > 0 AND I_FIELD < 1 )")) - - } - - test("Test table with INT column defined at wrong type") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}, - |"I_FIELD":{"cf":"c", "col":"i", "type":"string"} - |} - |}""".stripMargin - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - df.registerTempTable("hbaseIntWrongTypeTmp") - - val result = sqlContext.sql("SELECT KEY_FIELD, " + - "B_FIELD, I_FIELD FROM hbaseIntWrongTypeTmp") - - val localResult = result.take(10) - assert(localResult.length == 5) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression == null) - - assert(localResult(0).getString(2).length == 4) - assert(localResult(0).getString(2).charAt(0).toByte == 0) - assert(localResult(0).getString(2).charAt(1).toByte == 0) - assert(localResult(0).getString(2).charAt(2).toByte == 0) - assert(localResult(0).getString(2).charAt(3).toByte == 1) - } - - test("Test bad column type") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"FOOBAR"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"I_FIELD":{"cf":"c", "col":"i", "type":"string"} - |} - |}""".stripMargin - intercept[Exception] { - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - df.registerTempTable("hbaseIntWrongTypeTmp") - - val result = sqlContext.sql("SELECT KEY_FIELD, " + - "B_FIELD, I_FIELD FROM hbaseIntWrongTypeTmp") - - val localResult = result.take(10) - assert(localResult.length == 5) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression == null) - - } - } - - test("Test HBaseSparkConf matching") { - val df = sqlContext.load("org.apache.hadoop.hbase.spark.HBaseTestSource", - Map("cacheSize" -> "100", - "batchNum" -> "100", - "blockCacheingEnable" -> "true", "rowNum" -> "10")) - assert(df.count() == 10) - - val df1 = sqlContext.load("org.apache.hadoop.hbase.spark.HBaseTestSource", - Map("cacheSize" -> "1000", - "batchNum" -> "100", "blockCacheingEnable" -> "true", "rowNum" -> "10")) - intercept[Exception] { - assert(df1.count() == 10) - } - - val df2 = sqlContext.load("org.apache.hadoop.hbase.spark.HBaseTestSource", - Map("cacheSize" -> "100", - "batchNum" -> "1000", "blockCacheingEnable" -> "true", "rowNum" -> "10")) - intercept[Exception] { - assert(df2.count() == 10) - } - - val df3 = sqlContext.load("org.apache.hadoop.hbase.spark.HBaseTestSource", - Map("cacheSize" -> "100", - "batchNum" -> "100", "blockCacheingEnable" -> "false", "rowNum" -> "10")) - intercept[Exception] { - assert(df3.count() == 10) - } - } - - test("Test table with sparse column") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}, - |"Z_FIELD":{"cf":"c", "col":"z", "type":"string"} - |} - |}""".stripMargin - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog)) - - 
df.registerTempTable("hbaseZTmp") - - val result = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, Z_FIELD FROM hbaseZTmp") - - val localResult = result.take(10) - assert(localResult.length == 5) - - assert(localResult(0).getString(2) == null) - assert(localResult(1).getString(2) == "FOO") - assert(localResult(2).getString(2) == null) - assert(localResult(3).getString(2) == "BAR") - assert(localResult(4).getString(2) == null) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - assert(executionRules.dynamicLogicExpression == null) - } - - test("Test with column logic disabled") { - val catalog = s"""{ - |"table":{"namespace":"default", "name":"t1"}, - |"rowkey":"key", - |"columns":{ - |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"}, - |"A_FIELD":{"cf":"c", "col":"a", "type":"string"}, - |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}, - |"Z_FIELD":{"cf":"c", "col":"z", "type":"string"} - |} - |}""".stripMargin - df = sqlContext.load("org.apache.hadoop.hbase.spark", - Map(HBaseTableCatalog.tableCatalog->catalog, - HBaseSparkConf.PUSHDOWN_COLUMN_FILTER -> "false")) - - df.registerTempTable("hbaseNoPushDownTmp") - - val results = sqlContext.sql("SELECT KEY_FIELD, B_FIELD, A_FIELD FROM hbaseNoPushDownTmp " + - "WHERE " + - "(KEY_FIELD <= 'get3' and KEY_FIELD >= 'get2')").take(10) - - val executionRules = DefaultSourceStaticUtils.lastFiveExecutionRules.poll() - - assert(results.length == 2) - - assert(executionRules.dynamicLogicExpression == null) - } - - def writeCatalog = s"""{ - |"table":{"namespace":"default", "name":"table1"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"string"}, - |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, - |"col2":{"cf":"cf1", "col":"col2", "type":"double"}, - |"col3":{"cf":"cf3", "col":"col3", "type":"float"}, - |"col4":{"cf":"cf3", "col":"col4", "type":"int"}, - |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"}, - |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"}, - |"col7":{"cf":"cf7", "col":"col7", "type":"string"}, - |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"} - |} - |}""".stripMargin - - def withCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map(HBaseTableCatalog.tableCatalog->cat)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - test("populate table") { - val sql = sqlContext - import sql.implicits._ - val data = (0 to 255).map { i => - HBaseRecord(i, "extra") - } - sc.parallelize(data).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - } - - test("empty column") { - val df = withCatalog(writeCatalog) - df.registerTempTable("table0") - val c = sqlContext.sql("select count(1) from table0").rdd.collect()(0)(0).asInstanceOf[Long] - assert(c == 256) - } - - test("full query") { - val df = withCatalog(writeCatalog) - df.show() - assert(df.count() == 256) - } - - test("filtered query0") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(writeCatalog) - val s = df.filter($"col0" <= "row005") - .select("col0", "col1") - s.show() - assert(s.count() == 6) - } - - test("Timestamp semantics") { - val sql = sqlContext - import sql.implicits._ - - // There's already some data in here from recently. Let's throw something in - // from 1993 which we can include/exclude and add some data with the implicit (now) timestamp. 
- // Then we should be able to cross-section it and only get points in between, get the most recent view - // and get an old view. - val oldMs = 754869600000L - val startMs = System.currentTimeMillis() - val oldData = (0 to 100).map { i => - HBaseRecord(i, "old") - } - val newData = (200 to 255).map { i => - HBaseRecord(i, "new") - } - - sc.parallelize(oldData).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseTableCatalog.tableName -> "5", - HBaseSparkConf.TIMESTAMP -> oldMs.toString)) - .format("org.apache.hadoop.hbase.spark") - .save() - sc.parallelize(newData).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseTableCatalog.tableName -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - - // Test specific timestamp -- Full scan, Timestamp - val individualTimestamp = sqlContext.read - .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseSparkConf.TIMESTAMP -> oldMs.toString)) - .format("org.apache.hadoop.hbase.spark") - .load() - assert(individualTimestamp.count() == 101) - - // Test getting everything -- Full Scan, No range - val everything = sqlContext.read - .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog)) - .format("org.apache.hadoop.hbase.spark") - .load() - assert(everything.count() == 256) - // Test getting everything -- Pruned Scan, TimeRange - val element50 = everything.where(col("col0") === lit("row050")).select("col7").collect()(0)(0) - assert(element50 == "String50: extra") - val element200 = everything.where(col("col0") === lit("row200")).select("col7").collect()(0)(0) - assert(element200 == "String200: new") - - // Test Getting old stuff -- Full Scan, TimeRange - val oldRange = sqlContext.read - .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseSparkConf.TIMERANGE_START -> "0", - HBaseSparkConf.TIMERANGE_END -> (oldMs + 100).toString)) - .format("org.apache.hadoop.hbase.spark") - .load() - assert(oldRange.count() == 101) - // Test Getting old stuff -- Pruned Scan, TimeRange - val oldElement50 = oldRange.where(col("col0") === lit("row050")).select("col7").collect()(0)(0) - assert(oldElement50 == "String50: old") - - // Test Getting middle stuff -- Full Scan, TimeRange - val middleRange = sqlContext.read - .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseSparkConf.TIMERANGE_START -> "0", - HBaseSparkConf.TIMERANGE_END -> (startMs + 100).toString)) - .format("org.apache.hadoop.hbase.spark") - .load() - assert(middleRange.count() == 256) - // Test Getting middle stuff -- Pruned Scan, TimeRange - val middleElement200 = middleRange.where(col("col0") === lit("row200")).select("col7").collect()(0)(0) - assert(middleElement200 == "String200: extra") - } - - - // catalog for insertion - def avroWriteCatalog = s"""{ - |"table":{"namespace":"default", "name":"avrotable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "type":"binary"}, - |"col1":{"cf":"cf1", "col":"col1", "type":"binary"} - |} - |}""".stripMargin - - // catalog for read - def avroCatalog = s"""{ - |"table":{"namespace":"default", "name":"avrotable"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "avro":"avroSchema"}, - |"col1":{"cf":"cf1", "col":"col1", "avro":"avroSchema"} - |} - |}""".stripMargin - - // for insert to another table - def avroCatalogInsert = s"""{ - |"table":{"namespace":"default", "name":"avrotableInsert"}, - |"rowkey":"key", - |"columns":{ - |"col0":{"cf":"rowkey", "col":"key", "avro":"avroSchema"}, - |"col1":{"cf":"cf1", 
"col":"col1", "avro":"avroSchema"} - |} - |}""".stripMargin - - def withAvroCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map("avroSchema"->AvroHBaseKeyRecord.schemaString, - HBaseTableCatalog.tableCatalog->avroCatalog)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - - test("populate avro table") { - val sql = sqlContext - import sql.implicits._ - - val data = (0 to 255).map { i => - AvroHBaseKeyRecord(i) - } - sc.parallelize(data).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> avroWriteCatalog, - HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - } - - test("avro empty column") { - val df = withAvroCatalog(avroCatalog) - df.registerTempTable("avrotable") - val c = sqlContext.sql("select count(1) from avrotable") - .rdd.collect()(0)(0).asInstanceOf[Long] - assert(c == 256) - } - - test("avro full query") { - val df = withAvroCatalog(avroCatalog) - df.show() - df.printSchema() - assert(df.count() == 256) - } - - test("avro serialization and deserialization query") { - val df = withAvroCatalog(avroCatalog) - df.write.options( - Map("avroSchema"->AvroHBaseKeyRecord.schemaString, - HBaseTableCatalog.tableCatalog->avroCatalogInsert, - HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - val newDF = withAvroCatalog(avroCatalogInsert) - newDF.show() - newDF.printSchema() - assert(newDF.count() == 256) - } - - test("avro filtered query") { - val sql = sqlContext - import sql.implicits._ - val df = withAvroCatalog(avroCatalog) - val r = df.filter($"col1.name" === "name005" || $"col1.name" <= "name005") - .select("col0", "col1.favorite_color", "col1.favorite_number") - r.show() - assert(r.count() == 6) - } - - test("avro Or filter") { - val sql = sqlContext - import sql.implicits._ - val df = withAvroCatalog(avroCatalog) - val s = df.filter($"col1.name" <= "name005" || $"col1.name".contains("name007")) - .select("col0", "col1.favorite_color", "col1.favorite_number") - s.show() - assert(s.count() == 7) - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpressionSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpressionSuite.scala deleted file mode 100644 index bc833e8e603..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpressionSuite.scala +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import java.util - -import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, JavaBytesEncoder} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Logging -import org.apache.spark.sql.types._ -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -class DynamicLogicExpressionSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - - val encoder = JavaBytesEncoder.create(HBaseSparkConf.DEFAULT_QUERY_ENCODER) - - test("Basic And Test") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - val andLogic = new AndLogicExpression(leftLogic, rightLogic) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(10))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 10) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - val expressionString = andLogic.toExpressionString - - assert(expressionString.equals("( Col1 < 0 AND Col1 > 1 )")) - - val builtExpression = DynamicLogicExpressionBuilder.build(expressionString, encoder) - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 10) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - } - - test("Basic OR Test") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - val OrLogic = new OrLogicExpression(leftLogic, rightLogic) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(10))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(OrLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(OrLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) 
= encoder.encode(IntegerType, 10) - assert(OrLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 10) - assert(!OrLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - val expressionString = OrLogic.toExpressionString - - assert(expressionString.equals("( Col1 < 0 OR Col1 > 1 )")) - - val builtExpression = DynamicLogicExpressionBuilder.build(expressionString, encoder) - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 5) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 15) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 10) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - valueFromQueryValueArray(1) = encoder.encode(IntegerType, 10) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } - - test("Basic Command Test") { - val greaterLogic = new GreaterThanLogicExpression("Col1", 0) - greaterLogic.setEncoder(encoder) - val greaterAndEqualLogic = new GreaterThanOrEqualLogicExpression("Col1", 0) - greaterAndEqualLogic.setEncoder(encoder) - val lessLogic = new LessThanLogicExpression("Col1", 0) - lessLogic.setEncoder(encoder) - val lessAndEqualLogic = new LessThanOrEqualLogicExpression("Col1", 0) - lessAndEqualLogic.setEncoder(encoder) - val equalLogic = new EqualLogicExpression("Col1", 0, false) - val notEqualLogic = new EqualLogicExpression("Col1", 0, true) - val passThrough = new PassThroughLogicExpression - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(10))) - val valueFromQueryValueArray = new Array[Array[Byte]](1) - - //great than - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - assert(!greaterLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 20) - assert(!greaterLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - //great than and equal - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 5) - assert(greaterAndEqualLogic.execute(columnToCurrentRowValueMap, - valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - assert(greaterAndEqualLogic.execute(columnToCurrentRowValueMap, - valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 20) - assert(!greaterAndEqualLogic.execute(columnToCurrentRowValueMap, - valueFromQueryValueArray)) - - //less than - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - assert(!lessLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 5) - assert(!lessLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - //less than and equal - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 20) - assert(lessAndEqualLogic.execute(columnToCurrentRowValueMap, 
valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 20) - assert(lessAndEqualLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(IntegerType, 10) - assert(lessAndEqualLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - //equal too - valueFromQueryValueArray(0) = Bytes.toBytes(10) - assert(equalLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = Bytes.toBytes(5) - assert(!equalLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - //not equal too - valueFromQueryValueArray(0) = Bytes.toBytes(10) - assert(!notEqualLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = Bytes.toBytes(5) - assert(notEqualLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - //pass through - valueFromQueryValueArray(0) = Bytes.toBytes(10) - assert(passThrough.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = Bytes.toBytes(5) - assert(passThrough.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } - - - test("Double Type") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - val andLogic = new AndLogicExpression(leftLogic, rightLogic) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(-4.0d))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(DoubleType, 15.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -5.0d) - assert(andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(DoubleType, 10.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -1.0d) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(DoubleType, -10.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -20.0d) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - val expressionString = andLogic.toExpressionString - // Note that here 0 and 1 is index, instead of value. 
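Every case in this suite repeats the same pattern: create a typed encoder, wire it into comparison expressions whose numeric operands are indices into a query-value array (not literals), and evaluate against the current row's bytes. A condensed sketch, assuming the pre-removal classes in `org.apache.hadoop.hbase.spark` used above:

```scala
import java.util

import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, JavaBytesEncoder}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types.IntegerType

// "Col1 < value(0) AND Col1 > value(1)": 0 and 1 index into the query-value array.
val encoder = JavaBytesEncoder.create(HBaseSparkConf.DEFAULT_QUERY_ENCODER)
val left = new LessThanLogicExpression("Col1", 0)
left.setEncoder(encoder)
val right = new GreaterThanLogicExpression("Col1", 1)
right.setEncoder(encoder)
val expr = new AndLogicExpression(left, right)

// Current row value for Col1 is 10; query values are 15 (index 0) and 5 (index 1).
val rowValues = new util.HashMap[String, ByteArrayComparable]()
rowValues.put("Col1", new ByteArrayComparable(Bytes.toBytes(10)))
val queryValues = new Array[Array[Byte]](2)
queryValues(0) = encoder.encode(IntegerType, 15)
queryValues(1) = encoder.encode(IntegerType, 5)

assert(expr.execute(rowValues, queryValues)) // 10 < 15 && 10 > 5
assert(expr.toExpressionString == "( Col1 < 0 AND Col1 > 1 )")
```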
- assert(expressionString.equals("( Col1 < 0 AND Col1 > 1 )")) - - val builtExpression = DynamicLogicExpressionBuilder.build(expressionString, encoder) - valueFromQueryValueArray(0) = encoder.encode(DoubleType, 15.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -5.0d) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(DoubleType, 10.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -1.0d) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(DoubleType, -10.0d) - valueFromQueryValueArray(1) = encoder.encode(DoubleType, -20.0d) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } - - test("Float Type") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - val andLogic = new AndLogicExpression(leftLogic, rightLogic) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(-4.0f))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(FloatType, 15.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -5.0f) - assert(andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(FloatType, 10.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -1.0f) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(FloatType, -10.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -20.0f) - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - val expressionString = andLogic.toExpressionString - // Note that here 0 and 1 is index, instead of value. 
- assert(expressionString.equals("( Col1 < 0 AND Col1 > 1 )")) - - val builtExpression = DynamicLogicExpressionBuilder.build(expressionString, encoder) - valueFromQueryValueArray(0) = encoder.encode(FloatType, 15.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -5.0f) - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(FloatType, 10.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -1.0f) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(FloatType, -10.0f) - valueFromQueryValueArray(1) = encoder.encode(FloatType, -20.0f) - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } - - test("String Type") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - val andLogic = new AndLogicExpression(leftLogic, rightLogic) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes("row005"))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(StringType, "row015") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row000") - assert(andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(StringType, "row004") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row000") - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(StringType, "row020") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row010") - assert(!andLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - val expressionString = andLogic.toExpressionString - // Note that here 0 and 1 is index, instead of value. 
- assert(expressionString.equals("( Col1 < 0 AND Col1 > 1 )")) - - val builtExpression = DynamicLogicExpressionBuilder.build(expressionString, encoder) - valueFromQueryValueArray(0) = encoder.encode(StringType, "row015") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row000") - assert(builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(StringType, "row004") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row000") - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - - valueFromQueryValueArray(0) = encoder.encode(StringType, "row020") - valueFromQueryValueArray(1) = encoder.encode(StringType, "row010") - assert(!builtExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } - - test("Boolean Type") { - val leftLogic = new LessThanLogicExpression("Col1", 0) - leftLogic.setEncoder(encoder) - val rightLogic = new GreaterThanLogicExpression("Col1", 1) - rightLogic.setEncoder(encoder) - - val columnToCurrentRowValueMap = new util.HashMap[String, ByteArrayComparable]() - - columnToCurrentRowValueMap.put("Col1", new ByteArrayComparable(Bytes.toBytes(false))) - val valueFromQueryValueArray = new Array[Array[Byte]](2) - valueFromQueryValueArray(0) = encoder.encode(BooleanType, true) - valueFromQueryValueArray(1) = encoder.encode(BooleanType, false) - assert(leftLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - assert(!rightLogic.execute(columnToCurrentRowValueMap, valueFromQueryValueArray)) - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseCatalogSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseCatalogSuite.scala deleted file mode 100644 index 49e2f6c340e..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseCatalogSuite.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
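The other half of the round trip, repeated for each type above: the string form produced by `toExpressionString` can be parsed back with `DynamicLogicExpressionBuilder` and evaluated with the same encoder. Continuing from the previous sketch (`encoder`, `rowValues`, `queryValues`):

```scala
import org.apache.spark.sql.types.IntegerType

// Rebuild the expression from its string form and evaluate it the same way.
val rebuilt = DynamicLogicExpressionBuilder.build("( Col1 < 0 AND Col1 > 1 )", encoder)
assert(rebuilt.execute(rowValues, queryValues))

// A failing case: set index 0 equal to the row value, so "Col1 < value(0)" is false.
queryValues(0) = encoder.encode(IntegerType, 10)
assert(!rebuilt.execute(rowValues, queryValues))
```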
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.spark.datasources.{DoubleSerDes, SerDes} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.Logging -import org.apache.spark.sql.datasources.hbase.{DataTypeParserWrapper, HBaseTableCatalog} -import org.apache.spark.sql.types._ -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -class HBaseCatalogSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll with Logging { - - val map = s"""MAP>""" - val array = s"""array>""" - val arrayMap = s"""MAp>""" - val catalog = s"""{ - |"table":{"namespace":"default", "name":"htable"}, - |"rowkey":"key1:key2", - |"columns":{ - |"col1":{"cf":"rowkey", "col":"key1", "type":"string"}, - |"col2":{"cf":"rowkey", "col":"key2", "type":"double"}, - |"col3":{"cf":"cf1", "col":"col2", "type":"binary"}, - |"col4":{"cf":"cf1", "col":"col3", "type":"timestamp"}, - |"col5":{"cf":"cf1", "col":"col4", "type":"double", "serdes":"${classOf[DoubleSerDes].getName}"}, - |"col6":{"cf":"cf1", "col":"col5", "type":"$map"}, - |"col7":{"cf":"cf1", "col":"col6", "type":"$array"}, - |"col8":{"cf":"cf1", "col":"col7", "type":"$arrayMap"} - |} - |}""".stripMargin - val parameters = Map(HBaseTableCatalog.tableCatalog->catalog) - val t = HBaseTableCatalog(parameters) - - def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = { - test(s"parse ${dataTypeString.replace("\n", "")}") { - assert(DataTypeParserWrapper.parse(dataTypeString) === expectedDataType) - } - } - test("basic") { - assert(t.getField("col1").isRowKey == true) - assert(t.getPrimaryKey == "key1") - assert(t.getField("col3").dt == BinaryType) - assert(t.getField("col4").dt == TimestampType) - assert(t.getField("col5").dt == DoubleType) - assert(t.getField("col5").serdes != None) - assert(t.getField("col4").serdes == None) - assert(t.getField("col1").isRowKey) - assert(t.getField("col2").isRowKey) - assert(!t.getField("col3").isRowKey) - assert(t.getField("col2").length == Bytes.SIZEOF_DOUBLE) - assert(t.getField("col1").length == -1) - assert(t.getField("col8").length == -1) - } - - checkDataType( - map, - t.getField("col6").dt - ) - - checkDataType( - array, - t.getField("col7").dt - ) - - checkDataType( - arrayMap, - t.getField("col8").dt - ) - - test("convert") { - val m = Map("hbase.columns.mapping" -> - "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,", - "hbase.table" -> "t1") - val map = HBaseTableCatalog.convert(m) - val json = map.get(HBaseTableCatalog.tableCatalog).get - val parameters = Map(HBaseTableCatalog.tableCatalog->json) - val t = HBaseTableCatalog(parameters) - assert(t.getField("KEY_FIELD").isRowKey) - assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt) - assert(!t.getField("A_FIELD").isRowKey) - assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt) - assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt) - } - - test("compatiblity") { - val m = Map("hbase.columns.mapping" -> - "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,", - "hbase.table" -> "t1") - val t = HBaseTableCatalog(m) - assert(t.getField("KEY_FIELD").isRowKey) - assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt) - assert(!t.getField("A_FIELD").isRowKey) - assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt) - assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt) - } -} diff --git 
a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCacheSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCacheSuite.scala deleted file mode 100644 index b3fdd4edfbf..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseConnectionCacheSuite.scala +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hbase.spark - -import java.util.concurrent.ExecutorService -import scala.util.Random - -import org.apache.hadoop.hbase.client.{BufferedMutator, Table, RegionLocator, - Connection, BufferedMutatorParams, Admin, TableBuilder} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hbase.TableName -import org.apache.spark.Logging -import org.scalatest.FunSuite - -case class HBaseConnectionKeyMocker (confId: Int) extends HBaseConnectionKey (null) { - override def hashCode: Int = { - confId - } - - override def equals(obj: Any): Boolean = { - if(!obj.isInstanceOf[HBaseConnectionKeyMocker]) - false - else - confId == obj.asInstanceOf[HBaseConnectionKeyMocker].confId - } -} - -class ConnectionMocker extends Connection { - var isClosed: Boolean = false - - def getRegionLocator (tableName: TableName): RegionLocator = null - def getConfiguration: Configuration = null - def getTable (tableName: TableName): Table = null - def getTable(tableName: TableName, pool: ExecutorService): Table = null - def getBufferedMutator (params: BufferedMutatorParams): BufferedMutator = null - def getBufferedMutator (tableName: TableName): BufferedMutator = null - def getAdmin: Admin = null - def getTableBuilder(tableName: TableName, pool: ExecutorService): TableBuilder = null - - def close(): Unit = { - if (isClosed) - throw new IllegalStateException() - isClosed = true - } - - def isAborted: Boolean = true - def abort(why: String, e: Throwable) = {} -} - -class HBaseConnectionCacheSuite extends FunSuite with Logging { - /* - * These tests must be performed sequentially as they operate with an - * unique running thread and resource. - * - * It looks there's no way to tell FunSuite to do so, so making those - * test cases normal functions which are called sequentially in a single - * test case. 
- */ - test("all test cases") { - testBasic() - testWithPressureWithoutClose() - testWithPressureWithClose() - } - - def cleanEnv() { - HBaseConnectionCache.connectionMap.synchronized { - HBaseConnectionCache.connectionMap.clear() - HBaseConnectionCache.cacheStat.numActiveConnections = 0 - HBaseConnectionCache.cacheStat.numActualConnectionsCreated = 0 - HBaseConnectionCache.cacheStat.numTotalRequests = 0 - } - } - - def testBasic() { - cleanEnv() - HBaseConnectionCache.setTimeout(1 * 1000) - - val connKeyMocker1 = new HBaseConnectionKeyMocker(1) - val connKeyMocker1a = new HBaseConnectionKeyMocker(1) - val connKeyMocker2 = new HBaseConnectionKeyMocker(2) - - val c1 = HBaseConnectionCache - .getConnection(connKeyMocker1, new ConnectionMocker) - - assert(HBaseConnectionCache.connectionMap.size === 1) - assert(HBaseConnectionCache.getStat.numTotalRequests === 1) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 1) - assert(HBaseConnectionCache.getStat.numActiveConnections === 1) - - val c1a = HBaseConnectionCache - .getConnection(connKeyMocker1a, new ConnectionMocker) - - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 1) - assert(HBaseConnectionCache.getStat.numTotalRequests === 2) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 1) - assert(HBaseConnectionCache.getStat.numActiveConnections === 1) - } - - val c2 = HBaseConnectionCache - .getConnection(connKeyMocker2, new ConnectionMocker) - - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 2) - assert(HBaseConnectionCache.getStat.numTotalRequests === 3) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 2) - assert(HBaseConnectionCache.getStat.numActiveConnections === 2) - } - - c1.close() - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 2) - assert(HBaseConnectionCache.getStat.numActiveConnections === 2) - } - - c1a.close() - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 2) - assert(HBaseConnectionCache.getStat.numActiveConnections === 2) - } - - Thread.sleep(3 * 1000) // Leave housekeeping thread enough time - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 1) - assert(HBaseConnectionCache.connectionMap.iterator.next()._1 - .asInstanceOf[HBaseConnectionKeyMocker].confId === 2) - assert(HBaseConnectionCache.getStat.numActiveConnections === 1) - } - - c2.close() - } - - def testWithPressureWithoutClose() { - cleanEnv() - - class TestThread extends Runnable { - override def run() { - for (i <- 0 to 999) { - val c = HBaseConnectionCache.getConnection( - new HBaseConnectionKeyMocker(Random.nextInt(10)), new ConnectionMocker) - } - } - } - - HBaseConnectionCache.setTimeout(500) - val threads: Array[Thread] = new Array[Thread](100) - for (i <- 0 to 99) { - threads.update(i, new Thread(new TestThread())) - threads(i).run() - } - try { - threads.foreach { x => x.join() } - } catch { - case e: InterruptedException => println(e.getMessage) - } - - Thread.sleep(1000) - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 10) - assert(HBaseConnectionCache.getStat.numTotalRequests === 100 * 1000) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 10) - assert(HBaseConnectionCache.getStat.numActiveConnections === 10) - - var totalRc : Int = 0 - 
HBaseConnectionCache.connectionMap.foreach { - x => totalRc += x._2.refCount - } - assert(totalRc === 100 * 1000) - HBaseConnectionCache.connectionMap.foreach { - x => { - x._2.refCount = 0 - x._2.timestamp = System.currentTimeMillis() - 1000 - } - } - } - Thread.sleep(1000) - assert(HBaseConnectionCache.connectionMap.size === 0) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 10) - assert(HBaseConnectionCache.getStat.numActiveConnections === 0) - } - - def testWithPressureWithClose() { - cleanEnv() - - class TestThread extends Runnable { - override def run() { - for (i <- 0 to 999) { - val c = HBaseConnectionCache.getConnection( - new HBaseConnectionKeyMocker(Random.nextInt(10)), new ConnectionMocker) - Thread.`yield`() - c.close() - } - } - } - - HBaseConnectionCache.setTimeout(3 * 1000) - val threads: Array[Thread] = new Array[Thread](100) - for (i <- threads.indices) { - threads.update(i, new Thread(new TestThread())) - threads(i).run() - } - try { - threads.foreach { x => x.join() } - } catch { - case e: InterruptedException => println(e.getMessage) - } - - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 10) - assert(HBaseConnectionCache.getStat.numTotalRequests === 100 * 1000) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 10) - assert(HBaseConnectionCache.getStat.numActiveConnections === 10) - } - - Thread.sleep(6 * 1000) - HBaseConnectionCache.connectionMap.synchronized { - assert(HBaseConnectionCache.connectionMap.size === 0) - assert(HBaseConnectionCache.getStat.numActualConnectionsCreated === 10) - assert(HBaseConnectionCache.getStat.numActiveConnections === 0) - } - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseContextSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseContextSuite.scala deleted file mode 100644 index 1e1e52dd4a2..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseContextSuite.scala +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.{ CellUtil, TableName, HBaseTestingUtility} -import org.apache.spark.{SparkException, Logging, SparkContext} -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -class HBaseContextSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - - @transient var sc: SparkContext = null - var TEST_UTIL = new HBaseTestingUtility - - val tableName = "t1" - val columnFamily = "c" - - override def beforeAll() { - TEST_UTIL.startMiniCluster() - logInfo(" - minicluster started") - - try { - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - } catch { - case e: Exception => - logInfo(" - no table " + tableName + " found") - } - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), Bytes.toBytes(columnFamily)) - logInfo(" - created table") - - val envMap = Map[String,String](("Xmx", "512m")) - - sc = new SparkContext("local", "test", null, Nil, envMap) - } - - override def afterAll() { - logInfo("shuting down minicluster") - TEST_UTIL.shutdownMiniCluster() - logInfo(" - minicluster shut down") - TEST_UTIL.cleanupTestDir() - sc.stop() - } - - test("bulkput to test HBase client") { - val config = TEST_UTIL.getConfiguration - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("foo2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("c"), Bytes.toBytes("foo3")))), - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("d"), Bytes.toBytes("foo")))), - (Bytes.toBytes("5"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("e"), Bytes.toBytes("bar")))))) - - val hbaseContext = new HBaseContext(sc, config) - hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - put - }) - - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - val foo1 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("1"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))) - assert(foo1 == "foo1") - - val foo2 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("2"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("b")))) - assert(foo2 == "foo2") - - val foo3 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("3"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("c")))) - assert(foo3 == "foo3") - - val foo4 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("4"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("d")))) - assert(foo4 == "foo") - - val foo5 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("5"))). 
- getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("e")))) - assert(foo5 == "bar") - - } finally { - table.close() - connection.close() - } - } - - test("bulkDelete to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("delete1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("delete2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("delete3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - - val rdd = sc.parallelize(Array( - Bytes.toBytes("delete1"), - Bytes.toBytes("delete3"))) - - val hbaseContext = new HBaseContext(sc, config) - hbaseContext.bulkDelete[Array[Byte]](rdd, - TableName.valueOf(tableName), - putRecord => new Delete(putRecord), - 4) - - assert(table.get(new Get(Bytes.toBytes("delete1"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")) == null) - assert(table.get(new Get(Bytes.toBytes("delete3"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")) == null) - assert(Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("delete2"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))).equals("foo2")) - } finally { - table.close() - connection.close() - } - } - - test("bulkGet to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - val getRdd = hbaseContext.bulkGet[Array[Byte], String]( - TableName.valueOf(tableName), - 2, - rdd, - record => { - new Get(record) - }, - (result: Result) => { - if (result.listCells() != null) { - val it = result.listCells().iterator() - val B = new StringBuilder - - B.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next() - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - B.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - B.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - "" + B.toString - } else { - "" - } - }) - val getArray = getRdd.collect() - - assert(getArray.length == 4) - assert(getArray.contains("get1:(a,foo1)")) - assert(getArray.contains("get2:(a,foo2)")) - assert(getArray.contains("get3:(a,foo3)")) - - } - - test("BulkGet failure test: bad table") { - val config = TEST_UTIL.getConfiguration - - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - 
Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - intercept[SparkException] { - try { - val getRdd = hbaseContext.bulkGet[Array[Byte], String]( - TableName.valueOf("badTableName"), - 2, - rdd, - record => { - new Get(record) - }, - (result: Result) => "1") - - getRdd.collect() - - fail("We should have failed and not reached this line") - } catch { - case ex: SparkException => { - assert( - ex.getMessage.contains( - "org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException")) - throw ex - } - } - } - } - - test("BulkGet failure test: bad column") { - - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - val getRdd = hbaseContext.bulkGet[Array[Byte], String]( - TableName.valueOf(tableName), - 2, - rdd, - record => { - new Get(record) - }, - (result: Result) => { - if (result.listCells() != null) { - val cellValue = result.getColumnLatestCell( - Bytes.toBytes("c"), Bytes.toBytes("bad_column")) - if (cellValue == null) "null" else "bad" - } else "noValue" - }) - var nullCounter = 0 - var noValueCounter = 0 - getRdd.collect().foreach(r => { - if ("null".equals(r)) nullCounter += 1 - else if ("noValue".equals(r)) noValueCounter += 1 - }) - assert(nullCounter == 3) - assert(noValueCounter == 1) - } - - test("distributedScan to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("scan1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("scan2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("scan2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("foo-2")) - table.put(put) - put = new Put(Bytes.toBytes("scan3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - put = new Put(Bytes.toBytes("scan4")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - put = new Put(Bytes.toBytes("scan5")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - - val hbaseContext = new HBaseContext(sc, config) - - val scan = new Scan() - val filter = new FirstKeyOnlyFilter() - scan.setCaching(100) - scan.setStartRow(Bytes.toBytes("scan2")) - scan.setStopRow(Bytes.toBytes("scan4_")) - scan.setFilter(filter) - - val scanRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), 
scan) - - try { - val scanList = scanRdd.map(r => r._1.copyBytes()).collect() - assert(scanList.length == 3) - var cnt = 0 - scanRdd.map(r => r._2.listCells().size()).collect().foreach(l => { - cnt += l - }) - // the number of cells returned would be 4 without the Filter - assert(cnt == 3); - } catch { - case ex: Exception => ex.printStackTrace() - } - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctionsSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctionsSuite.scala deleted file mode 100644 index e6767aedeeb..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctionsSuite.scala +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseTestingUtility} -import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.{Milliseconds, StreamingContext} -import org.apache.spark.{SparkContext, Logging} -import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._ -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -import scala.collection.mutable - -class HBaseDStreamFunctionsSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - @transient var sc: SparkContext = null - - var TEST_UTIL: HBaseTestingUtility = new HBaseTestingUtility - - val tableName = "t1" - val columnFamily = "c" - - override def beforeAll() { - - TEST_UTIL.startMiniCluster() - - logInfo(" - minicluster started") - try - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - catch { - case e: Exception => logInfo(" - no table " + tableName + " found") - - } - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), Bytes.toBytes(columnFamily)) - logInfo(" - created table") - - sc = new SparkContext("local", "test") - } - - override def afterAll() { - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - TEST_UTIL.shutdownMiniCluster() - sc.stop() - } - - test("bulkput to test HBase client") { - val config = TEST_UTIL.getConfiguration - val rdd1 = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("foo2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("c"), Bytes.toBytes("foo3")))))) - - val rdd2 = sc.parallelize(Array( - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("d"), Bytes.toBytes("foo")))), - (Bytes.toBytes("5"), - 
Array((Bytes.toBytes(columnFamily), Bytes.toBytes("e"), Bytes.toBytes("bar")))))) - - var isFinished = false - - val hbaseContext = new HBaseContext(sc, config) - val ssc = new StreamingContext(sc, Milliseconds(200)) - - val queue = mutable.Queue[RDD[(Array[Byte], Array[(Array[Byte], - Array[Byte], Array[Byte])])]]() - queue += rdd1 - queue += rdd2 - val dStream = ssc.queueStream(queue) - - dStream.hbaseBulkPut( - hbaseContext, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - put - }) - - dStream.foreachRDD(rdd => { - if (rdd.count() == 0) { - isFinished = true - } - }) - - ssc.start() - - while (!isFinished) { - Thread.sleep(100) - } - - ssc.stop(true, true) - - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - val foo1 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("1"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))) - assert(foo1 == "foo1") - - val foo2 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("2"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("b")))) - assert(foo2 == "foo2") - - val foo3 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("3"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("c")))) - assert(foo3 == "foo3") - - val foo4 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("4"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("d")))) - assert(foo4 == "foo") - - val foo5 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("5"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("e")))) - assert(foo5 == "bar") - } finally { - table.close() - connection.close() - } - } - -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctionsSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctionsSuite.scala deleted file mode 100644 index 89148c39a5d..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctionsSuite.scala +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.client._ -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseTestingUtility} -import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ -import org.apache.spark.{Logging, SparkContext} -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -import scala.collection.mutable - -class HBaseRDDFunctionsSuite extends FunSuite with -BeforeAndAfterEach with BeforeAndAfterAll with Logging { - @transient var sc: SparkContext = null - var TEST_UTIL: HBaseTestingUtility = new HBaseTestingUtility - - val tableName = "t1" - val columnFamily = "c" - - override def beforeAll() { - - TEST_UTIL.startMiniCluster - - logInfo(" - minicluster started") - try - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - catch { - case e: Exception => logInfo(" - no table " + tableName + " found") - - } - logInfo(" - creating table " + tableName) - TEST_UTIL.createTable(TableName.valueOf(tableName), Bytes.toBytes(columnFamily)) - logInfo(" - created table") - - sc = new SparkContext("local", "test") - } - - override def afterAll() { - TEST_UTIL.deleteTable(TableName.valueOf(tableName)) - logInfo("shuting down minicluster") - TEST_UTIL.shutdownMiniCluster() - - sc.stop() - } - - test("bulkput to test HBase client") { - val config = TEST_UTIL.getConfiguration - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")))), - (Bytes.toBytes("2"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("foo2")))), - (Bytes.toBytes("3"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("c"), Bytes.toBytes("foo3")))), - (Bytes.toBytes("4"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("d"), Bytes.toBytes("foo")))), - (Bytes.toBytes("5"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("e"), Bytes.toBytes("bar")))))) - - val hbaseContext = new HBaseContext(sc, config) - - rdd.hbaseBulkPut( - hbaseContext, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - put - }) - - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - val foo1 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("1"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))) - assert(foo1 == "foo1") - - val foo2 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("2"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("b")))) - assert(foo2 == "foo2") - - val foo3 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("3"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("c")))) - assert(foo3 == "foo3") - - val foo4 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("4"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("d")))) - assert(foo4 == "foo") - - val foo5 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("5"))). 
- getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("e")))) - assert(foo5 == "bar") - } finally { - table.close() - connection.close() - } - } - - test("bulkDelete to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("delete1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("delete2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("delete3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - - val rdd = sc.parallelize(Array( - Bytes.toBytes("delete1"), - Bytes.toBytes("delete3"))) - - val hbaseContext = new HBaseContext(sc, config) - - rdd.hbaseBulkDelete(hbaseContext, - TableName.valueOf(tableName), - putRecord => new Delete(putRecord), - 4) - - assert(table.get(new Get(Bytes.toBytes("delete1"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")) == null) - assert(table.get(new Get(Bytes.toBytes("delete3"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")) == null) - assert(Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("delete2"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))).equals("foo2")) - } finally { - table.close() - connection.close() - } - - } - - test("bulkGet to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - //Get with custom convert logic - val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2, - record => { - new Get(record) - }, - (result: Result) => { - if (result.listCells() != null) { - val it = result.listCells().iterator() - val B = new StringBuilder - - B.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - B.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - B.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - "" + B.toString - } else { - "" - } - }) - - val getArray = getRdd.collect() - - assert(getArray.length == 4) - assert(getArray.contains("get1:(a,foo1)")) - assert(getArray.contains("get2:(a,foo2)")) - assert(getArray.contains("get3:(a,foo3)")) - } - - test("bulkGet default converter to test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = 
connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - val getRdd = rdd.hbaseBulkGet(hbaseContext, TableName.valueOf("t1"), 2, - record => { - new Get(record) - }).map((row) => { - if (row != null && row._2.listCells() != null) { - val it = row._2.listCells().iterator() - val B = new StringBuilder - - B.append(Bytes.toString(row._2.getRow) + ":") - - while (it.hasNext) { - val cell = it.next - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - B.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - B.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - "" + B.toString - } else { - "" - }}) - - val getArray = getRdd.collect() - - assert(getArray.length == 4) - assert(getArray.contains("get1:(a,foo1)")) - assert(getArray.contains("get2:(a,foo2)")) - assert(getArray.contains("get3:(a,foo3)")) - } - - test("foreachPartition with puts to test HBase client") { - val config = TEST_UTIL.getConfiguration - val rdd = sc.parallelize(Array( - (Bytes.toBytes("1foreach"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")))), - (Bytes.toBytes("2foreach"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("foo2")))), - (Bytes.toBytes("3foreach"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("c"), Bytes.toBytes("foo3")))), - (Bytes.toBytes("4foreach"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("d"), Bytes.toBytes("foo")))), - (Bytes.toBytes("5foreach"), - Array((Bytes.toBytes(columnFamily), Bytes.toBytes("e"), Bytes.toBytes("bar")))))) - - val hbaseContext = new HBaseContext(sc, config) - - rdd.hbaseForeachPartition(hbaseContext, (it, conn) => { - val bufferedMutator = conn.getBufferedMutator(TableName.valueOf("t1")) - it.foreach((putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - bufferedMutator.mutate(put) - }) - bufferedMutator.flush() - bufferedMutator.close() - }) - - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - val foo1 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("1foreach"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("a")))) - assert(foo1 == "foo1") - - val foo2 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("2foreach"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("b")))) - assert(foo2 == "foo2") - - val foo3 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("3foreach"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("c")))) - assert(foo3 == "foo3") - - val foo4 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("4foreach"))). 
- getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("d")))) - assert(foo4 == "foo") - - val foo5 = Bytes.toString(CellUtil.cloneValue(table.get(new Get(Bytes.toBytes("5"))). - getColumnLatestCell(Bytes.toBytes(columnFamily), Bytes.toBytes("e")))) - assert(foo5 == "bar") - } finally { - table.close() - connection.close() - } - } - - test("mapPartitions with Get from test HBase client") { - val config = TEST_UTIL.getConfiguration - val connection = ConnectionFactory.createConnection(config) - val table = connection.getTable(TableName.valueOf("t1")) - - try { - var put = new Put(Bytes.toBytes("get1")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1")) - table.put(put) - put = new Put(Bytes.toBytes("get2")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2")) - table.put(put) - put = new Put(Bytes.toBytes("get3")) - put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3")) - table.put(put) - } finally { - table.close() - connection.close() - } - - val rdd = sc.parallelize(Array( - Bytes.toBytes("get1"), - Bytes.toBytes("get2"), - Bytes.toBytes("get3"), - Bytes.toBytes("get4"))) - val hbaseContext = new HBaseContext(sc, config) - - //Get with custom convert logic - val getRdd = rdd.hbaseMapPartitions(hbaseContext, (it, conn) => { - val table = conn.getTable(TableName.valueOf("t1")) - var res = mutable.MutableList[String]() - - it.foreach(r => { - val get = new Get(r) - val result = table.get(get) - if (result.listCells != null) { - val it = result.listCells().iterator() - val B = new StringBuilder - - B.append(Bytes.toString(result.getRow) + ":") - - while (it.hasNext) { - val cell = it.next() - val q = Bytes.toString(CellUtil.cloneQualifier(cell)) - if (q.equals("counter")) { - B.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") - } else { - B.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") - } - } - res += "" + B.toString - } else { - res += "" - } - }) - res.iterator - }) - - val getArray = getRdd.collect() - - assert(getArray.length == 4) - assert(getArray.contains("get1:(a,foo1)")) - assert(getArray.contains("get2:(a,foo2)")) - assert(getArray.contains("get3:(a,foo3)")) - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseTestSource.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseTestSource.scala deleted file mode 100644 index ccb4625619d..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/HBaseTestSource.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf -import org.apache.spark.SparkEnv -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types._ - -class HBaseTestSource extends RelationProvider { - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - DummyScan( - parameters("cacheSize").toInt, - parameters("batchNum").toInt, - parameters("blockCacheingEnable").toBoolean, - parameters("rowNum").toInt)(sqlContext) - } -} - -case class DummyScan( - cacheSize: Int, - batchNum: Int, - blockCachingEnable: Boolean, - rowNum: Int)(@transient val sqlContext: SQLContext) - extends BaseRelation with TableScan { - private def sparkConf = SparkEnv.get.conf - override def schema: StructType = - StructType(StructField("i", IntegerType, nullable = false) :: Nil) - - override def buildScan(): RDD[Row] = sqlContext.sparkContext.parallelize(0 until rowNum) - .map(Row(_)) - .map{ x => - if (sparkConf.getInt(HBaseSparkConf.QUERY_BATCHSIZE, - -1) != batchNum || - sparkConf.getInt(HBaseSparkConf.QUERY_CACHEDROWS, - -1) != cacheSize || - sparkConf.getBoolean(HBaseSparkConf.QUERY_CACHEBLOCKS, - false) != blockCachingEnable) { - throw new Exception("HBase Spark configuration cannot be set properly") - } - x - } -} diff --git a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/PartitionFilterSuite.scala b/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/PartitionFilterSuite.scala deleted file mode 100644 index f47a319b13d..00000000000 --- a/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/PartitionFilterSuite.scala +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hbase.spark - -import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf -import org.apache.hadoop.hbase.{TableName, HBaseTestingUtility} -import org.apache.spark.sql.datasources.hbase.HBaseTableCatalog -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.apache.spark.{SparkConf, SparkContext, Logging} -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} - -case class FilterRangeRecord( - intCol0: Int, - boolCol1: Boolean, - doubleCol2: Double, - floatCol3: Float, - intCol4: Int, - longCol5: Long, - shortCol6: Short, - stringCol7: String, - byteCol8: Byte) - -object FilterRangeRecord { - def apply(i: Int): FilterRangeRecord = { - FilterRangeRecord(if (i % 2 == 0) i else -i, - i % 2 == 0, - if (i % 2 == 0) i.toDouble else -i.toDouble, - i.toFloat, - if (i % 2 == 0) i else -i, - i.toLong, - i.toShort, - s"String$i extra", - i.toByte) - } -} - -class PartitionFilterSuite extends FunSuite with - BeforeAndAfterEach with BeforeAndAfterAll with Logging { - @transient var sc: SparkContext = null - var TEST_UTIL: HBaseTestingUtility = new HBaseTestingUtility - - var sqlContext: SQLContext = null - var df: DataFrame = null - - def withCatalog(cat: String): DataFrame = { - sqlContext - .read - .options(Map(HBaseTableCatalog.tableCatalog -> cat)) - .format("org.apache.hadoop.hbase.spark") - .load() - } - - override def beforeAll() { - - TEST_UTIL.startMiniCluster - val sparkConf = new SparkConf - sparkConf.set(HBaseSparkConf.QUERY_CACHEBLOCKS, "true") - sparkConf.set(HBaseSparkConf.QUERY_BATCHSIZE, "100") - sparkConf.set(HBaseSparkConf.QUERY_CACHEDROWS, "100") - - sc = new SparkContext("local", "test", sparkConf) - new HBaseContext(sc, TEST_UTIL.getConfiguration) - sqlContext = new SQLContext(sc) - } - - override def afterAll() { - logInfo("shutting down minicluster") - TEST_UTIL.shutdownMiniCluster() - - sc.stop() - } - - override def beforeEach(): Unit = { - DefaultSourceStaticUtils.lastFiveExecutionRules.clear() - } - - // The original raw data used for construct result set without going through - // data frame logic. It is used to verify the result set retrieved from data frame logic. 
- val rawResult = (0 until 32).map { i => - FilterRangeRecord(i) - } - - def collectToSet[T](df: DataFrame): Set[T] = { - df.collect().map(_.getAs[T](0)).toSet - } - val catalog = s"""{ - |"table":{"namespace":"default", "name":"rangeTable"}, - |"rowkey":"key", - |"columns":{ - |"intCol0":{"cf":"rowkey", "col":"key", "type":"int"}, - |"boolCol1":{"cf":"cf1", "col":"boolCol1", "type":"boolean"}, - |"doubleCol2":{"cf":"cf2", "col":"doubleCol2", "type":"double"}, - |"floatCol3":{"cf":"cf3", "col":"floatCol3", "type":"float"}, - |"intCol4":{"cf":"cf4", "col":"intCol4", "type":"int"}, - |"longCol5":{"cf":"cf5", "col":"longCol5", "type":"bigint"}, - |"shortCol6":{"cf":"cf6", "col":"shortCol6", "type":"smallint"}, - |"stringCol7":{"cf":"cf7", "col":"stringCol7", "type":"string"}, - |"byteCol8":{"cf":"cf8", "col":"byteCol8", "type":"tinyint"} - |} - |}""".stripMargin - - test("populate rangeTable") { - val sql = sqlContext - import sql.implicits._ - - sc.parallelize(rawResult).toDF.write.options( - Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5")) - .format("org.apache.hadoop.hbase.spark") - .save() - } - test("rangeTable full query") { - val df = withCatalog(catalog) - df.show - assert(df.count() === 32) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| -31 | - *| -29 | - *| -27 | - *| -25 | - *| -23 | - *| -21 | - *| -19 | - *| -17 | - *| -15 | - *| -13 | - *| -11 | - *| -9 | - *| -7 | - *| -5 | - *| -3 | - *| -1 | - *+---- + - */ - test("rangeTable rowkey less than 0") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" < 0).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 < 0).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol4| - *+-------+ - *| -31 | - *| -29 | - *| -27 | - *| -25 | - *| -23 | - *| -21 | - *| -19 | - *| -17 | - *| -15 | - *| -13 | - *| -11 | - *| -9 | - *| -7 | - *| -5 | - *| -3 | - *| -1 | - *+-------+ - */ - test("rangeTable int col less than 0") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol4" < 0).select($"intCol4") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol4 < 0).map(_.intCol4).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-----------+ - *| doubleCol2| - *+-----------+ - *| 0.0 | - *| 2.0 | - *|-31.0 | - *|-29.0 | - *|-27.0 | - *|-25.0 | - *|-23.0 | - *|-21.0 | - *|-19.0 | - *|-17.0 | - *|-15.0 | - *|-13.0 | - *|-11.0 | - *| -9.0 | - *| -7.0 | - *| -5.0 | - *| -3.0 | - *| -1.0 | - *+-----------+ - */ - test("rangeTable double col less than 0") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"doubleCol2" < 3.0).select($"doubleCol2") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.doubleCol2 < 3.0).map(_.doubleCol2).toSet - // filter results going through dataframe - val result = collectToSet[Double](s) - assert(expected === result) - } - - /** - * expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| -31 | - *| 
-29 | - *| -27 | - *| -25 | - *| -23 | - *| -21 | - *| -19 | - *| -17 | - *| -15 | - *| -13 | - *| -11 | - *+-------+ - * - */ - test("rangeTable lessequal than -10") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" <= -10).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 <= -10).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+----+ - *| -31 | - *| -29 | - *| -27 | - *| -25 | - *| -23 | - *| -21 | - *| -19 | - *| -17 | - *| -15 | - *| -13 | - *| -11 | - *| -9 | - *+-------+ - */ - test("rangeTable lessequal than -9") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" <= -9).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 <= -9).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 0 | - *| 2 | - *| 4 | - *| 6 | - *| 8 | - *| 10 | - *| 12 | - *| 14 | - *| 16 | - *| 18 | - *| 20 | - *| 22 | - *| 24 | - *| 26 | - *| 28 | - *| 30 | - *| -9 | - *| -7 | - *| -5 | - *| -3 | - *+-------+ - */ - test("rangeTable greaterequal than -9") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" >= -9).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 >= -9).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 0 | - *| 2 | - *| 4 | - *| 6 | - *| 8 | - *| 10 | - *| 12 | - *| 14 | - *| 16 | - *| 18 | - *| 20 | - *| 22 | - *| 24 | - *| 26 | - *| 28 | - *| 30 | - *+-------+ - */ - test("rangeTable greaterequal than 0") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" >= 0).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 >= 0).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 12 | - *| 14 | - *| 16 | - *| 18 | - *| 20 | - *| 22 | - *| 24 | - *| 26 | - *| 28 | - *| 30 | - *+-------+ - */ - test("rangeTable greater than 10") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" > 10).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 > 10).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 0 | - *| 2 | - *| 4 | - *| 6 | - *| 8 | - *| 10 | - *| -9 | - *| -7 | - *| -5 | - *| -3 | - *| -1 | - *+-------+ - */ - test("rangeTable and") { - val sql = sqlContext - import 
sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" > -10 && $"intCol0" <= 10).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(x => x.intCol0 > -10 && x.intCol0 <= 10 ).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 12 | - *| 14 | - *| 16 | - *| 18 | - *| 20 | - *| 22 | - *| 24 | - *| 26 | - *| 28 | - *| 30 | - *| -31 | - *| -29 | - *| -27 | - *| -25 | - *| -23 | - *| -21 | - *| -19 | - *| -17 | - *| -15 | - *| -13 | - *+-------+ - */ - - test("or") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" <= -10 || $"intCol0" > 10).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(x => x.intCol0 <= -10 || x.intCol0 > 10).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } - - /** - *expected result: only showing top 20 rows - *+-------+ - *|intCol0| - *+-------+ - *| 0 | - *| 2 | - *| 4 | - *| 6 | - *| 8 | - *| 10 | - *| 12 | - *| 14 | - *| 16 | - *| 18 | - *| 20 | - *| 22 | - *| 24 | - *| 26 | - *| 28 | - *| 30 | - *| -31 | - *| -29 | - *| -27 | - *| -25 | - *+-------+ - */ - test("rangeTable all") { - val sql = sqlContext - import sql.implicits._ - val df = withCatalog(catalog) - val s = df.filter($"intCol0" >= -100).select($"intCol0") - s.show - // filter results without going through dataframe - val expected = rawResult.filter(_.intCol0 >= -100).map(_.intCol0).toSet - // filter results going through dataframe - val result = collectToSet[Int](s) - assert(expected === result) - } -} diff --git a/pom.xml b/pom.xml index e4a040f494e..a64d56c0811 100644 --- a/pom.xml +++ b/pom.xml @@ -85,11 +85,9 @@ hbase-checkstyle hbase-external-blockcache hbase-shaded - hbase-spark hbase-archetypes hbase-metrics-api hbase-metrics - hbase-spark-it hbase-backup @@ -3305,7 +3271,7 @@ **/protobuf/* **/*.scala - org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark:org.apache.hadoop.hbase.generated* + org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.generated* private true true @@ -3348,7 +3314,7 @@ **/protobuf/* **/*.scala - org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.spark:org.apache.hadoop.hbase.generated* + org.apache.hadoop.hbase.tmpl.common:com.google.protobuf:org.apache.hadoop.hbase.generated* private true true diff --git a/src/main/asciidoc/_chapters/spark.adoc b/src/main/asciidoc/_chapters/spark.adoc deleted file mode 100644 index 774d137fc34..00000000000 --- a/src/main/asciidoc/_chapters/spark.adoc +++ /dev/null @@ -1,690 +0,0 @@ -//// -/** - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - . . 
http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -//// - -[[spark]] -= HBase and Spark -:doctype: book -:numbered: -:toc: left -:icons: font -:experimental: - -link:http://spark.apache.org/[Apache Spark] is a software framework that is used -to process data in memory in a distributed manner, and is replacing MapReduce in -many use cases. - -Spark itself is out of scope of this document; please refer to the Spark site for -more information on the Spark project and subprojects. This document will focus -on 4 main interaction points between Spark and HBase. Those interaction points are: - -Basic Spark:: - The ability to have an HBase Connection at any point in your Spark DAG. -Spark Streaming:: - The ability to have an HBase Connection at any point in your Spark Streaming - application. -Spark Bulk Load:: - The ability to write directly to HBase HFiles for bulk insertion into HBase. -SparkSQL/DataFrames:: - The ability to write SparkSQL that draws on tables that are represented in HBase. - -The following sections will walk through examples of all these interaction points. - -== Basic Spark - -This section discusses Spark HBase integration at the lowest and simplest levels. -All the other interaction points are built upon the concepts that will be described -here. - -At the root of all Spark and HBase integration is the HBaseContext. The HBaseContext -takes in HBase configurations and pushes them to the Spark executors. This allows -us to have an HBase Connection per Spark Executor in a static location. - -For reference, Spark Executors can be on the same nodes as the Region Servers or -on different nodes; there is no dependency on co-location. Think of every Spark -Executor as a multi-threaded client application. This allows any Spark Tasks -running on the executors to access the shared Connection object. - -.HBaseContext Usage Example -==== - -This example shows how HBaseContext can be used to do a `foreachPartition` on an RDD -in Scala: - -[source, scala] ----- -val sc = new SparkContext("local", "test") -val config = new HBaseConfiguration() - -... - -val hbaseContext = new HBaseContext(sc, config) - -rdd.hbaseForeachPartition(hbaseContext, (it, conn) => { - val bufferedMutator = conn.getBufferedMutator(TableName.valueOf("t1")) - it.foreach((putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - bufferedMutator.mutate(put) - }) - bufferedMutator.flush() - bufferedMutator.close() -}) ----- - -Here is the same example implemented in Java: - -[source, java] ----- -JavaSparkContext jsc = new JavaSparkContext(sparkConf); - -try { - List<byte[]> list = new ArrayList<>(); - list.add(Bytes.toBytes("1")); - ... 
- list.add(Bytes.toBytes("5")); - - JavaRDD<byte[]> rdd = jsc.parallelize(list); - Configuration conf = HBaseConfiguration.create(); - - JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); - - hbaseContext.foreachPartition(rdd, - new VoidFunction<Tuple2<Iterator<byte[]>, Connection>>() { - public void call(Tuple2<Iterator<byte[]>, Connection> t) - throws Exception { - Table table = t._2().getTable(TableName.valueOf(tableName)); - BufferedMutator mutator = t._2().getBufferedMutator(TableName.valueOf(tableName)); - while (t._1().hasNext()) { - byte[] b = t._1().next(); - Result r = table.get(new Get(b)); - if (r.getExists()) { - mutator.mutate(new Put(b)); - } - } - - mutator.flush(); - mutator.close(); - table.close(); - } - }); -} finally { - jsc.stop(); -} ----- -==== - -All functionality between Spark and HBase will be supported both in Scala and in -Java, with the exception of SparkSQL, which will support any language that is -supported by Spark. For the remainder of this documentation, we will focus on -Scala examples. - -The examples above illustrate how to do a foreachPartition with a connection. A -number of other Spark base functions are supported out of the box: - -// tag::spark_base_functions[] -`bulkPut`:: For massively parallel sending of puts to HBase -`bulkDelete`:: For massively parallel sending of deletes to HBase -`bulkGet`:: For massively parallel sending of gets to HBase to create a new RDD -`mapPartition`:: To do a Spark Map function with a Connection object to allow full -access to HBase -`hBaseRDD`:: To simplify a distributed scan to create an RDD -// end::spark_base_functions[] - -For examples of all these functionalities, see the HBase-Spark Module. - -== Spark Streaming -http://spark.apache.org/streaming/[Spark Streaming] is a micro-batching stream -processing framework built on top of Spark. HBase and Spark Streaming make great -companions in that HBase can provide the following benefits alongside Spark -Streaming: - -* A place to grab reference data or profile data on the fly -* A place to store counts or aggregates in a way that supports Spark Streaming's -promise of _only once processing_. - -The HBase-Spark module’s integration points with Spark Streaming are similar to -its normal Spark integration points, in that the following commands are possible -straight off a Spark Streaming DStream. - -include::spark.adoc[tags=spark_base_functions] - -.`bulkPut` Example with DStreams -==== - -Below is an example of bulkPut with DStreams. It is very close in feel to the RDD -bulk put. - -[source, scala] ----- -val sc = new SparkContext("local", "test") -val config = new HBaseConfiguration() - -val hbaseContext = new HBaseContext(sc, config) -val ssc = new StreamingContext(sc, Milliseconds(200)) - -val rdd1 = ... -val rdd2 = ... - -val queue = mutable.Queue[RDD[(Array[Byte], Array[(Array[Byte], - Array[Byte], Array[Byte])])]]() - -queue += rdd1 -queue += rdd2 - -val dStream = ssc.queueStream(queue) - -dStream.hbaseBulkPut( - hbaseContext, - TableName.valueOf(tableName), - (putRecord) => { - val put = new Put(putRecord._1) - putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) - put - }) ----- - -There are three inputs to the `hbaseBulkPut` function: -. The hbaseContext, which carries the configuration broadcast information that links us -to the HBase Connections in the executors. -. The table name of the table we are putting data into. -. A function that will convert a record in the DStream into an HBase Put object.
-==== - -== Bulk Load - -There are two options for bulk loading data into HBase with Spark. There is the -basic bulk load functionality that will work for cases where your rows have -millions of columns and cases where your columns are not consolidated and -partitioned before the map side of the Spark bulk load process. - -There is also a thin record bulk load option with Spark; this second option is -designed for tables that have less than 10k columns per row. The advantage -of this second option is higher throughput and less overall load on the Spark -shuffle operation. - -Both implementations work more or less like the MapReduce bulk load process in -that a partitioner partitions the rowkeys based on region splits and -the row keys are sent to the reducers in order, so that HFiles can be written -out directly from the reduce phase. - -In Spark terms, the bulk load will be implemented around the Spark -`repartitionAndSortWithinPartitions` followed by a Spark `foreachPartition`. - -First let's look at an example of using the basic bulk load functionality. - -.Bulk Loading Example -==== - -The following example shows bulk loading with Spark. - -[source, scala] ----- -val sc = new SparkContext("local", "test") -val config = new HBaseConfiguration() - -val hbaseContext = new HBaseContext(sc, config) - -val stagingFolder = ... -val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo2.b"))), ... - -rdd.hbaseBulkLoad(TableName.valueOf(tableName), - t => { - val rowKey = t._1 - val family:Array[Byte] = t._2(0)._1 - val qualifier = t._2(0)._2 - val value = t._2(0)._3 - - val keyFamilyQualifier = new KeyFamilyQualifier(rowKey, family, qualifier) - - Seq((keyFamilyQualifier, value)).iterator - }, - stagingFolder.getPath) - -val load = new LoadIncrementalHFiles(config) -load.doBulkLoad(new Path(stagingFolder.getPath), - conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName))) ----- -==== - -The `hbaseBulkLoad` function takes three required parameters: - -. The table name of the table we intend to bulk load into. - -. A function that will convert a record in the RDD to a tuple key value pair, with -the tuple key being a KeyFamilyQualifier object and the value being the cell value. -The KeyFamilyQualifier object will hold the RowKey, Column Family, and Column Qualifier. -The shuffle will partition on the RowKey but will sort by all three values. - -. The temporary path for the HFiles to be written out to. - -Following the Spark bulk load command, use HBase's LoadIncrementalHFiles object -to load the newly created HFiles into HBase. - -.Additional Parameters for Bulk Loading with Spark - -You can set the following attributes with additional parameter options on hbaseBulkLoad: - -* Max file size of the HFiles -* A flag to exclude HFiles from compactions -* Column Family settings for compression, bloomType, blockSize, and dataBlockEncoding - -.Using Additional Parameters -==== - -[source, scala] ----- -val sc = new SparkContext("local", "test") -val config = new HBaseConfiguration() - -val hbaseContext = new HBaseContext(sc, config) - -val stagingFolder = ... -val rdd = sc.parallelize(Array( - (Bytes.toBytes("1"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))), - (Bytes.toBytes("3"), - (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo2.b"))), ... 

val familyHBaseWriterOptions = new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions]
val f1Options = new FamilyHFileWriteOptions("GZ", "ROW", 128, "PREFIX")

familyHBaseWriterOptions.put(Bytes.toBytes("columnFamily1"), f1Options)

rdd.hbaseBulkLoad(TableName.valueOf(tableName),
  t => {
    val rowKey = t._1
    val family: Array[Byte] = t._2(0)._1
    val qualifier = t._2(0)._2
    val value = t._2(0)._3

    val keyFamilyQualifier = new KeyFamilyQualifier(rowKey, family, qualifier)

    Seq((keyFamilyQualifier, value)).iterator
  },
  stagingFolder.getPath,
  familyHBaseWriterOptions,
  compactionExclude = false,
  HConstants.DEFAULT_MAX_FILE_SIZE)

val load = new LoadIncrementalHFiles(config)
load.doBulkLoad(new Path(stagingFolder.getPath),
  conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName)))
----
====

Now let's look at how to call the thin record bulk load implementation.

.Using thin record bulk load
====

[source, scala]
----
val sc = new SparkContext("local", "test")
val config = new HBaseConfiguration()

val hbaseContext = new HBaseContext(sc, config)

val stagingFolder = ...
val rdd = sc.parallelize(Array(
  ("1",
    (Bytes.toBytes(columnFamily1), Bytes.toBytes("a"), Bytes.toBytes("foo1"))),
  ("3",
    (Bytes.toBytes(columnFamily1), Bytes.toBytes("b"), Bytes.toBytes("foo2.b"))), ...

rdd.hbaseBulkLoadThinRows(hbaseContext,
  TableName.valueOf(tableName),
  t => {
    val rowKey = t._1

    val familyQualifiersValues = new FamiliesQualifiersValues
    t._2.foreach(f => {
      val family: Array[Byte] = f._1
      val qualifier = f._2
      val value: Array[Byte] = f._3

      familyQualifiersValues += (family, qualifier, value)
    })
    (new ByteArrayWrapper(Bytes.toBytes(rowKey)), familyQualifiersValues)
  },
  stagingFolder.getPath,
  new java.util.HashMap[Array[Byte], FamilyHFileWriteOptions],
  compactionExclude = false,
  20)

val load = new LoadIncrementalHFiles(config)
load.doBulkLoad(new Path(stagingFolder.getPath),
  conn.getAdmin, table, conn.getRegionLocator(TableName.valueOf(tableName)))
----
====

Note that the big difference when using bulk load for thin rows is that the
function returns a tuple, with the first value being the row key and the second
value being an object of FamiliesQualifiersValues, which contains all the values
for this row for all column families.

== SparkSQL/DataFrames

The HBase-Spark Connector (in the HBase-Spark Module) leverages the
link:https://databricks.com/blog/2015/01/09/spark-sql-data-sources-api-unified-data-access-for-the-spark-platform.html[DataSource API]
(link:https://issues.apache.org/jira/browse/SPARK-3247[SPARK-3247])
introduced in Spark 1.2.0. It bridges the gap between the simple HBase KV store and
complex relational SQL queries and enables users to perform complex data analytics
on top of HBase using Spark. An HBase DataFrame is a standard Spark DataFrame, and is
able to interact with any other data sources such as Hive, ORC, Parquet, JSON, etc.
The HBase-Spark Connector applies critical techniques such as partition pruning,
column pruning, predicate pushdown and data locality.

To use the HBase-Spark connector, users need to define the Catalog for the schema mapping
between HBase and Spark tables, prepare the data and populate the HBase table,
then load the HBase DataFrame. After that, users can run integrated queries and access
records in the HBase table with SQL. The following illustrates the basic procedure.
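
The examples in the following sections assume a `SparkContext` named `sc` and an
`SQLContext` with its implicits in scope, roughly as sketched below. The connector
classes (`HBaseTableCatalog`, `HBaseSparkConf`) ship with the hbase-spark module; the
package names shown here are an assumption and may differ between versions, so verify
them against your build.

[source, scala]
----
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
// Connector classes from the hbase-spark module; package names assumed.
import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, HBaseTableCatalog}

val sc = new SparkContext(new SparkConf().setAppName("hbase-spark-sql").setMaster("local"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._  // enables .toDF on RDDs of case classes
----
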
=== Define catalog

[source, scala]
----
def catalog = s"""{
    |"table":{"namespace":"default", "name":"table1"},
    |"rowkey":"key",
    |"columns":{
      |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
      |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
      |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
      |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
      |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
      |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
      |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
      |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
      |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
    |}
  |}""".stripMargin
----

The catalog defines a mapping between HBase and Spark tables. There are two critical parts
to this catalog. One is the rowkey definition; the other is the mapping between a table column
in Spark and the column family and column qualifier in HBase. The above defines a schema for
an HBase table named table1, with row key as key and a number of columns (col1 through col8).
Note that the rowkey also has to be defined in detail as a column (col0), which has a specific
column family (rowkey).

=== Save the DataFrame

[source, scala]
----
case class HBaseRecord(
  col0: String,
  col1: Boolean,
  col2: Double,
  col3: Float,
  col4: Int,
  col5: Long,
  col6: Short,
  col7: String,
  col8: Byte)

object HBaseRecord
{
  def apply(i: Int, t: String): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i: $t",
      i.toByte)
  }
}

val data = (0 to 255).map { i => HBaseRecord(i, "extra") }

sc.parallelize(data).toDF.write.options(
  Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
  .format("org.apache.hadoop.hbase.spark")
  .save()
----

`data` prepared by the user is a local Scala collection which has 256 HBaseRecord objects.
The `sc.parallelize(data)` function distributes `data` to form an RDD. `toDF` returns a DataFrame.
The `write` function returns a DataFrameWriter used to write the DataFrame to external storage
systems (e.g. HBase here). Given a DataFrame with the specified schema `catalog`, the `save`
function will create an HBase table with 5 regions and save the DataFrame inside.

=== Load the DataFrame

[source, scala]
----
def withCatalog(cat: String): DataFrame = {
  sqlContext
    .read
    .options(Map(HBaseTableCatalog.tableCatalog -> cat))
    .format("org.apache.hadoop.hbase.spark")
    .load()
}
val df = withCatalog(catalog)
----

In the `withCatalog` function, sqlContext is a variable of SQLContext, which is the entry point
for working with structured data (rows and columns) in Spark.
`read` returns a DataFrameReader that can be used to read data in as a DataFrame.
The `options` function adds input options for the underlying data source to the DataFrameReader,
and the `format` function specifies the input data source format for the DataFrameReader.
The `load()` function loads input in as a DataFrame. The DataFrame `df` returned by the
`withCatalog` function can be used to access the HBase table, as in the Language Integrated
Query and SQL Query sections below.
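
As a quick sanity check before querying, you can inspect the mapped schema and a few rows
with standard Spark DataFrame calls (nothing connector-specific is assumed here):

[source, scala]
----
df.printSchema()  // col0..col8 with the Catalyst types declared in the catalog
df.show(5)        // fetches and displays a handful of rows from the HBase table
----
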
=== Language Integrated Query

[source, scala]
----
val s = df.filter(($"col0" <= "row050" && $"col0" > "row040") ||
  $"col0" === "row005" ||
  $"col0" <= "row005")
  .select("col0", "col1", "col4")
s.show
----

A DataFrame can perform various operations, such as join, sort, select, filter, orderBy and so on.
`df.filter` above filters rows using the given SQL expression. `select` selects a set of columns:
`col0`, `col1` and `col4`.

=== SQL Query

[source, scala]
----
df.registerTempTable("table1")
sqlContext.sql("select count(col1) from table1").show
----

`registerTempTable` registers the `df` DataFrame as a temporary table using the table name `table1`.
The lifetime of this temporary table is tied to the SQLContext that was used to create `df`.
The `sqlContext.sql` function allows the user to execute SQL queries.

=== Others

.Query with different timestamps
====
In HBaseSparkConf, four parameters related to timestamps can be set: TIMESTAMP,
MIN_TIMESTAMP, MAX_TIMESTAMP and MAX_VERSIONS. Users can query records with
different timestamps or time ranges with MIN_TIMESTAMP and MAX_TIMESTAMP.
Use concrete values in place of tsSpecified and oldMs in the examples below.

The example below shows how to load the df DataFrame with a specific timestamp.
tsSpecified is specified by the user.
HBaseTableCatalog defines the mapping between the HBase table and the Spark relation schema.
writeCatalog defines the catalog for the schema mapping.

[source, scala]
----
val df = sqlContext.read
  .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseSparkConf.TIMESTAMP -> tsSpecified.toString))
  .format("org.apache.hadoop.hbase.spark")
  .load()
----

The example below shows how to load the df DataFrame with a time range.
oldMs is specified by the user.

[source, scala]
----
val df = sqlContext.read
  .options(Map(HBaseTableCatalog.tableCatalog -> writeCatalog, HBaseSparkConf.MIN_TIMESTAMP -> "0",
    HBaseSparkConf.MAX_TIMESTAMP -> oldMs.toString))
  .format("org.apache.hadoop.hbase.spark")
  .load()
----

After loading the df DataFrame, users can query the data.

[source, scala]
----
df.registerTempTable("table")
sqlContext.sql("select count(col1) from table").show
----
====

.Native Avro support
====
The HBase-Spark Connector supports different data formats such as Avro, JSON, etc. The use case
below shows how Spark supports Avro. Users can persist Avro records into HBase directly. Internally,
the Avro schema is converted to a native Spark Catalyst data type automatically.
Note that both the key and value parts in an HBase table can be defined in Avro format.

1) Define the catalog for the schema mapping:

[source, scala]
----
def catalog = s"""{
    |"table":{"namespace":"default", "name":"Avrotable"},
    |"rowkey":"key",
    |"columns":{
      |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
      |"col1":{"cf":"cf1", "col":"col1", "type":"binary"}
    |}
  |}""".stripMargin
----

`catalog` is a schema for an HBase table named `Avrotable`, with row key as key and
one column, col1. The rowkey also has to be defined in detail as a column (col0),
which has a specific column family (rowkey).
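
The walkthrough below uses the standard Avro classes plus the connector's Avro serializer,
and it builds `AvroHBaseRecord` instances from a case class that the text does not show.
A sketch of what is assumed to be in scope follows; the `AvroSedes` package and the exact
case class shape are assumptions.

[source, scala]
----
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
// Connector helper used in step 2 to serialize an Avro record to bytes; package name assumed.
import org.apache.hadoop.hbase.spark.AvroSedes

// Assumed case class backing the records built in step 2: the row key string plus the
// serialized Avro payload that maps to the "binary" col1 column in the catalog above.
case class AvroHBaseRecord(col0: String, col1: Array[Byte])
----
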
2) Prepare the Data:

[source, scala]
----
object AvroHBaseRecord {
  val schemaString =
    s"""{"namespace": "example.avro",
        |  "type": "record", "name": "User",
        |  "fields": [
        |    {"name": "name", "type": "string"},
        |    {"name": "favorite_number", "type": ["int", "null"]},
        |    {"name": "favorite_color", "type": ["string", "null"]},
        |    {"name": "favorite_array", "type": {"type": "array", "items": "string"}},
        |    {"name": "favorite_map", "type": {"type": "map", "values": "int"}}
        |  ] }""".stripMargin

  val avroSchema: Schema = {
    val p = new Schema.Parser
    p.parse(schemaString)
  }

  def apply(i: Int): AvroHBaseRecord = {
    val user = new GenericData.Record(avroSchema)
    user.put("name", s"name${"%03d".format(i)}")
    user.put("favorite_number", i)
    user.put("favorite_color", s"color${"%03d".format(i)}")
    val favoriteArray = new GenericData.Array[String](2, avroSchema.getField("favorite_array").schema())
    favoriteArray.add(s"number${i}")
    favoriteArray.add(s"number${i+1}")
    user.put("favorite_array", favoriteArray)
    import collection.JavaConverters._
    val favoriteMap = Map[String, Int](("key1" -> i), ("key2" -> (i+1))).asJava
    user.put("favorite_map", favoriteMap)
    val avroByte = AvroSedes.serialize(user, avroSchema)
    AvroHBaseRecord(s"name${"%03d".format(i)}", avroByte)
  }
}

val data = (0 to 255).map { i =>
  AvroHBaseRecord(i)
}
----

`schemaString` is defined first, then it is parsed to get `avroSchema`. `avroSchema` is used to
generate `AvroHBaseRecord` objects. `data` prepared by the user is a local Scala collection
which has 256 `AvroHBaseRecord` objects.

3) Save the DataFrame:

[source, scala]
----
sc.parallelize(data).toDF.write.options(
  Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
  .format("org.apache.spark.sql.execution.datasources.hbase")
  .save()
----

Given a DataFrame with the specified schema `catalog`, the above will create an HBase table
with 5 regions and save the DataFrame inside.

4) Load the DataFrame:

[source, scala]
----
def avroCatalog = s"""{
    |"table":{"namespace":"default", "name":"avrotable"},
    |"rowkey":"key",
    |"columns":{
      |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
      |"col1":{"cf":"cf1", "col":"col1", "avro":"avroSchema"}
    |}
  |}""".stripMargin

def withCatalog(cat: String): DataFrame = {
  sqlContext
    .read
    .options(Map("avroSchema" -> AvroHBaseRecord.schemaString, HBaseTableCatalog.tableCatalog -> cat))
    .format("org.apache.spark.sql.execution.datasources.hbase")
    .load()
}
val df = withCatalog(avroCatalog)
----

In the `withCatalog` function, `read` returns a DataFrameReader that can be used to read data in
as a DataFrame. The `options` function adds input options for the underlying data source to the
DataFrameReader. There are two options: one sets `avroSchema` to `AvroHBaseRecord.schemaString`,
and the other sets `HBaseTableCatalog.tableCatalog` to the catalog that is passed in (`avroCatalog`
here). The `load()` function loads input in as a DataFrame. The DataFrame `df` returned by the
`withCatalog` function can be used to access the HBase table.

5) SQL Query:

[source, scala]
----
df.registerTempTable("avrotable")
val c = sqlContext.sql("select count(1) from avrotable")
----

After loading the df DataFrame, users can query the data. `registerTempTable` registers the `df`
DataFrame as a temporary table using the table name avrotable. The `sqlContext.sql` function
allows the user to execute SQL queries.
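
To materialize the result of the query above, standard Spark DataFrame calls suffice
(`c` is the DataFrame returned by `sqlContext.sql`):

[source, scala]
----
c.show()                                    // prints the single aggregate row
val rowCount = c.collect().head.getLong(0)  // or extract the count as a Long
----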
-==== \ No newline at end of file diff --git a/src/main/asciidoc/book.adoc b/src/main/asciidoc/book.adoc index 2b9bf265109..f6c6cdb0dd7 100644 --- a/src/main/asciidoc/book.adoc +++ b/src/main/asciidoc/book.adoc @@ -65,7 +65,6 @@ include::_chapters/hbase_mob.adoc[] include::_chapters/hbase_apis.adoc[] include::_chapters/external_apis.adoc[] include::_chapters/thrift_filter_language.adoc[] -include::_chapters/spark.adoc[] include::_chapters/cp.adoc[] include::_chapters/performance.adoc[] include::_chapters/troubleshooting.adoc[]