HBASE-15184 SparkSQL Scan operation doesn't work on kerberos cluster (Ted Malaska)
Parent: f47dba74d4
Commit: 00248656ee
@@ -164,7 +164,7 @@ case class HBaseRelation (val tableName:String,
       HBaseSparkConf.BULKGET_SIZE, HBaseSparkConf.defaultBulkGetSize))
 
   //create or get latest HBaseContext
-  @transient val hbaseContext:HBaseContext = if (useHBaseContext) {
+  val hbaseContext:HBaseContext = if (useHBaseContext) {
     LatestHBaseContextCache.latest
   } else {
     val config = HBaseConfiguration.create()
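
The change above removes @transient from the relation's hbaseContext, so the context is serialized together with the objects Spark ships to executors, where it is needed for the credential handling added further down. A stand-alone sketch of why a @transient field does not survive that trip (Holder and TransientDemo are hypothetical stand-ins, not code from this patch):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Stand-in class: the context field plays the role of the HBaseContext.
class Holder(@transient val context: String, val tableName: String) extends Serializable

object TransientDemo {
  private def roundTrip[T <: AnyRef](value: T): T = {
    val bytes = new ByteArrayOutputStream()
    new ObjectOutputStream(bytes).writeObject(value)
    new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray)).readObject().asInstanceOf[T]
  }

  def main(args: Array[String]): Unit = {
    val copy = roundTrip(new Holder("lives only on the driver", "t1"))
    println(copy.context)    // null: the @transient field was dropped during serialization
    println(copy.tableName)  // t1
  }
}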
@@ -270,7 +270,7 @@ case class HBaseRelation (val tableName:String,
     } else {
       None
     }
-    val hRdd = new HBaseTableScanRDD(this, pushDownFilterJava, requiredQualifierDefinitionList.seq)
+    val hRdd = new HBaseTableScanRDD(this, hbaseContext, pushDownFilterJava, requiredQualifierDefinitionList.seq)
     pushDownRowKeyFilter.points.foreach(hRdd.addPoint(_))
     pushDownRowKeyFilter.ranges.foreach(hRdd.addRange(_))
     var resultRDD: RDD[Row] = {
@@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding
 import org.apache.hadoop.hbase.io.hfile.{CacheConfig, HFileContextBuilder, HFileWriterImpl}
 import org.apache.hadoop.hbase.regionserver.{HStore, StoreFile, BloomType}
 import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.mapred.JobConf
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
@@ -228,7 +229,7 @@ class HBaseContext(@transient sc: SparkContext,
     }))
   }
 
-  def applyCreds[T] (configBroadcast: Broadcast[SerializableWritable[Configuration]]){
+  def applyCreds[T] (){
     credentials = SparkHadoopUtil.get.getCurrentUserCredentials()
 
     logDebug("appliedCredentials:" + appliedCredentials + ",credentials:" + credentials)
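
applyCreds is now parameterless; the broadcast configuration is no longer needed because the credentials are read from the executor's current user. Only the first line of the method body is visible in this hunk, so the following is just a sketch of what such a refresh can boil down to (CredentialRefreshSketch and the direct UserGroupInformation call are assumptions, not the patch's actual implementation):

import org.apache.hadoop.security.{Credentials, UserGroupInformation}
import org.apache.spark.deploy.SparkHadoopUtil

object CredentialRefreshSketch {
  // Assumed executor-side behaviour: pick up whatever delegation tokens the
  // task's current user carries and merge them into the UGI that the HBase
  // client will authenticate with, so no kerberos TGT is needed on the worker.
  def refreshCreds(): Unit = {
    val credentials: Credentials = SparkHadoopUtil.get.getCurrentUserCredentials()
    if (credentials != null) {
      UserGroupInformation.getCurrentUser.addCredentials(credentials)
    }
  }
}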
@@ -440,10 +441,14 @@ class HBaseContext(@transient sc: SparkContext,
     TableMapReduceUtil.initTableMapperJob(tableName, scan,
       classOf[IdentityTableMapper], null, null, job)
 
-    sc.newAPIHadoopRDD(job.getConfiguration,
+    val jconf = new JobConf(job.getConfiguration)
+    SparkHadoopUtil.get.addCredentials(jconf)
+    new NewHBaseRDD(sc,
       classOf[TableInputFormat],
       classOf[ImmutableBytesWritable],
-      classOf[Result]).map(f)
+      classOf[Result],
+      job.getConfiguration,
+      this).map(f)
   }
 
   /**
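
The hunk above is the hbaseRDD scan path: instead of going straight to sc.newAPIHadoopRDD, the driver now adds its credentials to a JobConf and builds a NewHBaseRDD that re-applies them on each executor task. A hypothetical driver program of the shape this fixes (table name "t1" and the caching value are placeholders):

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}

object ScanExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HBaseScanExample"))
    val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())

    val scan = new Scan()
    scan.setCaching(100)

    // Returns an RDD[(ImmutableBytesWritable, Result)]; with this patch the
    // underlying NewHBaseRDD refreshes credentials before each partition is read.
    val rdd = hbaseContext.hbaseRDD(TableName.valueOf("t1"), scan)
    println("rows scanned: " + rdd.count())

    sc.stop()
  }
}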
@@ -474,7 +479,7 @@ class HBaseContext(@transient sc: SparkContext,
 
     val config = getConf(configBroadcast)
 
-    applyCreds(configBroadcast)
+    applyCreds
     // specify that this is a proxy user
     val connection = ConnectionFactory.createConnection(config)
     f(it, connection)
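
This hunk sits in the per-partition helper behind HBaseContext.foreachPartition, so callers of that API get the executor-side credential refresh for free. A hypothetical caller (table "t1", column family "cf" and the row keys are placeholders):

import org.apache.hadoop.hbase.client.{Connection, Put}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}

object ForeachPartitionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HBaseForeachPartition"))
    val hbaseContext = new HBaseContext(sc, HBaseConfiguration.create())

    val rdd = sc.parallelize(Seq("row1", "row2", "row3"))
    hbaseContext.foreachPartition(rdd, (it: Iterator[String], connection: Connection) => {
      // The Connection is created after applyCreds runs in the helper, so it can
      // authenticate with the delegation tokens shipped from the driver.
      val table = connection.getTable(TableName.valueOf("t1"))
      it.foreach { rowKey =>
        val put = new Put(Bytes.toBytes(rowKey))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes(rowKey))
        table.put(put)
      }
      table.close()
    })

    sc.stop()
  }
}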
@@ -514,7 +519,7 @@ class HBaseContext(@transient sc: SparkContext,
                                     Iterator[U]): Iterator[U] = {
 
     val config = getConf(configBroadcast)
-    applyCreds(configBroadcast)
+    applyCreds
 
     val connection = ConnectionFactory.createConnection(config)
     val res = mp(it, connection)
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce.InputFormat
+import org.apache.spark.rdd.NewHadoopRDD
+import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
+
+class NewHBaseRDD[K,V](@transient sc : SparkContext,
+                       @transient inputFormatClass: Class[_ <: InputFormat[K, V]],
+                       @transient keyClass: Class[K],
+                       @transient valueClass: Class[V],
+                       @transient conf: Configuration,
+                       val hBaseContext: HBaseContext) extends NewHadoopRDD(sc,inputFormatClass, keyClass, valueClass, conf) {
+
+  override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
+    hBaseContext.applyCreds()
+    super.compute(theSplit, context)
+  }
+}
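
NewHBaseRDD exists solely to run hBaseContext.applyCreds() on the executor before the inherited NewHadoopRDD logic opens the input format for a partition. The same idea, expressed against Spark's public RDD API with a hypothetical wrapper class (a sketch, not part of the patch):

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}

// Hypothetical wrapper: run `setup` on the executor, then delegate to the parent.
class SetupFirstRDD[T: ClassTag](prev: RDD[T], setup: () => Unit)
  extends RDD[T](prev) {

  override protected def getPartitions: Array[Partition] = prev.partitions

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    setup()                        // e.g. hbaseContext.applyCreds()
    prev.iterator(split, context)  // the parent does the actual reading
  }
}

Subclassing NewHadoopRDD directly, as the patch does, achieves the same effect without adding an extra level to the RDD lineage.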
@@ -20,7 +20,7 @@ package org.apache.hadoop.hbase.spark.datasources
 import java.util.ArrayList
 
 import org.apache.hadoop.hbase.client._
-import org.apache.hadoop.hbase.spark.{ScanRange, SchemaQualifierDefinition, HBaseRelation, SparkSQLPushDownFilter}
+import org.apache.hadoop.hbase.spark._
 import org.apache.hadoop.hbase.spark.hbase._
 import org.apache.hadoop.hbase.spark.datasources.HBaseResources._
 import org.apache.spark.{SparkEnv, TaskContext, Logging, Partition}
@@ -28,10 +28,10 @@ import org.apache.spark.rdd.RDD
 
 import scala.collection.mutable
 
-
 class HBaseTableScanRDD(relation: HBaseRelation,
+                       val hbaseContext: HBaseContext,
                        @transient val filter: Option[SparkSQLPushDownFilter] = None,
                        val columns: Seq[SchemaQualifierDefinition] = Seq.empty
      )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging {
   private def sparkConf = SparkEnv.get.conf
   @transient var ranges = Seq.empty[Range]
@@ -98,7 +98,8 @@ class HBaseTableScanRDD(relation: HBaseRelation,
       tbr: TableResource,
       g: Seq[Array[Byte]],
       filter: Option[SparkSQLPushDownFilter],
-      columns: Seq[SchemaQualifierDefinition]): Iterator[Result] = {
+      columns: Seq[SchemaQualifierDefinition],
+      hbaseContext: HBaseContext): Iterator[Result] = {
     g.grouped(relation.bulkGetSize).flatMap{ x =>
       val gets = new ArrayList[Get]()
       x.foreach{ y =>
@@ -111,6 +112,7 @@ class HBaseTableScanRDD(relation: HBaseRelation,
         filter.foreach(g.setFilter(_))
         gets.add(g)
       }
+      hbaseContext.applyCreds()
       val tmp = tbr.get(gets)
       rddResources.addResource(tmp)
       toResultIterator(tmp)
@@ -208,11 +210,12 @@ class HBaseTableScanRDD(relation: HBaseRelation,
       if (points.isEmpty) {
         Iterator.empty: Iterator[Result]
       } else {
-        buildGets(tableResource, points, filter, columns)
+        buildGets(tableResource, points, filter, columns, hbaseContext)
       }
     }
     val rIts = scans.par
       .map { scan =>
+      hbaseContext.applyCreds()
       val scanner = tableResource.getScanner(scan)
       rddResources.addResource(scanner)
       scanner
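
These scan and bulk-get paths are what the SparkSQL data source ultimately drives, which is where the Kerberos failure in the JIRA title showed up. A hypothetical DataFrame read that exercises HBaseTableScanRDD; the format class org.apache.hadoop.hbase.spark comes from this module, but the option keys and the columns-mapping string are assumptions about the DefaultSource of this era and should be checked against DefaultSource:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object SqlScanExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HBaseSQLScan"))
    val sqlContext = new SQLContext(sc)

    // Assumed option keys: "hbase.table" and "hbase.columns.mapping".
    val df = sqlContext.read
      .format("org.apache.hadoop.hbase.spark")
      .option("hbase.table", "t1")
      .option("hbase.columns.mapping",
        "KEY_FIELD STRING :key, A_FIELD STRING cf:a, B_FIELD STRING cf:b")
      .load()

    df.registerTempTable("hbaseTable")
    sqlContext.sql("SELECT KEY_FIELD, A_FIELD FROM hbaseTable").show()

    sc.stop()
  }
}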