1
0
mirror of https://github.com/apache/nifi.git synced 2025-03-02 07:29:13 +00:00

NIFI-4833 Add ScanHBase Processor

- New processor for scanning HBase records based on various params like range of rowkeys, range of timestamps. Supports result limit and reverse scan.
- Adds Atlas Support for ScanHBase processor
- Fixed not recent version of FF
- Formatting and Style changes
- Single line to multiline if-then statements
- Removed HTML formatting that is not used for doc generation
- Fixed issue with limitRows
- Fixed issue with filter expression
- Refactored "processRows"
- Fixed possible NPE for bulkSize var
- Changed provenance to "receive" to indicate new data from external source.
- Updated min/max timestamp custom validation
- JSON array support
- Removed in-memory caching for records. Now records are being written directly to FF
- Removed unfinished flowfile from session, transferred original to "failure". Test cases updated as well

This closes the associated pull request (reference number lost in page capture).

Signed-off-by: Bryan Bende <bbende@apache.org>
This commit is contained in:
Ed 2018-02-17 16:26:04 -05:00 committed by Bryan Bende
parent 6471f66acd
commit c2616e6fe7
No known key found for this signature in database
GPG Key ID: A0DDA9ED50711C39
8 changed files with 1127 additions and 0 deletions
nifi-nar-bundles
nifi-atlas-bundle/nifi-atlas-reporting-task/src/main/resources/docs/org.apache.nifi.atlas.reporting.ReportLineageToAtlas
nifi-hbase-bundle/nifi-hbase-processors/src
main
java/org/apache/nifi/hbase
resources/META-INF/services
test/java/org/apache/nifi/hbase
nifi-standard-services
nifi-hbase-client-service-api/src/main/java/org/apache/nifi/hbase
nifi-hbase_1_1_2-client-service-bundle/nifi-hbase_1_1_2-client-service/src
main/java/org/apache/nifi/hbase
test/java/org/apache/nifi/hbase

@ -431,6 +431,7 @@ Processor 3</pre>
PutHBaseCell<br/>
PutHBaseJSON<br/>
PutHBaseRecord<br/>
ScanHBase<br/>
</td>
<td>
FETCH<br/>
@ -438,6 +439,7 @@ Processor 3</pre>
SEND<br/>
SEND<br/>
SEND<br/>
RECEIVE<br/>
</td>
<td>hbase://hmaster.example.com:16000/tableA/rowX</td>
<td>hbase_table</td>

@ -0,0 +1,579 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.hbase;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.hbase.io.JsonFullRowSerializer;
import org.apache.nifi.hbase.io.JsonQualifierAndValueRowSerializer;
import org.apache.nifi.hbase.io.RowSerializer;
import org.apache.nifi.hbase.scan.Column;
import org.apache.nifi.hbase.scan.ResultCell;
import org.apache.nifi.hbase.scan.ResultHandler;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({"hbase", "scan", "fetch", "get"})
@CapabilityDescription("Scans and fetches rows from an HBase table. This processor may be used to fetch rows from hbase table by specifying a range of rowkey values (start and/or end), "
        + "by time range, by filter expression, or any combination of them. "
        + "Order of records can be controlled by a property Reversed. "
        + "Number of rows retrieved by the processor can be limited.")
@WritesAttributes({
    @WritesAttribute(attribute = "hbase.table", description = "The name of the HBase table that the row was fetched from"),
    @WritesAttribute(attribute = "mime.type", description = "Set to application/json when using a Destination of flowfile-content, not set or modified otherwise"),
    @WritesAttribute(attribute = "hbase.rows.count", description = "Number of rows in the content of given flow file"),
    @WritesAttribute(attribute = "scanhbase.results.found", description = "Indicates whether at least one row has been found in given hbase table with provided conditions. "
            + "Could be null (not present) if transfered to FAILURE")
})
public class ScanHBase extends AbstractProcessor {

    // Enhanced regex for columns to allow "-" in column qualifier names.
    static final Pattern COLUMNS_PATTERN = Pattern.compile("\\w+(:(\\w|-)+)?(?:,\\w+(:(\\w|-)+)?)*");

    // NOTE(review): currently unused within this class; kept for interface stability.
    static final String nl = System.lineSeparator();

    static final PropertyDescriptor HBASE_CLIENT_SERVICE = new PropertyDescriptor.Builder()
            .displayName("HBase Client Service")
            .name("scanhbase-client-service")
            .description("Specifies the Controller Service to use for accessing HBase.")
            .required(true)
            .identifiesControllerService(HBaseClientService.class)
            .build();

    static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
            .displayName("Table Name")
            .name("scanhbase-table-name")
            .description("The name of the HBase Table to fetch from.")
            .required(true)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    static final PropertyDescriptor START_ROW = new PropertyDescriptor.Builder()
            .displayName("Start rowkey")
            .name("scanhbase-start-rowkey")
            .description("The rowkey to start scan from.")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    static final PropertyDescriptor END_ROW = new PropertyDescriptor.Builder()
            .displayName("End rowkey")
            .name("scanhbase-end-rowkey")
            .description("The row key to end scan by.")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    static final PropertyDescriptor TIME_RANGE_MIN = new PropertyDescriptor.Builder()
            .displayName("Time range min")
            .name("scanhbase-time-range-min")
            .description("Time range min value. Both min and max values for time range should be either blank or provided.")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.LONG_VALIDATOR)
            .build();

    static final PropertyDescriptor TIME_RANGE_MAX = new PropertyDescriptor.Builder()
            .displayName("Time range max")
            .name("scanhbase-time-range-max")
            .description("Time range max value. Both min and max values for time range should be either blank or provided.")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.LONG_VALIDATOR)
            .build();

    static final PropertyDescriptor LIMIT_ROWS = new PropertyDescriptor.Builder()
            .displayName("Limit rows")
            .name("scanhbase-limit")
            .description("Limit number of rows retrieved by scan.")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.INTEGER_VALIDATOR)
            .build();

    static final PropertyDescriptor BULK_SIZE = new PropertyDescriptor.Builder()
            .displayName("Max rows per flow file")
            .name("scanhbase-bulk-size")
            .description("Limits number of rows in single flow file content. Set to 0 to avoid multiple flow files.")
            .required(false)
            .expressionLanguageSupported(true)
            .defaultValue("0")
            .addValidator(StandardValidators.INTEGER_VALIDATOR)
            .build();

    static final PropertyDescriptor REVERSED_SCAN = new PropertyDescriptor.Builder()
            .displayName("Reversed order")
            .name("scanhbase-reversed-order")
            .description("Set whether this scan is a reversed one. This is false by default which means forward(normal) scan.")
            .expressionLanguageSupported(false)
            .allowableValues("true", "false")
            .required(false)
            .defaultValue("false")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .build();

    static final PropertyDescriptor FILTER_EXPRESSION = new PropertyDescriptor.Builder()
            .displayName("Filter expression")
            .name("scanhbase-filter-expression")
            .description("An HBase filter expression that will be applied to the scan. This property can not be used when also using the Columns property. "
                    + "Example: \"ValueFilter( =, 'binaryprefix:commit' )\"")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    static final PropertyDescriptor COLUMNS = new PropertyDescriptor.Builder()
            .displayName("Columns")
            .name("scanhbase-columns")
            .description("An optional comma-separated list of \"<colFamily>:<colQualifier>\" pairs to fetch. To return all columns " +
                    "for a given family, leave off the qualifier such as \"<colFamily1>,<colFamily2>\".")
            .required(false)
            .expressionLanguageSupported(true)
            .addValidator(StandardValidators.createRegexMatchingValidator(COLUMNS_PATTERN))
            .build();

    static final AllowableValue JSON_FORMAT_FULL_ROW = new AllowableValue("full-row", "full-row",
            "Creates a JSON document with the format: {\"row\":<row-id>, \"cells\":[{\"fam\":<col-fam>, \"qual\":<col-val>, \"val\":<value>, \"ts\":<timestamp>}]}.");
    static final AllowableValue JSON_FORMAT_QUALIFIER_AND_VALUE = new AllowableValue("col-qual-and-val", "col-qual-and-val",
            "Creates a JSON document with the format: {\"<col-qual>\":\"<value>\", \"<col-qual>\":\"<value>\".");

    static final PropertyDescriptor JSON_FORMAT = new PropertyDescriptor.Builder()
            .displayName("JSON Format")
            .name("scanhbase-json-format")
            .description("Specifies how to represent the HBase row as a JSON document.")
            .required(true)
            .allowableValues(JSON_FORMAT_FULL_ROW, JSON_FORMAT_QUALIFIER_AND_VALUE)
            .defaultValue(JSON_FORMAT_FULL_ROW.getValue())
            .build();

    static final PropertyDescriptor DECODE_CHARSET = new PropertyDescriptor.Builder()
            .displayName("Decode Character Set")
            .name("scanhbase-decode-charset")
            .description("The character set used to decode data from HBase.")
            .required(true)
            .defaultValue("UTF-8")
            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
            .build();

    static final PropertyDescriptor ENCODE_CHARSET = new PropertyDescriptor.Builder()
            .displayName("Encode Character Set")
            .name("scanhbase-encode-charset")
            .description("The character set used to encode the JSON representation of the row.")
            .required(true)
            .defaultValue("UTF-8")
            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
            .build();

    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
            .name("original")
            .description("The original input file will be routed to this destination, even if no rows are retrieved based on provided conditions.")
            .build();

    static final Relationship REL_SUCCESS = new Relationship.Builder()
            .name("success")
            .description("All successful fetches are routed to this relationship.")
            .build();

    static final Relationship REL_FAILURE = new Relationship.Builder()
            .name("failure")
            .description("All failed fetches are routed to this relationship.")
            .build();

    static final String HBASE_TABLE_ATTR = "hbase.table";
    static final String HBASE_ROWS_COUNT_ATTR = "hbase.rows.count";

    static final List<PropertyDescriptor> properties;
    static {
        List<PropertyDescriptor> props = new ArrayList<>();
        props.add(HBASE_CLIENT_SERVICE);
        props.add(TABLE_NAME);
        props.add(START_ROW);
        props.add(END_ROW);
        props.add(TIME_RANGE_MIN);
        props.add(TIME_RANGE_MAX);
        props.add(LIMIT_ROWS);
        props.add(REVERSED_SCAN);
        props.add(BULK_SIZE);
        props.add(FILTER_EXPRESSION);
        props.add(COLUMNS);
        props.add(JSON_FORMAT);
        props.add(ENCODE_CHARSET);
        props.add(DECODE_CHARSET);
        properties = Collections.unmodifiableList(props);
    }

    static final Set<Relationship> relationships;
    static {
        Set<Relationship> rels = new HashSet<>();
        rels.add(REL_SUCCESS);
        rels.add(REL_ORIGINAL);
        rels.add(REL_FAILURE);
        relationships = Collections.unmodifiableSet(rels);
    }

    // Charsets and serializer are resolved once per schedule; volatile because
    // onTrigger may run on a different thread than onScheduled.
    private volatile Charset decodeCharset;
    private volatile Charset encodeCharset;
    private RowSerializer serializer = null;

    /**
     * Resolves the configured charsets and selects the row-to-JSON serializer
     * based on the JSON Format property.
     */
    @OnScheduled
    public void onScheduled(ProcessContext context) {
        this.decodeCharset = Charset.forName(context.getProperty(DECODE_CHARSET).getValue());
        this.encodeCharset = Charset.forName(context.getProperty(ENCODE_CHARSET).getValue());
        final String jsonFormat = context.getProperty(JSON_FORMAT).getValue();
        if (jsonFormat.equals(JSON_FORMAT_FULL_ROW.getValue())) {
            this.serializer = new JsonFullRowSerializer(decodeCharset, encodeCharset);
        } else {
            this.serializer = new JsonQualifierAndValueRowSerializer(decodeCharset, encodeCharset);
        }
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return properties;
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    /**
     * Cross-property validation: the filter expression is mutually exclusive with
     * Columns, and the time-range min/max must be provided together or not at all.
     */
    @Override
    protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
        final List<ValidationResult> problems = new ArrayList<>();
        final String columns = validationContext.getProperty(COLUMNS).getValue();
        final String filter = validationContext.getProperty(FILTER_EXPRESSION).getValue();
        if (!StringUtils.isBlank(columns) && !StringUtils.isBlank(filter)) {
            problems.add(new ValidationResult.Builder()
                    .subject(FILTER_EXPRESSION.getDisplayName())
                    .input(filter).valid(false)
                    .explanation("A filter expression can not be used in conjunction with the Columns property")
                    .build());
        }
        String minTS = validationContext.getProperty(TIME_RANGE_MIN).getValue();
        String maxTS = validationContext.getProperty(TIME_RANGE_MAX).getValue();
        if ( (!StringUtils.isBlank(minTS) && StringUtils.isBlank(maxTS)) || (StringUtils.isBlank(minTS) && !StringUtils.isBlank(maxTS))){
            problems.add(new ValidationResult.Builder()
                    .subject(TIME_RANGE_MAX.getDisplayName())
                    .input(maxTS).valid(false)
                    .explanation(String.format("%s and %s both should be either empty or provided", TIME_RANGE_MIN, TIME_RANGE_MAX))
                    .build());
        }
        return problems;
    }

    /**
     * Evaluates the scan parameters from the incoming flow file's attributes,
     * runs the HBase scan, and routes the original flow file to "original"
     * (annotated with scanhbase.results.found) or "failure". Result rows are
     * emitted as new flow files by the ScanHBaseResultHandler.
     */
    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }
        try{
            final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
            if (StringUtils.isBlank(tableName)) {
                getLogger().error("Table Name is blank or null for {}, transferring to failure", new Object[] {flowFile});
                session.transfer(session.penalize(flowFile), REL_FAILURE);
                return;
            }
            final String startRow = context.getProperty(START_ROW).evaluateAttributeExpressions(flowFile).getValue();
            final String endRow = context.getProperty(END_ROW).evaluateAttributeExpressions(flowFile).getValue();
            final String filterExpression = context.getProperty(FILTER_EXPRESSION).evaluateAttributeExpressions(flowFile).getValue();
            //evaluate and validate time range min and max values. They both should be either empty or provided.
            Long timerangeMin = null;
            Long timerangeMax = null;
            try{
                timerangeMin = context.getProperty(TIME_RANGE_MIN).evaluateAttributeExpressions(flowFile).asLong();
            }catch(Exception e){
                getLogger().error("Time range min value is not a number ({}) for {}, transferring to failure",
                        new Object[] {context.getProperty(TIME_RANGE_MIN).evaluateAttributeExpressions(flowFile).getValue(), flowFile});
                session.transfer(session.penalize(flowFile), REL_FAILURE);
                return;
            }
            try{
                timerangeMax = context.getProperty(TIME_RANGE_MAX).evaluateAttributeExpressions(flowFile).asLong();
            }catch(Exception e){
                getLogger().error("Time range max value is not a number ({}) for {}, transferring to failure",
                        new Object[] {context.getProperty(TIME_RANGE_MAX).evaluateAttributeExpressions(flowFile).getValue(), flowFile});
                session.transfer(session.penalize(flowFile), REL_FAILURE);
                return;
            }
            if (timerangeMin == null && timerangeMax != null) {
                getLogger().error("Time range min value cannot be blank when max value provided for {}, transferring to failure", new Object[] {flowFile});
                session.transfer(session.penalize(flowFile), REL_FAILURE);
                return;
            }else if (timerangeMin != null && timerangeMax == null) {
                getLogger().error("Time range max value cannot be blank when min value provided for {}, transferring to failure", new Object[] {flowFile});
                session.transfer(session.penalize(flowFile), REL_FAILURE);
                return;
            }
            final Integer limitRows = context.getProperty(LIMIT_ROWS).evaluateAttributeExpressions(flowFile).asInteger();
            final Boolean isReversed = context.getProperty(REVERSED_SCAN).asBoolean();
            final Integer bulkSize = context.getProperty(BULK_SIZE).evaluateAttributeExpressions(flowFile).asInteger();
            final List<Column> columns = getColumns(context.getProperty(COLUMNS).evaluateAttributeExpressions(flowFile).getValue());
            final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
            final AtomicReference<Long> rowsPulledHolder = new AtomicReference<>(0L);
            final AtomicReference<Long> ffCountHolder = new AtomicReference<>(0L);
            ScanHBaseResultHandler handler = new ScanHBaseResultHandler(context, session, flowFile, rowsPulledHolder, ffCountHolder, hBaseClientService, tableName, bulkSize);
            try {
                hBaseClientService.scan(tableName,
                        startRow, endRow,
                        filterExpression,
                        timerangeMin, timerangeMax,
                        limitRows,
                        isReversed,
                        columns,
                        handler);
            } catch (Exception e) {
                // discard any partially-written result flow file before routing the original to failure
                if (handler.getFlowFile() != null){
                    session.remove(handler.getFlowFile());
                }
                getLogger().error("Unable to fetch rows from HBase table {} due to {}", new Object[] {tableName, e});
                flowFile = session.putAttribute(flowFile, "scanhbase.results.found", Boolean.toString(handler.isHandledAny()));
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
            flowFile = session.putAttribute(flowFile, "scanhbase.results.found", Boolean.toString(handler.isHandledAny()));
            // close out the trailing (not yet bulk-size-full) result flow file, if any
            FlowFile openedFF = handler.getFlowFile();
            if (openedFF != null) {
                finalizeFlowFile(session, hBaseClientService, openedFF, tableName, handler.getRecordsCount(), null);
            }
            session.transfer(flowFile, REL_ORIGINAL);
            session.commit();
        }catch (final Exception e) {
            getLogger().error("Failed to receive data from HBase due to {}", e);
            session.rollback();
            // if we failed, we want to yield so that we don't hammer hbase.
            context.yield();
        }
    }

    /*
     * Initiates FF content, adds relevant attributes, and starts content with JSON array "["
     */
    private FlowFile initNewFlowFile(final ProcessSession session, final FlowFile origFF, final String tableName) throws IOException{
        FlowFile flowFile = session.create(origFF);
        flowFile = session.putAttribute(flowFile, HBASE_TABLE_ATTR, tableName);
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/json");
        final AtomicReference<IOException> ioe = new AtomicReference<>(null);
        flowFile = session.write(flowFile, (out) -> {
            try{
                out.write("[".getBytes());
            }catch(IOException e){
                ioe.set(e);
            }
        });
        if (ioe.get() != null){
            throw ioe.get();
        }
        return flowFile;
    }

    /**
     * Closes the JSON array in the flow file content, records the row count
     * attribute, and routes to success (with a RECEIVE provenance event) or,
     * when a failure was recorded, to the failure relationship.
     */
    private void finalizeFlowFile(final ProcessSession session, final HBaseClientService hBaseClientService,
            FlowFile flowFile, final String tableName, Long rowsPulled, Exception e) {
        Relationship rel = REL_SUCCESS;
        flowFile = session.putAttribute(flowFile, HBASE_ROWS_COUNT_ATTR, rowsPulled.toString());
        final AtomicReference<IOException> ioe = new AtomicReference<>(null);
        flowFile = session.append(flowFile, (out) -> {
            try{
                out.write("]".getBytes());
            }catch(IOException ei){
                ioe.set(ei);
            }
        });
        if (e != null || ioe.get() != null) {
            // BUGFIX: previous ternary was inverted ((e==null?e:ioe.get())) and
            // dereferenced null whenever only the append IOException was set.
            final Exception cause = (e != null) ? e : ioe.get();
            flowFile = session.putAttribute(flowFile, "scanhbase.error", cause.toString());
            rel = REL_FAILURE;
        } else {
            session.getProvenanceReporter().receive(flowFile, hBaseClientService.toTransitUri(tableName, "{ids}"));
        }
        session.transfer(flowFile, rel);
    }

    /**
     * @param columnsValue a String in the form colFam:colQual,colFam:colQual
     * @return a list of Columns based on parsing the given String
     */
    private List<Column> getColumns(final String columnsValue) {
        final String[] columns = (columnsValue == null || columnsValue.isEmpty() ? new String[0] : columnsValue.split(","));
        List<Column> columnsList = new ArrayList<>(columns.length);
        for (final String column : columns) {
            if (column.contains(":")) {
                final String[] parts = column.split(":");
                final byte[] cf = parts[0].getBytes(StandardCharsets.UTF_8);
                final byte[] cq = parts[1].getBytes(StandardCharsets.UTF_8);
                columnsList.add(new Column(cf, cq));
            } else {
                // family-only spec: fetch every qualifier in the family
                final byte[] cf = column.getBytes(StandardCharsets.UTF_8);
                columnsList.add(new Column(cf, null));
            }
        }
        return columnsList;
    }

    /**
     * @return number of rows to be committed to session.
     */
    protected int getBatchSize(){
        return 500;
    }

    /**
     * Result Handler for Scan operation. Streams each row into the current
     * result flow file, rolling over to a new flow file every bulkSize rows
     * and committing the session every getBatchSize() rows to bound memory use.
     */
    private class ScanHBaseResultHandler implements ResultHandler {

        final private ProcessSession session;
        final private FlowFile origFF;
        final private AtomicReference<Long> rowsPulledHolder;
        final private AtomicReference<Long> ffCountHolder;
        final private HBaseClientService hBaseClientService;
        final private String tableName;
        final private Integer bulkSize;
        // the result flow file currently being appended to; null until first row
        private FlowFile flowFile = null;
        private final byte[] JSON_ARRAY_DELIM = ",\n".getBytes();
        private boolean handledAny = false;

        ScanHBaseResultHandler(final ProcessContext context, final ProcessSession session,
                final FlowFile origFF, final AtomicReference<Long> rowsPulledHolder, final AtomicReference<Long> ffCountHolder,
                final HBaseClientService hBaseClientService, final String tableName, final Integer bulkSize){
            this.session = session;
            this.rowsPulledHolder = rowsPulledHolder;
            this.ffCountHolder = ffCountHolder;
            this.hBaseClientService = hBaseClientService;
            this.tableName = tableName;
            this.bulkSize = bulkSize == null ? 0 : bulkSize;
            this.origFF = origFF;
        }

        @Override
        public void handle(final byte[] rowKey, final ResultCell[] resultCells) {
            long rowsPulled = rowsPulledHolder.get();
            long ffUncommittedCount = ffCountHolder.get();
            try{
                if (flowFile == null){
                    flowFile = initNewFlowFile(session, origFF, tableName);
                    ffUncommittedCount++;
                }
                flowFile = session.append(flowFile, (out) -> {
                    // rows after the first need a JSON array separator
                    if (rowsPulledHolder.get() > 0){
                        out.write(JSON_ARRAY_DELIM);
                    }
                    serializer.serialize(rowKey, resultCells, out);
                });
                handledAny = true;
            }catch(Exception e){
                throw new RuntimeException(e);
            }
            rowsPulled++;
            // bulkSize controls number of records per flow file.
            if (bulkSize>0 && rowsPulled >= bulkSize) {
                finalizeFlowFile(session, hBaseClientService, flowFile, tableName, rowsPulled, null);
                flowFile = null;
                rowsPulledHolder.set(0L);
                // we could potentially have a huge number of rows. If we get to batchSize, go ahead and commit the
                // session so that we can avoid buffering tons of FlowFiles without ever sending any out.
                if (getBatchSize()>0 && ffUncommittedCount*bulkSize > getBatchSize()) {
                    session.commit();
                    ffCountHolder.set(0L);
                }else{
                    // BUGFIX: was ffCountHolder.set(ffUncommittedCount++) whose post-increment was dead code
                    ffCountHolder.set(ffUncommittedCount);
                }
            }else{
                rowsPulledHolder.set(rowsPulled);
            }
        }

        public boolean isHandledAny(){
            return handledAny;
        }

        public FlowFile getFlowFile(){
            return flowFile;
        }

        public long getRecordsCount(){
            return rowsPulledHolder.get();
        }
    }
}

@ -19,3 +19,4 @@ org.apache.nifi.hbase.PutHBaseCell
org.apache.nifi.hbase.PutHBaseJSON
org.apache.nifi.hbase.PutHBaseRecord
org.apache.nifi.hbase.FetchHBaseRow
org.apache.nifi.hbase.ScanHBase

@ -41,6 +41,8 @@ public class MockHBaseClientService extends AbstractControllerService implements
private boolean throwExceptionDuringBatchDelete = false;
private int numScans = 0;
private int numPuts = 0;
private int linesBeforeException = -1;
@Override
public void put(String tableName, Collection<PutFlowFile> puts) throws IOException {
if (throwException) {
@ -153,6 +155,28 @@ public class MockHBaseClientService extends AbstractControllerService implements
numScans++;
}
@Override
public void scan(String tableName, String startRow, String endRow, String filterExpression, Long timerangeMin,
        Long timerangeMax, Integer limitRows, Boolean isReversed, Collection<Column> columns, ResultHandler handler)
        throws IOException {
    // simulate a hard failure before any row is delivered
    if (throwException) {
        throw new IOException("exception");
    }
    // replay the staged rows to the handler, optionally failing once
    // linesBeforeException rows have been handled (disabled when negative)
    int handledRows = 0;
    for (final Map.Entry<String, ResultCell[]> stagedRow : results.entrySet()) {
        if (linesBeforeException >= 0 && handledRows >= linesBeforeException) {
            throw new IOException("iterating exception");
        }
        handledRows++;
        handler.handle(stagedRow.getKey().getBytes(StandardCharsets.UTF_8), stagedRow.getValue());
    }
    numScans++;
}
public void addResult(final String rowKey, final Map<String, String> cells, final long timestamp) {
final byte[] rowArray = rowKey.getBytes(StandardCharsets.UTF_8);
@ -258,4 +282,12 @@ public class MockHBaseClientService extends AbstractControllerService implements
public void setThrowExceptionDuringBatchDelete(boolean throwExceptionDuringBatchDelete) {
this.throwExceptionDuringBatchDelete = throwExceptionDuringBatchDelete;
}
/** @return number of rows scan() delivers before throwing; negative disables the simulated failure */
public int getLinesBeforeException() {
    return linesBeforeException;
}
/** Configures how many rows scan() delivers before throwing; pass a negative value to disable the simulated failure. */
public void setLinesBeforeException(int linesBeforeException) {
    this.linesBeforeException = linesBeforeException;
}
}

@ -0,0 +1,403 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.hbase;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class TestScanHBase {
private ScanHBase proc;
private MockHBaseClientService hBaseClientService;
private TestRunner runner;
@Before
public void setup() throws InitializationException {
    // build the processor under test and its runner, then wire in the mock
    // HBase client service (must be added and enabled before referencing it)
    proc = new ScanHBase();
    runner = TestRunners.newTestRunner(proc);
    hBaseClientService = new MockHBaseClientService();
    runner.addControllerService("hbaseClient", hBaseClientService);
    runner.enableControllerService(hBaseClientService);
    runner.setProperty(ScanHBase.HBASE_CLIENT_SERVICE, "hbaseClient");
}
@Test
public void testColumnsValidation() {
    // a minimal valid configuration before exercising the Columns property
    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");
    runner.assertValid();

    // column specs that must satisfy the COLUMNS_PATTERN regex
    final String[] acceptedSpecs = {
            "cf1:cq1",
            "cf1",
            "cf1:cq1,cf2:cq2,cf3:cq3",
            "cf1,cf2:cq1,cf3"
    };
    for (final String spec : acceptedSpecs) {
        runner.setProperty(ScanHBase.COLUMNS, spec);
        runner.assertValid();
    }

    // malformed specs (spaces, empty qualifier, trailing comma) must be rejected
    final String[] rejectedSpecs = {
            "cf1 cf2,cf3",
            "cf1:,cf2,cf3",
            "cf1:cq1,"
    };
    for (final String spec : rejectedSpecs) {
        runner.setProperty(ScanHBase.COLUMNS, spec);
        runner.assertNotValid();
    }
}
@Test
public void testNoIncomingFlowFile() {
    // fully configured, but no flow file is enqueued
    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");

    runner.run();

    // without input the processor must not touch HBase or emit anything
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 0);
    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    Assert.assertEquals(0, hBaseClientService.getNumScans());
}
@Test
public void testInvalidTableName() {
    // table name EL resolves to blank because the attribute is never set
    runner.setProperty(ScanHBase.TABLE_NAME, "${hbase.table}");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");

    runner.enqueue("trigger flow file");
    runner.run();

    // blank table name routes the trigger to failure before any scan happens
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 0);
    runner.assertTransferCount(ScanHBase.REL_FAILURE, 1);
    Assert.assertEquals(0, hBaseClientService.getNumScans());
}
@Test
public void testResultsNotFound() {
    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");

    runner.enqueue("trigger flow file");
    runner.run();

    // no rows staged in the mock: only the original comes out, flagged "not found"
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 0);
    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);

    final MockFlowFile original = runner.getFlowFilesForRelationship(ScanHBase.REL_ORIGINAL).get(0);
    original.assertAttributeEquals("scanhbase.results.found", Boolean.FALSE.toString());
    Assert.assertEquals(1, hBaseClientService.getNumScans());
}
@Test
public void testScanToContentWithStringValues() {
    // stage two rows that share the same two cells and timestamp
    final long timestamp = 123456789;
    final Map<String, String> cells = new HashMap<>();
    cells.put("cq1", "val1");
    cells.put("cq2", "val2");
    hBaseClientService.addResult("row1", cells, timestamp);
    hBaseClientService.addResult("row2", cells, timestamp);

    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row2");
    runner.setProperty(ScanHBase.TIME_RANGE_MIN, "0");
    runner.setProperty(ScanHBase.TIME_RANGE_MAX, "1111111110");
    runner.setProperty(ScanHBase.LIMIT_ROWS, "10");
    runner.setProperty(ScanHBase.REVERSED_SCAN, "false");
    runner.setProperty(ScanHBase.BULK_SIZE, "10");

    runner.enqueue("trigger flow file");
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 1);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    // expected content is a JSON array of two full-row documents
    final String row1Json = "{\"row\":\"row1\", \"cells\": [" +
            "{\"fam\":\"nifi\",\"qual\":\"cq1\",\"val\":\"val1\",\"ts\":" + timestamp + "}, " +
            "{\"fam\":\"nifi\",\"qual\":\"cq2\",\"val\":\"val2\",\"ts\":" + timestamp + "}]}";
    final String row2Json = row1Json.replace("row1", "row2");

    MockFlowFile flowFile = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS).get(0);
    flowFile.assertContentEquals("[" + row1Json + ",\n" + row2Json + "]");
    flowFile.assertAttributeEquals(ScanHBase.HBASE_ROWS_COUNT_ATTR, "2");

    flowFile = runner.getFlowFilesForRelationship(ScanHBase.REL_ORIGINAL).get(0);
    flowFile.assertAttributeEquals("scanhbase.results.found", Boolean.TRUE.toString());
    Assert.assertEquals(1, hBaseClientService.getNumScans());
}
@Test
public void testScanBulkSize(){
    // stage 15 rows so a bulk size of 10 forces a split across two flow files
    final Map<String, String> cells = new HashMap<>();
    cells.put("cq1", "val1");
    cells.put("cq2", "val2");
    for (int rowIndex = 0; rowIndex < 15; rowIndex++){
        hBaseClientService.addResult("row" + rowIndex, cells, System.currentTimeMillis());
    }

    // every scan parameter comes from flow file attributes via EL
    runner.setProperty(ScanHBase.TABLE_NAME, "${hbase.table}");
    runner.setProperty(ScanHBase.START_ROW, "${hbase.row}1");
    runner.setProperty(ScanHBase.END_ROW, "${hbase.row}2");
    runner.setProperty(ScanHBase.COLUMNS, "${hbase.cols}");
    runner.setProperty(ScanHBase.TIME_RANGE_MIN, "${tr_min}");
    runner.setProperty(ScanHBase.TIME_RANGE_MAX, "${tr_max}");
    runner.setProperty(ScanHBase.LIMIT_ROWS, "${limit}");
    runner.setProperty(ScanHBase.BULK_SIZE, "${bulk.size}");

    final Map<String, String> attributes = new HashMap<>();
    attributes.put("bulk.size", "10");
    attributes.put("limit", "1000");
    attributes.put("tr_min", "10000000");
    attributes.put("tr_max", "10000001");
    attributes.put("hbase.table", "table1");
    attributes.put("hbase.row", "row");
    attributes.put("hbase.cols", "nifi:cq2");

    runner.enqueue("trigger flow file", attributes);
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 2);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    // 15 rows with bulk size 10 -> a full flow file of 10 and a remainder of 5
    MockFlowFile ff = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS).get(0);
    ff.assertAttributeEquals(ScanHBase.HBASE_ROWS_COUNT_ATTR, "10");
    ff = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS).get(1);
    ff.assertAttributeEquals(ScanHBase.HBASE_ROWS_COUNT_ATTR, "5");
}
@Test
public void testScanBatchSizeTimesOfBulkSize() {
    // 1000 rows with bulk.size=100 divides evenly: expect exactly 10 flow
    // files of 100 rows each on the success relationship.
    final Map<String, String> columnValues = new HashMap<>();
    columnValues.put("cq1", "val1");
    columnValues.put("cq2", "val2");
    for (int rowIndex = 0; rowIndex < 1000; rowIndex++) {
        hBaseClientService.addResult("row" + rowIndex, columnValues, System.currentTimeMillis());
    }

    runner.setProperty(ScanHBase.TABLE_NAME, "${hbase.table}");
    runner.setProperty(ScanHBase.START_ROW, "${hbase.row}1");
    runner.setProperty(ScanHBase.END_ROW, "${hbase.row}2");
    runner.setProperty(ScanHBase.COLUMNS, "${hbase.cols}");
    runner.setProperty(ScanHBase.TIME_RANGE_MIN, "${tr_min}");
    runner.setProperty(ScanHBase.TIME_RANGE_MAX, "${tr_max}");
    runner.setProperty(ScanHBase.LIMIT_ROWS, "${limit}");
    runner.setProperty(ScanHBase.BULK_SIZE, "${bulk.size}");

    final Map<String, String> triggerAttributes = new HashMap<>();
    triggerAttributes.put("hbase.table", "table1");
    triggerAttributes.put("hbase.row", "row");
    triggerAttributes.put("hbase.cols", "nifi:cq2");
    triggerAttributes.put("tr_min", "10000000");
    triggerAttributes.put("tr_max", "10000001");
    triggerAttributes.put("limit", "1000");
    triggerAttributes.put("bulk.size", "100");

    runner.enqueue("trigger flow file", triggerAttributes);
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 10);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    for (final MockFlowFile ff : runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS)) {
        ff.assertAttributeEquals(ScanHBase.HBASE_ROWS_COUNT_ATTR, "100");
        // Since the total row count is an exact multiple of the bulk size, the
        // original flow file (with id=0) must not appear among the output.
        Assert.assertNotEquals(0, ff.getId());
    }
}
@Test
public void testScanBatchSizeTimesCutBulkSize() {
    // 1102 rows with bulk.size=110: expect 10 full flow files of 110 rows
    // each, plus a final flow file holding the 2 leftover rows.
    final Map<String, String> cells = new HashMap<>();
    cells.put("cq1", "val1");
    cells.put("cq2", "val2");
    for (int i = 0; i < 1102; i++) {
        hBaseClientService.addResult("row" + i, cells, System.currentTimeMillis());
    }

    runner.setProperty(ScanHBase.TABLE_NAME, "${hbase.table}");
    runner.setProperty(ScanHBase.START_ROW, "${hbase.row}1");
    runner.setProperty(ScanHBase.END_ROW, "${hbase.row}2");
    runner.setProperty(ScanHBase.COLUMNS, "${hbase.cols}");
    runner.setProperty(ScanHBase.TIME_RANGE_MIN, "${tr_min}");
    runner.setProperty(ScanHBase.TIME_RANGE_MAX, "${tr_max}");
    runner.setProperty(ScanHBase.LIMIT_ROWS, "${limit}");
    runner.setProperty(ScanHBase.BULK_SIZE, "${bulk.size}");

    final Map<String, String> attributes = new HashMap<>();
    attributes.put("hbase.table", "table1");
    attributes.put("hbase.row", "row");
    attributes.put("hbase.cols", "nifi:cq2");
    attributes.put("tr_min", "10000000");
    attributes.put("tr_max", "10000001");
    attributes.put("limit", "1000");
    attributes.put("bulk.size", "110");

    runner.enqueue("trigger flow file", attributes);
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 11);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    final List<MockFlowFile> ffs = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS);
    for (int i = 0; i < ffs.size(); i++) {
        // The first 10 flow files carry a full bulk of 110 rows; the last
        // flow file holds only the 2 remaining rows.
        ffs.get(i).assertAttributeEquals(ScanHBase.HBASE_ROWS_COUNT_ATTR, i < 10 ? "110" : "2");
    }
}
@Test
public void testScanToContentWithQualifierAndValueJSON() {
    // A single row rendered with the qualifier-and-value JSON format:
    // column qualifiers become keys and cell values become values.
    final Map<String, String> columnValues = new HashMap<>();
    columnValues.put("cq1", "val1");
    columnValues.put("cq2", "val2");
    hBaseClientService.addResult("row1", columnValues, System.currentTimeMillis());

    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");
    runner.setProperty(ScanHBase.JSON_FORMAT, ScanHBase.JSON_FORMAT_QUALIFIER_AND_VALUE);

    runner.enqueue("trigger flow file");
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 1);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    final MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS).get(0);
    resultFlowFile.assertContentEquals("[{\"cq1\":\"val1\", \"cq2\":\"val2\"}]");

    Assert.assertEquals(1, hBaseClientService.getNumScans());
}
@Test
public void testScanWithExpressionLanguage() {
    // Only the nifi:cq2 column is requested via the column filter, so the
    // result content should contain cq2 only.
    final long timestamp = 123456789;
    final Map<String, String> columnValues = new HashMap<>();
    columnValues.put("cq2", "val2");
    hBaseClientService.addResult("row1", columnValues, timestamp);

    runner.setProperty(ScanHBase.TABLE_NAME, "${hbase.table}");
    runner.setProperty(ScanHBase.START_ROW, "${hbase.row}1");
    runner.setProperty(ScanHBase.END_ROW, "${hbase.row}2");
    runner.setProperty(ScanHBase.COLUMNS, "${hbase.cols}");
    runner.setProperty(ScanHBase.TIME_RANGE_MIN, "${tr_min}");
    runner.setProperty(ScanHBase.TIME_RANGE_MAX, "${tr_max}");
    runner.setProperty(ScanHBase.LIMIT_ROWS, "${limit}");
    runner.setProperty(ScanHBase.BULK_SIZE, "${bulk.size}");

    final Map<String, String> triggerAttributes = new HashMap<>();
    triggerAttributes.put("hbase.table", "table1");
    triggerAttributes.put("hbase.row", "row");
    triggerAttributes.put("hbase.cols", "nifi:cq2");
    triggerAttributes.put("tr_min", "10000000");
    triggerAttributes.put("tr_max", "10000001");
    triggerAttributes.put("limit", "1000");
    triggerAttributes.put("bulk.size", "10");

    runner.enqueue("trigger flow file", triggerAttributes);
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 0);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 1);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 1);

    final MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ScanHBase.REL_SUCCESS).get(0);
    resultFlowFile.assertContentEquals("[{\"row\":\"row1\", \"cells\": [{\"fam\":\"nifi\",\"qual\":\"cq2\",\"val\":\"val2\",\"ts\":" + timestamp + "}]}]");

    Assert.assertEquals(1, hBaseClientService.getNumScans());
}
@Test
public void testScanWhenScanThrowsException() {
    // When the client service throws during scan, the trigger flow file is
    // routed to failure and nothing reaches success/original.
    hBaseClientService.setThrowException(true);

    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row1");

    runner.enqueue("trigger flow file");
    runner.run();

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 1);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 0);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 0);

    Assert.assertEquals(0, hBaseClientService.getNumScans());
}
@Test
public void testScanWhenScanThrowsExceptionAfterLineN() {
    // The mock throws after the first row is handled; a partially-built
    // result must not be emitted and the trigger goes to failure.
    hBaseClientService.setLinesBeforeException(1);

    final long timestamp = 123456789;
    final Map<String, String> columnValues = new HashMap<>();
    columnValues.put("cq1", "val1");
    columnValues.put("cq2", "val2");
    hBaseClientService.addResult("row1", columnValues, timestamp);
    hBaseClientService.addResult("row2", columnValues, timestamp);

    runner.setProperty(ScanHBase.TABLE_NAME, "table1");
    runner.setProperty(ScanHBase.START_ROW, "row1");
    runner.setProperty(ScanHBase.END_ROW, "row2");

    runner.enqueue("trigger flow file");
    runner.run();

    // Reset the mock so subsequent tests are unaffected.
    hBaseClientService.setLinesBeforeException(-1);

    runner.assertTransferCount(ScanHBase.REL_FAILURE, 1);
    runner.assertTransferCount(ScanHBase.REL_SUCCESS, 0);
    runner.assertTransferCount(ScanHBase.REL_ORIGINAL, 0);

    Assert.assertEquals(0, hBaseClientService.getNumScans());
}
}

@ -150,6 +150,23 @@ public interface HBaseClientService extends ControllerService {
*/
void scan(String tableName, byte[] startRow, byte[] endRow, Collection<Column> columns, ResultHandler handler) throws IOException;
/**
 * Scans the given table for the given range of row keys or time range and passes the result to a handler.<br/>
 *
 * @param tableName the name of an HBase table to scan
 * @param startRow the row identifier to start scanning at
 * @param endRow the row identifier to end scanning at
 * @param filterExpression optional filter expression, if not specified no filtering is performed
 * @param timerangeMin the minimum timestamp of cells to return, passed to the HBase scanner timeRange
 * @param timerangeMax the maximum timestamp of cells to return, passed to the HBase scanner timeRange
 * @param limitRows the maximum number of rows to be returned by scanner
 * @param isReversed whether this scan is a reversed one.
 * @param columns optional columns to return, if not specified all columns are returned
 * @param handler a handler to process rows of the result
 * @throws IOException thrown when there are communication errors with HBase
 */
void scan(String tableName, String startRow, String endRow, String filterExpression, Long timerangeMin, Long timerangeMax, Integer limitRows,
        Boolean isReversed, Collection<Column> columns, ResultHandler handler) throws IOException;
/**
* Converts the given boolean to it's byte representation.
*

@ -430,6 +430,91 @@ public class HBase_1_1_2_ClientService extends AbstractControllerService impleme
}
}
/**
 * Scans the given table between the provided row keys, applying the optional
 * filter expression, time range, and row limit, and passes each row of the
 * result to the supplied handler.
 *
 * @param tableName the name of the HBase table to scan
 * @param startRow the row identifier to start scanning at
 * @param endRow the row identifier to stop scanning at
 * @param filterExpression optional HBase filter-language expression; no filtering when blank
 * @param timerangeMin minimum cell timestamp, applied together with timerangeMax
 * @param timerangeMax maximum cell timestamp, applied together with timerangeMin
 * @param limitRows maximum number of rows to hand to the handler; no limit when null or <= 0
 * @param isReversed whether to perform a reversed scan
 * @param columns optional columns to return; all columns when null/empty
 * @param handler receives each row key and its cells
 * @throws IOException if communication with HBase fails
 */
@Override
public void scan(final String tableName, final String startRow, final String endRow, final String filterExpression,
        final Long timerangeMin, final Long timerangeMax, final Integer limitRows, final Boolean isReversed,
        final Collection<Column> columns, final ResultHandler handler) throws IOException {

    try (final Table table = connection.getTable(TableName.valueOf(tableName));
         final ResultScanner scanner = getResults(table, startRow, endRow, filterExpression, timerangeMin,
                 timerangeMax, limitRows, isReversed, columns)) {

        // HBase 1.x has no server-side Scan#setLimit, so the row limit is
        // enforced client-side while iterating (0 means unlimited).
        final int rowLimit = limitRows != null ? limitRows : 0;
        int rowsHandled = 0;

        for (final Result result : scanner) {
            if (rowLimit > 0 && ++rowsHandled > rowLimit) {
                break;
            }

            final byte[] rowKey = result.getRow();
            final Cell[] cells = result.rawCells();
            if (cells == null) {
                continue;
            }

            // convert HBase cells to NiFi cells
            final ResultCell[] resultCells = new ResultCell[cells.length];
            for (int i = 0; i < cells.length; i++) {
                resultCells[i] = getResultCell(cells[i]);
            }

            // delegate to the handler
            handler.handle(rowKey, resultCells);
        }
    }
}
/**
 * Builds a {@link Scan} from the given parameters and opens a scanner on the table.
 * Protected and extracted into a separate method for testing.
 *
 * @param table the HBase table to scan
 * @param startRow start row key; ignored when blank
 * @param endRow stop row key; ignored when blank
 * @param filterExpression optional HBase filter-language expression (parsed via {@link ParseFilter})
 * @param timerangeMin minimum cell timestamp; only applied when both min and max are non-null
 * @param timerangeMax maximum cell timestamp; only applied when both min and max are non-null
 * @param limitRows reserved for HBase 2+ (Scan#setLimit); not applied here
 * @param isReversed whether the scan should be reversed; ignored when null
 * @param columns families/columns to return; all columns when null
 * @return an open ResultScanner — the caller is responsible for closing it
 * @throws IOException if the filter expression cannot be parsed or the scanner cannot be opened
 */
protected ResultScanner getResults(final Table table, final String startRow, final String endRow, final String filterExpression, final Long timerangeMin, final Long timerangeMax,
        final Integer limitRows, final Boolean isReversed, final Collection<Column> columns) throws IOException {
    final Scan scan = new Scan();
    if (!StringUtils.isBlank(startRow)) {
        scan.setStartRow(startRow.getBytes(StandardCharsets.UTF_8));
    }
    if (!StringUtils.isBlank(endRow)) {
        scan.setStopRow(endRow.getBytes(StandardCharsets.UTF_8));
    }

    if (columns != null) {
        for (final Column col : columns) {
            if (col.getQualifier() == null) {
                scan.addFamily(col.getFamily());
            } else {
                scan.addColumn(col.getFamily(), col.getQualifier());
            }
        }
    }

    if (!StringUtils.isBlank(filterExpression)) {
        // Parse the HBase filter language (e.g. "PrefixFilter('row')") into a Filter.
        final ParseFilter parseFilter = new ParseFilter();
        scan.setFilter(parseFilter.parseFilterString(filterExpression));
    }

    if (timerangeMin != null && timerangeMax != null) {
        scan.setTimeRange(timerangeMin, timerangeMax);
    }

    // ->>> reserved for HBase v 2 or later
    //if (limitRows != null && limitRows > 0){
    //    scan.setLimit(limitRows)
    //}

    if (isReversed != null) {
        scan.setReversed(isReversed);
    }

    return table.getScanner(scan);
}
// protected and extracted into separate method for testing
protected ResultScanner getResults(final Table table, final byte[] startRow, final byte[] endRow, final Collection<Column> columns) throws IOException {
final Scan scan = new Scan();

@ -156,6 +156,14 @@ public class MockHBaseClientService extends HBase_1_1_2_ClientService {
return scanner;
}
@Override
// Test double: ignores all scan parameters (row range, filter, time range,
// limit, reversal, columns) and returns a mocked scanner that simply iterates
// the results registered via addResult(...), so tests fully control the output.
protected ResultScanner getResults(final Table table, final String startRow, final String endRow, final String filterExpression, final Long timerangeMin, final Long timerangeMax,
final Integer limitRows, final Boolean isReversed, final Collection<Column> columns) throws IOException {
final ResultScanner scanner = Mockito.mock(ResultScanner.class);
Mockito.when(scanner.iterator()).thenReturn(results.iterator());
return scanner;
}
@Override
protected Connection createConnection(ConfigurationContext context) throws IOException {
Connection connection = Mockito.mock(Connection.class);