mirror of https://github.com/apache/nifi.git
NIFI-3366 MoveHDFS processor supports Expression Language for input and copy operations
Signed-off-by: Jeff Storck <jtswork@gmail.com>
This commit is contained in:
parent c138987bb4
commit 3731fbee88
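
The change is easiest to see from a small usage sketch (not part of this commit): with Expression Language enabled on the "Input Directory or File" property, the source path can now come from an attribute of the incoming flow file, and the "HDFS Operation" property selects move or copy. The sketch below is hypothetical; it assumes NiFi's TestRunner mock framework and the local test data used by MoveHDFSTest further down in this commit, and the class and field names (MoveHDFSExpressionLanguageSketch, EmbeddedMoveHDFS) are illustrative only.

// Hypothetical sketch exercising the Expression Language support added by NIFI-3366.
// Assumes the same local-filesystem test data as MoveHDFSTest (src/test/resources/testdata).
package org.apache.nifi.processors.hadoop;

import java.io.File;
import java.util.Collections;

import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.After;
import org.junit.Assert;
import org.junit.Test;

public class MoveHDFSExpressionLanguageSketch {

    private static final String SOURCE_FILE = "src/test/resources/testdata/randombytes-1";
    private static final String OUTPUT_DIRECTORY = "src/test/resources/testdataoutput";

    @Test
    public void inputPathCanComeFromAFlowFileAttribute() {
        TestRunner runner = TestRunners.newTestRunner(new EmbeddedMoveHDFS());
        // "${path}" is the property's default value; spelled out here for clarity
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, "${path}");
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        // use the copy operation so the source test file is left in place
        runner.setProperty(MoveHDFS.OPERATION, "copy");
        runner.setValidateExpressionUsage(false);
        // the incoming flow file only supplies the attribute that the expression reads
        runner.enqueue(new byte[0], Collections.singletonMap("path", SOURCE_FILE));
        runner.run();
        runner.assertAllFlowFilesTransferred(MoveHDFS.REL_SUCCESS);
        Assert.assertEquals(1, runner.getFlowFilesForRelationship(MoveHDFS.REL_SUCCESS).size());
    }

    @After
    public void cleanup() {
        // remove the copied file, its local-filesystem checksum, and the output directory
        new File(OUTPUT_DIRECTORY, "randombytes-1").delete();
        new File(OUTPUT_DIRECTORY, ".randombytes-1.crc").delete();
        new File(OUTPUT_DIRECTORY).delete();
    }

    // same Kerberos-injection trick as the TestableMoveHDFS class in MoveHDFSTest below
    private static class EmbeddedMoveHDFS extends MoveHDFS {
        @Override
        protected KerberosProperties getKerberosProperties(File kerberosConfigFile) {
            return new KerberosProperties(null);
        }
    }
}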
@@ -0,0 +1,527 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.hadoop;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StopWatch;

/**
 * This processor renames (moves) or copies files on HDFS.
 */
@Tags({ "hadoop", "HDFS", "put", "move", "filesystem", "restricted", "moveHDFS" })
@CapabilityDescription("Rename existing files on Hadoop Distributed File System (HDFS)")
@ReadsAttribute(attribute = "filename", description = "The name of the file written to HDFS comes from the value of this attribute.")
@WritesAttributes({
        @WritesAttribute(attribute = "filename", description = "The name of the file written to HDFS is stored in this attribute."),
        @WritesAttribute(attribute = "absolute.hdfs.path", description = "The absolute path to the file on HDFS is stored in this attribute.") })
@SeeAlso({ PutHDFS.class, GetHDFS.class })
public class MoveHDFS extends AbstractHadoopProcessor {

    // static global
    public static final int MAX_WORKING_QUEUE_SIZE = 25000;
    public static final String REPLACE_RESOLUTION = "replace";
    public static final String IGNORE_RESOLUTION = "ignore";
    public static final String FAIL_RESOLUTION = "fail";

    private static final Set<Relationship> relationships;

    public static final AllowableValue REPLACE_RESOLUTION_AV = new AllowableValue(REPLACE_RESOLUTION,
            REPLACE_RESOLUTION, "Replaces the existing file if any.");
    public static final AllowableValue IGNORE_RESOLUTION_AV = new AllowableValue(IGNORE_RESOLUTION, IGNORE_RESOLUTION,
            "Failed rename operation stops processing and routes to success.");
    public static final AllowableValue FAIL_RESOLUTION_AV = new AllowableValue(FAIL_RESOLUTION, FAIL_RESOLUTION,
            "Failing to rename a file routes to failure.");

    public static final String BUFFER_SIZE_KEY = "io.file.buffer.size";
    public static final int BUFFER_SIZE_DEFAULT = 4096;

    public static final String ABSOLUTE_HDFS_PATH_ATTRIBUTE = "absolute.hdfs.path";

    // relationships
    public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("Files that have been successfully renamed on HDFS are transferred to this relationship")
            .build();

    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("Files that could not be renamed on HDFS are transferred to this relationship").build();

    // properties
    public static final PropertyDescriptor CONFLICT_RESOLUTION = new PropertyDescriptor.Builder()
            .name("Conflict Resolution Strategy")
            .description(
                    "Indicates what should happen when a file with the same name already exists in the output directory")
            .required(true).defaultValue(FAIL_RESOLUTION_AV.getValue())
            .allowableValues(REPLACE_RESOLUTION_AV, IGNORE_RESOLUTION_AV, FAIL_RESOLUTION_AV).build();

    public static final PropertyDescriptor FILE_FILTER_REGEX = new PropertyDescriptor.Builder()
            .name("File Filter Regex")
            .description(
                    "A Java Regular Expression for filtering Filenames; if a filter is supplied then only files whose names match that Regular "
                            + "Expression will be fetched, otherwise all files will be fetched")
            .required(false).addValidator(StandardValidators.REGULAR_EXPRESSION_VALIDATOR).build();

    public static final PropertyDescriptor IGNORE_DOTTED_FILES = new PropertyDescriptor.Builder()
            .name("Ignore Dotted Files")
            .description("If true, files whose names begin with a dot (\".\") will be ignored").required(true)
            .allowableValues("true", "false").defaultValue("true").build();

    public static final PropertyDescriptor INPUT_DIRECTORY_OR_FILE = new PropertyDescriptor.Builder()
            .name("Input Directory or File")
            .description("The HDFS directory from which files should be read, or a single file to read")
            .defaultValue("${path}").addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true).build();

    public static final PropertyDescriptor OUTPUT_DIRECTORY = new PropertyDescriptor.Builder().name("Output Directory")
            .description("The HDFS directory where the files will be moved to").required(true)
            .addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR).expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor OPERATION = new PropertyDescriptor.Builder().name("HDFS Operation")
            .description("The operation that will be performed on the source file").required(true)
            .allowableValues("move", "copy").defaultValue("move").build();

    public static final PropertyDescriptor REMOTE_OWNER = new PropertyDescriptor.Builder().name("Remote Owner")
            .description(
                    "Changes the owner of the HDFS file to this value after it is written. This only works if NiFi is running as a user that has HDFS super user privilege to change owner")
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    public static final PropertyDescriptor REMOTE_GROUP = new PropertyDescriptor.Builder().name("Remote Group")
            .description(
                    "Changes the group of the HDFS file to this value after it is written. This only works if NiFi is running as a user that has HDFS super user privilege to change group")
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    static {
        final Set<Relationship> rels = new HashSet<>();
        rels.add(REL_SUCCESS);
        rels.add(REL_FAILURE);
        relationships = Collections.unmodifiableSet(rels);
    }

    // non-static global
    protected ProcessorConfiguration processorConfig;
    private final AtomicLong logEmptyListing = new AtomicLong(2L);

    private final Lock listingLock = new ReentrantLock();
    private final Lock queueLock = new ReentrantLock();

    private final BlockingQueue<Path> filePathQueue = new LinkedBlockingQueue<>();
    private final BlockingQueue<Path> processing = new LinkedBlockingQueue<>();

    // methods
    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        List<PropertyDescriptor> props = new ArrayList<>(properties);
        props.add(CONFLICT_RESOLUTION);
        props.add(INPUT_DIRECTORY_OR_FILE);
        props.add(OUTPUT_DIRECTORY);
        props.add(OPERATION);
        props.add(FILE_FILTER_REGEX);
        props.add(IGNORE_DOTTED_FILES);
        props.add(REMOTE_OWNER);
        props.add(REMOTE_GROUP);
        return props;
    }

    @OnScheduled
    public void onScheduled(ProcessContext context) throws Exception {
        super.abstractOnScheduled(context);
        // copy configuration values to pass them around cleanly
        processorConfig = new ProcessorConfiguration(context);
        // forget the state of the queue in case HDFS contents changed while
        // this processor was turned off
        queueLock.lock();
        try {
            filePathQueue.clear();
            processing.clear();
        } finally {
            queueLock.unlock();
        }
    }

    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        // MoveHDFS
        FlowFile parentFlowFile = session.get();
        if (parentFlowFile == null) {
            return;
        }

        final FileSystem hdfs = getFileSystem();
        final String filenameValue = context.getProperty(INPUT_DIRECTORY_OR_FILE).evaluateAttributeExpressions(parentFlowFile).getValue();

        Path inputPath = null;
        try {
            inputPath = new Path(filenameValue);
            if (!hdfs.exists(inputPath)) {
                throw new IOException("Input Directory or File does not exist in HDFS");
            }
        } catch (Exception e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure", new Object[] { filenameValue, parentFlowFile, e });
            parentFlowFile = session.putAttribute(parentFlowFile, "hdfs.failure.reason", e.getMessage());
            parentFlowFile = session.penalize(parentFlowFile);
            session.transfer(parentFlowFile, REL_FAILURE);
            return;
        }
        session.remove(parentFlowFile);

        List<Path> files = new ArrayList<Path>();

        try {
            final StopWatch stopWatch = new StopWatch(true);
            Set<Path> listedFiles = performListing(context, inputPath);
            stopWatch.stop();
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);

            if (listedFiles != null) {
                // place files into the work queue
                int newItems = 0;
                queueLock.lock();
                try {
                    for (Path file : listedFiles) {
                        if (!filePathQueue.contains(file) && !processing.contains(file)) {
                            if (!filePathQueue.offer(file)) {
                                break;
                            }
                            newItems++;
                        }
                    }
                } catch (Exception e) {
                    getLogger().warn("Could not add to processing queue due to {}", new Object[] { e });
                } finally {
                    queueLock.unlock();
                }
                if (listedFiles.size() > 0) {
                    logEmptyListing.set(3L);
                }
                if (logEmptyListing.getAndDecrement() > 0) {
                    getLogger().info(
                            "Obtained file listing in {} milliseconds; listing had {} items, {} of which were new",
                            new Object[] { millis, listedFiles.size(), newItems });
                }
            }
        } catch (IOException e) {
            context.yield();
            getLogger().warn("Error while retrieving list of files due to {}", new Object[] { e });
            return;
        }

        // prepare to process a batch of files in the queue
        queueLock.lock();
        try {
            filePathQueue.drainTo(files);
            if (files.isEmpty()) {
                // nothing to do!
                context.yield();
                return;
            }
        } finally {
            queueLock.unlock();
        }

        processBatchOfFiles(files, context, session);

        queueLock.lock();
        try {
            processing.removeAll(files);
        } finally {
            queueLock.unlock();
        }
    }

    protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
            final ProcessSession session) {
        // process the batch of files
        final Configuration conf = getConfiguration();
        final FileSystem hdfs = getFileSystem();
        final UserGroupInformation ugi = getUserGroupInformation();

        if (conf == null || ugi == null) {
            getLogger().error("Configuration or UserGroupInformation not configured properly");
            session.transfer(session.get(), REL_FAILURE);
            context.yield();
            return;
        }

        for (final Path file : files) {

            ugi.doAs(new PrivilegedAction<Object>() {
                @Override
                public Object run() {
                    FlowFile flowFile = session.create();
                    try {
                        final String originalFilename = file.getName();
                        final Path configuredRootOutputDirPath = processorConfig.getOutputDirectory();
                        final Path newFile = new Path(configuredRootOutputDirPath, originalFilename);
                        final boolean destinationExists = hdfs.exists(newFile);
                        // If destination file already exists, resolve that
                        // based on processor configuration
                        if (destinationExists) {
                            switch (processorConfig.getConflictResolution()) {
                            case REPLACE_RESOLUTION:
                                // remove the existing destination so the move/copy below can replace it
                                if (hdfs.delete(newFile, false)) {
                                    getLogger().info("deleted {} in order to replace with the contents of {}",
                                            new Object[] { newFile, flowFile });
                                }
                                break;
                            case IGNORE_RESOLUTION:
                                session.transfer(flowFile, REL_SUCCESS);
                                getLogger().info(
                                        "transferring {} to success because file with same name already exists",
                                        new Object[] { flowFile });
                                return null;
                            case FAIL_RESOLUTION:
                                session.transfer(session.penalize(flowFile), REL_FAILURE);
                                getLogger().warn(
                                        "penalizing {} and routing to failure because file with same name already exists",
                                        new Object[] { flowFile });
                                return null;
                            default:
                                break;
                            }
                        }

                        // Create destination directory if it does not exist
                        try {
                            if (!hdfs.getFileStatus(configuredRootOutputDirPath).isDirectory()) {
                                throw new IOException(configuredRootOutputDirPath.toString()
                                        + " already exists and is not a directory");
                            }
                        } catch (FileNotFoundException fe) {
                            if (!hdfs.mkdirs(configuredRootOutputDirPath)) {
                                throw new IOException(configuredRootOutputDirPath.toString() + " could not be created");
                            }
                            changeOwner(context, hdfs, configuredRootOutputDirPath);
                        }

                        boolean moved = false;
                        for (int i = 0; i < 10; i++) { // try to rename multiple times
                            if (processorConfig.getOperation().equals("move")) {
                                if (hdfs.rename(file, newFile)) {
                                    moved = true;
                                    break; // rename was successful
                                }
                            } else {
                                if (FileUtil.copy(hdfs, file, hdfs, newFile, false, conf)) {
                                    moved = true;
                                    break; // copy was successful
                                }
                            }
                            // try waiting to let whatever might cause rename failure to resolve
                            Thread.sleep(200L);
                        }
                        if (!moved) {
                            throw new ProcessException("Could not move file " + file + " to its final filename");
                        }

                        // apply the configured owner/group to the file at its new location
                        changeOwner(context, hdfs, newFile);
                        final String outputPath = newFile.toString();
                        final String newFilename = newFile.getName();
                        final String hdfsPath = newFile.getParent().toString();
                        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename);
                        flowFile = session.putAttribute(flowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                        final String transitUri = (outputPath.startsWith("/")) ? "hdfs:/" + outputPath
                                : "hdfs://" + outputPath;
                        session.getProvenanceReporter().send(flowFile, transitUri);
                        session.transfer(flowFile, REL_SUCCESS);

                    } catch (final Throwable t) {
                        getLogger().error("Failed to rename on HDFS due to {}", new Object[] { t });
                        session.transfer(session.penalize(flowFile), REL_FAILURE);
                        context.yield();
                    }
                    return null;
                }
            });
        }
    }

    protected Set<Path> performListing(final ProcessContext context, Path path) throws IOException {
        Set<Path> listing = null;

        if (listingLock.tryLock()) {
            try {
                final FileSystem hdfs = getFileSystem();
                // get listing
                listing = selectFiles(hdfs, path, null);
            } finally {
                listingLock.unlock();
            }
        }

        return listing;
    }

    protected void changeOwner(final ProcessContext context, final FileSystem hdfs, final Path name) {
        try {
            // Change owner and group of file if configured to do so
            String owner = context.getProperty(REMOTE_OWNER).getValue();
            String group = context.getProperty(REMOTE_GROUP).getValue();
            if (owner != null || group != null) {
                hdfs.setOwner(name, owner, group);
            }
        } catch (Exception e) {
            getLogger().warn("Could not change owner or group of {} on HDFS due to {}", new Object[] { name, e });
        }
    }

    protected Set<Path> selectFiles(final FileSystem hdfs, final Path inputPath, Set<Path> filesVisited)
            throws IOException {
        if (null == filesVisited) {
            filesVisited = new HashSet<>();
        }

        if (!hdfs.exists(inputPath)) {
            throw new IOException("Selection directory " + inputPath.toString() + " doesn't appear to exist!");
        }

        final Set<Path> files = new HashSet<>();

        FileStatus inputStatus = hdfs.getFileStatus(inputPath);

        if (inputStatus.isDirectory()) {
            for (final FileStatus file : hdfs.listStatus(inputPath)) {
                final Path canonicalFile = file.getPath();

                // skip files we've already seen (may be looping directory links)
                if (!filesVisited.add(canonicalFile)) {
                    continue;
                }

                if (!file.isDirectory() && processorConfig.getPathFilter(inputPath).accept(canonicalFile)) {
                    files.add(canonicalFile);

                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug(this + " selected file at path: " + canonicalFile.toString());
                    }
                }
            }
        } else if (inputStatus.isFile()) {
            files.add(inputPath);
        }
        return files;
    }

    protected static class ProcessorConfiguration {

        final private String conflictResolution;
        final private String operation;
        final private Path inputRootDirPath;
        final private Path outputRootDirPath;
        final private Pattern fileFilterPattern;
        final private boolean ignoreDottedFiles;

        ProcessorConfiguration(final ProcessContext context) {
            conflictResolution = context.getProperty(CONFLICT_RESOLUTION).getValue();
            operation = context.getProperty(OPERATION).getValue();
            final String inputDirValue = context.getProperty(INPUT_DIRECTORY_OR_FILE).getValue();
            inputRootDirPath = new Path(inputDirValue);
            final String outputDirValue = context.getProperty(OUTPUT_DIRECTORY).getValue();
            outputRootDirPath = new Path(outputDirValue);
            final String fileFilterRegex = context.getProperty(FILE_FILTER_REGEX).getValue();
            fileFilterPattern = (fileFilterRegex == null) ? null : Pattern.compile(fileFilterRegex);
            ignoreDottedFiles = context.getProperty(IGNORE_DOTTED_FILES).asBoolean();
        }

        public String getOperation() {
            return operation;
        }

        public String getConflictResolution() {
            return conflictResolution;
        }

        public Path getInput() {
            return inputRootDirPath;
        }

        public Path getOutputDirectory() {
            return outputRootDirPath;
        }

        protected PathFilter getPathFilter(final Path dir) {
            return new PathFilter() {

                @Override
                public boolean accept(Path path) {
                    if (ignoreDottedFiles && path.getName().startsWith(".")) {
                        return false;
                    }
                    final String pathToCompare;
                    String relativePath = getPathDifference(dir, path);
                    if (relativePath.length() == 0) {
                        pathToCompare = path.getName();
                    } else {
                        pathToCompare = relativePath + Path.SEPARATOR + path.getName();
                    }

                    if (fileFilterPattern != null && !fileFilterPattern.matcher(pathToCompare).matches()) {
                        return false;
                    }
                    return true;
                }
            };
        }
    }
}

@@ -20,3 +20,4 @@ org.apache.nifi.processors.hadoop.inotify.GetHDFSEvents
org.apache.nifi.processors.hadoop.ListHDFS
org.apache.nifi.processors.hadoop.PutHDFS
org.apache.nifi.processors.hadoop.DeleteHDFS
org.apache.nifi.processors.hadoop.MoveHDFS

@@ -0,0 +1,195 @@
package org.apache.nifi.processors.hadoop;

import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.MockProcessContext;
import org.apache.nifi.util.NiFiProperties;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

public class MoveHDFSTest {

    private static final String OUTPUT_DIRECTORY = "src/test/resources/testdataoutput";
    private static final String INPUT_DIRECTORY = "src/test/resources/testdata";
    private static final String DOT_FILE_PATH = "src/test/resources/testdata/.testfordotfiles";
    private NiFiProperties mockNiFiProperties;
    private KerberosProperties kerberosProperties;

    @Before
    public void setup() {
        mockNiFiProperties = mock(NiFiProperties.class);
        when(mockNiFiProperties.getKerberosConfigurationFile()).thenReturn(null);
        kerberosProperties = new KerberosProperties(null);
    }

    @After
    public void teardown() {
        File outputDirectory = new File(OUTPUT_DIRECTORY);
        if (outputDirectory.exists()) {
            if (outputDirectory.isDirectory()) {
                moveFilesFromOutputDirectoryToInput();
            }
            outputDirectory.delete();
        }
        removeDotFile();
    }

    private void removeDotFile() {
        File dotFile = new File(DOT_FILE_PATH);
        if (dotFile.exists()) {
            dotFile.delete();
        }
    }

    private void moveFilesFromOutputDirectoryToInput() {
        File folder = new File(OUTPUT_DIRECTORY);
        for (File file : folder.listFiles()) {
            if (file.isFile()) {
                String path = file.getAbsolutePath();
                if (!path.endsWith(".crc")) {
                    String newPath = path.replaceAll("testdataoutput", "testdata");
                    File newFile = new File(newPath);
                    if (!newFile.exists()) {
                        file.renameTo(newFile);
                    }
                } else {
                    file.delete();
                }
            }
        }
    }

    @Test
    public void testOutputDirectoryValidator() {
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        Collection<ValidationResult> results;
        ProcessContext pc;

        results = new HashSet<>();
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, "/source");
        runner.enqueue(new byte[0]);
        pc = runner.getProcessContext();
        if (pc instanceof MockProcessContext) {
            results = ((MockProcessContext) pc).validate();
        }
        Assert.assertEquals(1, results.size());
        for (ValidationResult vr : results) {
            assertTrue(vr.toString().contains("Output Directory is required"));
        }
    }

    @Test
    public void testBothInputAndOutputDirectoriesAreValid() {
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        Collection<ValidationResult> results;
        ProcessContext pc;

        results = new HashSet<>();
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, INPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        runner.enqueue(new byte[0]);
        pc = runner.getProcessContext();
        if (pc instanceof MockProcessContext) {
            results = ((MockProcessContext) pc).validate();
        }
        Assert.assertEquals(0, results.size());
    }

    @Test
    public void testOnScheduledShouldRunCleanly() {
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, INPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        runner.enqueue(new byte[0]);
        runner.setValidateExpressionUsage(false);
        runner.run();
        List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(MoveHDFS.REL_SUCCESS);
        runner.assertAllFlowFilesTransferred(MoveHDFS.REL_SUCCESS);
        Assert.assertEquals(7, flowFiles.size());
    }

    @Test
    public void testDotFileFilter() throws IOException {
        createDotFile();
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, INPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.IGNORE_DOTTED_FILES, "false");
        runner.enqueue(new byte[0]);
        runner.setValidateExpressionUsage(false);
        runner.run();
        List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(MoveHDFS.REL_SUCCESS);
        runner.assertAllFlowFilesTransferred(MoveHDFS.REL_SUCCESS);
        Assert.assertEquals(8, flowFiles.size());
    }

    @Test
    public void testFileFilterRegex() {
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, INPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        runner.setProperty(MoveHDFS.FILE_FILTER_REGEX, ".*\\.gz");
        runner.enqueue(new byte[0]);
        runner.setValidateExpressionUsage(false);
        runner.run();
        List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(MoveHDFS.REL_SUCCESS);
        runner.assertAllFlowFilesTransferred(MoveHDFS.REL_SUCCESS);
        Assert.assertEquals(1, flowFiles.size());
    }

    @Test
    public void testSingleFileAsInput() {
        MoveHDFS proc = new TestableMoveHDFS(kerberosProperties);
        TestRunner runner = TestRunners.newTestRunner(proc);
        runner.setProperty(MoveHDFS.INPUT_DIRECTORY_OR_FILE, INPUT_DIRECTORY + "/randombytes-1");
        runner.setProperty(MoveHDFS.OUTPUT_DIRECTORY, OUTPUT_DIRECTORY);
        runner.enqueue(new byte[0]);
        runner.setValidateExpressionUsage(false);
        runner.run();
        List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(MoveHDFS.REL_SUCCESS);
        runner.assertAllFlowFilesTransferred(MoveHDFS.REL_SUCCESS);
        Assert.assertEquals(1, flowFiles.size());
    }

    private void createDotFile() throws IOException {
        File dotFile = new File(DOT_FILE_PATH);
        dotFile.createNewFile();
    }

    private static class TestableMoveHDFS extends MoveHDFS {

        private KerberosProperties testKerberosProperties;

        public TestableMoveHDFS(KerberosProperties testKerberosProperties) {
            this.testKerberosProperties = testKerberosProperties;
        }

        @Override
        protected KerberosProperties getKerberosProperties(File kerberosConfigFile) {
            return testKerberosProperties;
        }
    }
}