NIFI-3979 Documenting how ListHDFS maintains state and performs listings

Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com>

This closes #1860.
This commit is contained in:
Bryan Bende 2017-05-25 14:43:21 -04:00 committed by Pierre Villard
parent 118f4e8cca
commit e85081ed20
1 changed file with 11 additions and 8 deletions

View File

@@ -63,10 +63,12 @@ import java.util.regex.Pattern;
@TriggerWhenEmpty
@InputRequirement(Requirement.INPUT_FORBIDDEN)
@Tags({"hadoop", "HDFS", "get", "list", "ingest", "source", "filesystem"})
@CapabilityDescription("Retrieves a listing of files from HDFS. For each file that is listed in HDFS, creates a FlowFile that represents "
+ "the HDFS file so that it can be fetched in conjunction with FetchHDFS. This Processor is designed to run on Primary Node only "
+ "in a cluster. If the primary node changes, the new Primary Node will pick up where the previous node left off without duplicating "
+ "all of the data. Unlike GetHDFS, this Processor does not delete any data from HDFS.")
@CapabilityDescription("Retrieves a listing of files from HDFS. Each time a listing is performed, the files with the latest timestamp will be excluded "
+ "and picked up during the next execution of the processor. This is done to ensure that we do not miss any files, or produce duplicates, in the "
+ "cases where files with the same timestamp are written immediately before and after a single execution of the processor. For each file that is "
+ "listed in HDFS, this processor creates a FlowFile that represents the HDFS file to be fetched in conjunction with FetchHDFS. This Processor is "
+ "designed to run on Primary Node only in a cluster. If the primary node changes, the new Primary Node will pick up where the previous node left "
+ "off without duplicating all of the data. Unlike GetHDFS, this Processor does not delete any data from HDFS.")
@WritesAttributes({
@WritesAttribute(attribute="filename", description="The name of the file that was read from HDFS."),
@WritesAttribute(attribute="path", description="The path is set to the absolute path of the file's directory on HDFS. For example, if the Directory property is set to /tmp, "
@@ -80,10 +82,11 @@ import java.util.regex.Pattern;
@WritesAttribute(attribute="hdfs.permissions", description="The permissions for the file in HDFS. This is formatted as 3 characters for the owner, "
+ "3 for the group, and 3 for other users. For example rw-rw-r--")
})
@Stateful(scopes = Scope.CLUSTER, description = "After performing a listing of HDFS files, the timestamp of the newest file is stored, "
+ "along with the filenames of all files that share that same timestamp. This allows the Processor to list only files that have been added or modified after "
+ "this date the next time that the Processor is run. State is stored across the cluster so that this Processor can be run on Primary Node only and if a new Primary "
+ "Node is selected, the new node can pick up where the previous node left off, without duplicating the data.")
@Stateful(scopes = Scope.CLUSTER, description = "After performing a listing of HDFS files, the latest timestamp of all the files listed and the latest "
+ "timestamp of all the files transferred are both stored. This allows the Processor to list only files that have been added or modified after "
+ "this date the next time that the Processor is run, without having to store all of the actual filenames/paths which could lead to performance "
+ "problems. State is stored across the cluster so that this Processor can be run on Primary Node only and if a new Primary "
+ "Node is selected, the new node can pick up where the previous node left off, without duplicating the data.")
@SeeAlso({GetHDFS.class, FetchHDFS.class, PutHDFS.class})
public class ListHDFS extends AbstractHadoopProcessor {