Merge branch 'NIFI-USER-GUIDE' into develop

This commit is contained in:
Mark Payne 2014-12-19 14:14:44 -05:00
commit f048ed710f
50 changed files with 877 additions and 4 deletions

View File

@ -41,6 +41,7 @@
<div class="header">Documents</div> <div class="header">Documents</div>
<div class="component-links"> <div class="component-links">
<ul> <ul>
<li class="component-item"><a class="component-link user-guide" href="user-guide/nifi-user-guide.html" target="component-usage">User Guide</a></li>
</ul> </ul>
</div> </div>
</div> </div>

View File

@ -256,6 +256,11 @@ $(document).ready(function () {
} }
}); });
// Listen for clicks on the REST API and User Guide links and select the
// clicked entry by its link text. The selector is a comma-separated group so
// that either kind of anchor is matched; the descendant form
// ('a.rest-api a.user-guide') would only match a user-guide anchor nested
// inside a rest-api anchor.
$('a.rest-api, a.user-guide').on('click', function () {
    selectComponent($(this).text());
});
// get the initial selection // get the initial selection
var initialComponentLink = $('a.component-link:first'); var initialComponentLink = $('a.component-link:first');
var initialSelection = $('#initial-selection').text(); var initialSelection = $('#initial-selection').text();

85
nifi-docs/pom.xml Normal file
View File

@ -0,0 +1,85 @@
<?xml version="1.0"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<!-- Maven module "nifi-docs": renders AsciiDoc sources to HTML and then prepends
the Apache License header to the generated user guide. -->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-parent</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<artifactId>nifi-docs</artifactId>
<name>nifi-docs</name>
<build>
<plugins>
<!-- Runs the Asciidoctor "process-asciidoc" goal during the generate-resources
phase, using the "html" backend with the skipFrontMatter attribute enabled. -->
<plugin>
<groupId>org.asciidoctor</groupId>
<artifactId>asciidoctor-maven-plugin</artifactId>
<version>1.5.0</version>
<executions>
<execution>
<id>output-html</id>
<phase>generate-resources</phase>
<goals>
<goal>process-asciidoc</goal>
</goals>
</execution>
</executions>
<configuration>
<backend>html</backend>
<attributes>
<skipFrontMatter>true</skipFrontMatter>
</attributes>
</configuration>
</plugin>
<!-- This plugin is used to insert the Apache License into the output HTML because
AsciiDoc doesn't appear to provide a mechanism for doing this.
It runs the "replace" goal during the prepare-package phase against
target/generated-docs/nifi-user-guide.html. The token is treated as a regular
expression with the DOTALL and MULTILINE flags, and the replacement value is the
license text (as an escaped HTML comment) followed by $1, the content captured
by the token's group. NOTE(review): with DOTALL the group presumably captures
the whole file in a single match; confirm against the plugin's behavior. -->
<plugin>
<groupId>com.google.code.maven-replacer-plugin</groupId>
<artifactId>replacer</artifactId>
<version>1.5.3</version>
<executions>
<execution>
<phase>prepare-package</phase>
<goals>
<goal>replace</goal>
</goals>
</execution>
</executions>
<configuration>
<file>target/generated-docs/nifi-user-guide.html</file>
<regex>true</regex>
<regexFlags>
<regexFlag>DOTALL</regexFlag>
<regexFlag>MULTILINE</regexFlag>
</regexFlags>
<token>^(.*)$</token>
<value>
&lt;!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--&gt;
$1
</value>
</configuration>
</plugin>
</plugins>
</build>
<!-- No dependencies are declared for this module. -->
<dependencies>
</dependencies>
</project>

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 670 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 764 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 493 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 667 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 550 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 838 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 137 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 674 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 538 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 402 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 970 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.9 KiB

View File

@ -0,0 +1,782 @@
//
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
NiFi User Guide (Draft - Preview Version)
=========================================
Apache_NiFi_Team
:toc:
:icons:
[template="glossary", id="terminology"]
Terminology
-----------
*DataFlow Manager*: A DataFlow Manager (DFM) is a NiFi user who has permissions to add, remove, and modify components of a NiFi dataflow.
*FlowFile*: The FlowFile represents a single piece of data in NiFi. A FlowFile is made up of two components:
FlowFile Attributes and FlowFile Content.
Content is the data that is represented by the FlowFile. Attributes are key-value pairs that provide information or
context about the data.
All FlowFiles have the following Standard Attributes:
- *uuid*: A unique identifier for the FlowFile
- *filename*: A human-readable filename that may be used when storing the data to disk or in an external service
- *path*: A hierarchically structured value that can be used when storing data to disk or an external service so that the data is not stored in a single directory
*Processor*: The Processor is the NiFi component that is used to listen for incoming data; pull data from external sources;
publish data to external sources; and route, transform, or extract information from FlowFiles.
*Relationship*: Each Processor has zero or more Relationships defined for it. These Relationships are named to indicate the result of processing a FlowFile.
After a Processor has finished processing a FlowFile, it will route (or “transfer”) the FlowFile to one of the Relationships.
A DataFlow Manager is then able to connect each of these Relationships to other components in order to specify where the FlowFile should
go next under each potential processing result.
*Connection*: A DataFlow Manager creates an automated dataflow by dragging components from the Components part of the NiFi toolbar to the canvas
and then connecting the components together via Connections. Each connection consists of one or more Relationships.
For each Connection that is drawn, a DataFlow Manager can determine which Relationships should be used for the Connection.
This allows data to be routed in different ways based on its processing outcome. Each connection houses a FlowFile Queue.
When a FlowFile is transferred to a particular Relationship, it is added to the queue belonging to the associated Connection.
*Funnel*: A funnel is a NiFi component that is used to combine the data from several Connections into a single Connection.
*Process Group*: When a dataflow becomes complex, it often is beneficial to reason about the dataflow at a higher, more abstract level.
NiFi allows multiple components, such as Processors, to be grouped together into a Process Group.
The NiFi User Interface then makes it easy for a DataFlow Manager to connect together multiple Process Groups into a logical dataflow,
as well as allowing the DataFlow Manager to enter a Process Group in order to see and manipulate the components within the Process Group.
*Port*: Dataflows that are constructed using one or more Process Groups need a way to connect a Process Group to other dataflow components.
This is achieved by using Ports. A DataFlow Manager can add any number of Input Ports and Output Ports to a Process Group and name these ports appropriately.
*Remote Process Group*: Just as data is transferred into and out of a Process Group, it is sometimes necessary to transfer data from one instance of NiFi to another.
While NiFi provides many different mechanisms for transferring data from one system to another, Remote Process Groups are often the easiest way to accomplish
this if transferring data to another instance of NiFi.
*Bulletin*: The NiFi User Interface provides a significant amount of monitoring and feedback about the current status of the application.
In addition to rolling statistics and the current status that are provided for each component, components are able to report Bulletins.
Whenever a component reports a Bulletin, an icon is displayed on that component (or on the Status bar near the top of the page, for System-Level Bulletins).
Using the mouse to hover over that icon will provide a tool-tip that shows the time and severity (Debug, Info, Warning, Error) of the bulletin,
as well as the message of the Bulletin.
Bulletins from all components can also be viewed and filtered in the Bulletins Page, available in the Management Toolbar.
*Template*: Oftentimes, a dataflow is composed of many sub-flows that could be reused. NiFi allows DataFlow Managers to select a part of the dataflow
(or the entire dataflow) and create a Template. This Template is given a name and can then be dragged onto the canvas just like the other components.
As a result, several components may be combined together to make a larger building block from which to create a dataflow.
These templates can also be exported as XML and imported into another NiFi instance, allowing these building blocks to be shared.
NiFi User Interface
-------------------
The NiFi User Interface (UI) provides mechanisms for creating automated dataflows, as well as visualizing,
editing, monitoring, and administering those dataflows. The UI can be broken down into several different segments,
each responsible for different functionality of the application. We will begin by looking at screenshots of the
application and labeling the different segments of the UI. We will provide a brief explanation of the purpose of each segment.
Then, in the following sections of this document, we will discuss each of those segments in greater detail.
When the application is started, by default, the user is able to navigate to the User Interface by going to
`http://<hostname>:8080/nifi` in a web browser. There are no permissions configured, by default, so anyone is
able to view and modify the dataflow. For information on securing the system, see the Systems Administrator guide.
When a DataFlow Manager navigates to the UI for the first time, a blank canvas is provided on which a dataflow can be built:
image::new-flow.png["Empty Flow"]
Along the top of the screen is a toolbar that contains several of these segments.
To the left is the Components Toolbar. This toolbar consists of the different components that can be dragged onto the canvas.
Next to the Components Toolbar is the Actions Toolbar. This toolbar consists of buttons to manipulate the existing
components on the graph. Following the Actions Toolbar is the Search Toolbar. This toolbar consists of a single
Search field that allows users to easily find components on the graph. Users are able to search by component name,
type, identifier, and configuration properties.
Finally, the Management Toolbar sits to the right-hand side of the screen. This toolbar consists of buttons that are
of use to DataFlow Managers to manage the flow as well as administrators who may use this section to manage user access
and configure system properties, such as how many system resources should be provided to the application.
image::nifi-toolbar-components.png["NiFi Components Toolbar"]
Next, we have segments that provide capabilities to easily navigate around the graph. On the left-hand side is a toolbar that
provides the ability to pan around the graph and zoom in and out. On the right-hand side is a “Birds-Eye View” of the dataflow.
This provides a high-level view of the dataflow and allows the user to quickly and easily pan across large portions of the dataflow.
Along the top of the screen is a trail of breadcrumbs. As users navigate into and out of Process Groups, the breadcrumbs show
the depth in the flow and each Process Group that was entered to reach this depth. Each of the Process Groups listed in the breadcrumbs
is a link that will take you back up to that level in the flow.
image::nifi-navigation.png["NiFi Navigation"]
[[status_bar]]
Below the breadcrumbs lives the Status bar. The Status bar provides information about how many Processors exist in the graph in
each state (Stopped, Running, Invalid, Disabled), how many Remote Process Groups exist on the graph in each state
(Transmitting, Not Transmitting), the number of threads that are currently active in the flow, the amount of data that currently
exists in the flow, and the timestamp at which all of this information was last refreshed. If there are any System-Level bulletins,
these are shown in the Status bar as well. Additionally, if the instance of NiFi is clustered, the Status bar shows how many nodes
are in the cluster and how many are currently connected.
image::status-bar.png["NiFi Status Bar"]
Building a DataFlow
-------------------
A DataFlow Manager (DFM) is able to build an automated dataflow using the NiFi User Interface (UI). This is accomplished
by dragging components from the toolbar to the canvas, configuring the components to meet specific needs, and connecting
the components together.
=== Adding Components to the Canvas
In the User Interface section above, we outlined the different segments of the UI and pointed out a Components Toolbar.
Here, we will look at each of the Components in that toolbar:
image::components.png["Components"]
image:iconProcessor.png["Processor", width=32]
*Processor*: The Processor is the most commonly used component, as it is responsible for data ingress, egress, routing, and
manipulating. There are many different types of Processors. In fact, this is a very common Extension Point in NiFi,
meaning that many vendors may implement their own Processors to perform whatever functions are necessary for their use case.
When a Processor is dragged onto the graph, the user is presented with a dialog to choose which type of Processor to use:
image::add-processor.png["Add Processor Dialog"]
In the top-right corner, the user is able to filter the list based on the Processor Type or the Tags associated with a Processor.
Processor developers have the ability to add Tags to their Processors. These tags are used in this dialog for filtering and are
displayed on the left-hand side in a Tag Cloud. The more Processors that exist with a particular Tag, the larger the Tag appears
in the Tag Cloud. Clicking a Tag in the Cloud will filter the available Processors to only those that contain that Tag. If multiple
Tags are selected, only those Processors that contain all of those Tags are shown. For example, if we want to show only those
Processors that allow us to ingest data via HTTP, we can select both the `http` Tag and the `ingest` Tag:
image::add-processor-with-tag-cloud.png["Add Processor with Tag Cloud"]
Clicking the `Add` button or double-clicking on a Processor Type will add the selected Processor to the canvas at the
location that it was dropped.
image:iconInputPort.png["Input Port", width=32]
*Input Port*: Input Ports provide a mechanism for transferring data into a Process Group. When an Input Port is dragged
onto the canvas, the DFM is prompted to name the Port. All Ports within a Process Group must have unique names.
All components exist only within a Process Group. When a user navigates to the NiFi page, the user is placed in the
Root Process Group. If the Input Port is dragged onto the Root Process Group, the Input Port provides a mechanism
to receive data from remote instances of NiFi. In this case, the Input Port can be configured to restrict access to
appropriate users.
image:iconOutputPort.png["Output Port", width=32]
*Output Port*: Output Ports provide a mechanism for transferring data from a Process Group back to a destination outside
of the Process Group. When an Output Port is dragged onto the canvas, the DFM is prompted to name the Port. All Ports
within a Process Group must have unique names.
If the Output Port is dragged onto the Root Process Group, the Output Port provides a mechanism for sending data to
remote instances of NiFi. In this case, the Port acts as a queue. As remote instances of NiFi pull data from the port,
that data is removed from the queues of the incoming Connections.
image:iconProcessGroup.png["Process Group", width=32]
*Process Group*: Process Groups can be used to logically group a set of components so that the dataflow is easier to understand
and maintain. When a Process Group is dragged onto the canvas, the DFM is prompted to name the Process Group. All Process
Groups within the same parent group must have unique names.
image:iconRemoteProcessGroup.png["Remote Process Group", width=32]
*Remote Process Group*: Remote Process Groups appear and behave similarly to Process Groups. However, the Remote Process Group (RPG)
references a remote instance of NiFi. When an RPG is dragged onto the canvas, rather than being prompted for a name, the DFM
is prompted for the URL of the remote NiFi instance. If the remote NiFi is a clustered instance, the URL that should be used
is the URL of the remote instance's NiFi Cluster Manager (NCM). When data is transferred to a clustered instance of NiFi
via an RPG, the RPG will first connect to the remote instance's NCM to determine which nodes are in the cluster and
how busy each node is. This information is then used to load balance the data that is pushed to each node. The remote NCM is
then interrogated periodically to ensure that any nodes that are dropped from the cluster are no longer sent to, that any new nodes
are added to the list of nodes, and to recalculate the load balancing based on each node's load.
image:iconFunnel.png["Funnel", width=32]
*Funnel*: Funnels are used to combine the data from many Connections into a single Connection. This has two advantages.
First, if many Connections are created with the same destination, the canvas can become cluttered if those Connections
have to span a large space. By funneling these Connections into a single Connection, that single Connection can then be
drawn to span that large space instead. Secondly, Connections can be configured with FlowFile Prioritizers. Data from
several Connections can be funneled into a single Connection, providing the ability to Prioritize all of the data on that
one Connection, rather than prioritizing the data on each Connection independently.
image:iconTemplate.png["Template", width=32]
*Template*: Templates can be created by DataFlow Managers from sections of the flow, or they can be imported from other
dataflows. These Templates provide larger building blocks for creating a complex flow quickly. When the Template is
dragged onto the canvas, the DFM is provided a dialog to choose which Template to add to the canvas:
image::instantiate-template.png["Instantiate Template Dialog"]
Clicking the drop-down box shows all available Templates. Any Template that was created with a description will show an
icon indicating that there is more information. Hovering over the icon with the mouse will show this description:
image::instantiate-template-description.png["Instantiate Template Dialog"]
image:iconLabel.png["Label"]
*Label*: Labels are used to provide documentation to parts of a dataflow. When a Label is dropped onto the canvas,
it is created with a default size. The Label can then be resized by dragging the handle in the bottom-right corner.
The Label has no text when initially created. The text of the Label can be added by right-clicking on the Label and
choosing `Configure...`
=== Configuring a Processor
Once a Processor has been dragged onto the Canvas, it is ready to configure. This is done by right-clicking on the
Processor and clicking the `Configure...` option from the context menu. The configuration dialog is opened with four
different tabs, each of which is discussed below. Once you have finished configuring the Processor, you can apply
the changes by clicking the `Apply` button or cancel all changes by clicking the `Cancel` button.
Note that after a Processor has been started, the context menu shown for the Processor no longer has a `Configure...`
option but rather has a `View Configuration` option. Processor configuration cannot be changed while the Processor is
running. You must first stop the Processor and wait for all of its active tasks to complete before configuring
the Processor again.
==== Settings Tab
The first tab in the Processor Configuration dialog is the Settings tab:
image::settings-tab.png["Settings Tab"]
This tab contains several different configuration items. First, it allows the DFM to change the name of the Processor.
The name of a Processor by default is the same as the Processor type. Next to the Processor Name is a control for
determining whether or not the Processor is Enabled. When a Processor is added to the graph, it is enabled. If the
Processor is disabled, it cannot be started. This is used to indicate that even when a group of Processors are started,
such as when a DFM starts an entire Process Group, this Processor should be excluded.
Below the Name configuration, the Processor's unique identifier is displayed along with the Processor's type. These
values cannot be modified.
Next are two dialogues for configuring `Penalty duration' and `Yield duration'. During the normal course of processing a
piece of data (a FlowFile), an event may occur that indicates that the data cannot be processed at this time but the
data may be processable at a later time. When this occurs, the Processor may choose to Penalize the FlowFile. This will
prevent the FlowFile from being Processed for some period of time. For example, if the Processor is to push the data
to a remote service, but the remote service already has a file with the same name as the filename that the Processor
is specifying, the Processor may penalize the FlowFile. The `Penalty duration' allows the DFM to specify
how long the FlowFile should be penalized. The default value is 30 seconds.
Similarly, the Processor may determine that some situation exists such that the Processor can no longer make any progress,
regardless of the data that it is processing. For example, if a Processor is to push data to a remote service and that
service is not responding, the Processor cannot make any progress. As a result, the Processor should `yield,' which will
prevent the Processor from being scheduled to run for some period of time. That period of time is specified by setting
the `Yield duration.' The default value is 1 second.
The last configurable option on the left-hand side of the Settings tab is the Bulletin level. Whenever the Processor writes
to its log, the Processor also will generate a Bulletin. This setting indicates the lowest level of Bulletin that should be
shown in the User Interface. By default, the Bulletin level is set to WARN.
The right-hand side of the dialogue provides an `Auto-terminate relationships' section. Each of the Relationships that is
defined by the Processor is listed here, along with its description. In order for a Processor to be considered valid and
able to run, each Relationship defined by the Processor must be either connected to a downstream component or auto-terminated.
If a Relationship is auto-terminated, any FlowFile that is routed to that Relationship will be removed from the flow and
its processing considered complete. Any Relationship that is already connected to a downstream component cannot be auto-terminated.
The Relationship must first be removed from any Connection that uses it. Additionally, for any Relationship that is selected to be
auto-terminated, the auto-termination status will be cleared if the Relationship is added to a Connection.
==== Scheduling Tab
The second tab in the Processor Configuration dialog is the Scheduling Tab:
image::scheduling-tab.png["Scheduling Tab"]
The first configuration option is the Scheduling Strategy. There are three options for scheduling components:
- *Timer driven*: This is the default mode. The Processor will be scheduled to run on a regular interval. The interval
at which the Processor is run is defined by the `Run schedule' option (see below).
- *Event driven*: When this mode is selected, the Processor will be triggered to run by FlowFiles entering the Connections
that have this Processor as their destination. This mode is not supported by all Processors. When this mode is
selected, the `Run schedule' option is not configurable, as the Processor is not triggered to run periodically but
rather is triggered to run as the result of an event. Additionally, this is the only mode for which the `Concurrent tasks'
option can be set to 0. In this case, the number of threads is limited only by the size of the Event-Driven Thread Pool that
the administrator has configured.
- *CRON driven*: When using the CRON driven scheduling mode, the Processor is scheduled to run periodically, similarly to the
Timer driven scheduling mode. However, the CRON driven mode provides significantly more flexibility at the expense of
increasing the complexity of the configuration. This value is made up of six required fields and one optional field (Year),
each separated by a space. These fields are, in order:
+
. Seconds
. Minutes
. Hours
. Day of Month
. Month
. Day of Week
. Year
+
The value for each of these fields should be a number, range, or increment.
Range here refers to a syntax of <number>-<number>.
For example, the Seconds field could be set to 0-30, meaning that the Processor should only be scheduled if the time is 0 to 30 seconds
after the minute. Additionally, a value of `*` indicates that all values are valid for this field. Multiple values can also
be entered using a `,` as a separator: `0,5,10,15,30`.
An increment is written as <start value>/<increment>. For example, setting a value of `0/10` for the seconds field means that valid
values are 0, 10, 20, 30, 40, and 50. However, if we change this to `5/10`, valid values become 5, 15, 25, 35, 45, and 55.
+
For the Month field, valid values are 1 (January) through 12 (December).
+
For the Day of Week field, valid values are 1 (Sunday) through 7 (Saturday). Additionally, a value of `L` may be appended to one of these
values to indicate the last occurrence of this day in the month. For example, `1L` can be used to indicate the last Sunday of the month.
Next, the Scheduling Tab provides a configuration option named `Concurrent tasks.' This controls how many threads the Processor
will use. Said a different way, this controls how many FlowFiles should be processed by this Processor at the same time. Increasing
this value will typically allow the Processor to handle more data in the same amount of time. However, it does this by using system
resources that then are not usable by other Processors. This essentially provides a relative weighting of Processors -- it controls
how much of the system's resources should be allocated to this Processor instead of other Processors. This field is available for
most Processors. There are, however, some types of Processors that can only be scheduled with a single Concurrent task.
The ``Run schedule'' dictates how often this Processor should be scheduled to run. The valid values for this field depend on the selected
Scheduling Strategy (see above). If using the Event driven Scheduling Strategy, this field is not available. When using the Timer driven
Scheduling Strategy, this value is a time duration specified by a number followed by a time unit. For example, `1 second` or `5 mins`.
The default value of `0 sec` means that the Processor should run as often as possible as long as it has data to process. This is true
for any time duration of 0, regardless of the time unit (i.e., `0 sec`, `0 mins`, `0 days`). For an explanation of values that are
applicable for the CRON driven Scheduling Strategy, see the description of the CRON driven Scheduling Strategy itself.
The right-hand side of the tab contains a slider for choosing the `Run duration.' This controls how long the Processor should be scheduled
to run each time that it is triggered. On the left-hand side of the slider, it is marked `Lower latency' while the right-hand side
is marked `Higher throughput.' When a Processor finishes running, it must update the repository in order to transfer the FlowFiles to
the next Connection. Updating this repository is expensive, so the more work that can be done at once before updating the repository
the more work the Processor can handle (Higher throughput). However, this means that the next Processor cannot start processing
those FlowFiles until the previous Processor updates this repository. As a result, the latency will be longer (the time required to process
the FlowFile from beginning to end will be longer). As a result, the slider provides a spectrum from which the DFM can choose to favor
Lower Latency or Higher Throughput.
==== Properties Tab
The Properties Tab provides a mechanism to configure Processor-specific behavior. There are no default properties. Each type of Processor
must define which Properties make sense for its use case. Below, we see the Properties Tab for a RouteOnAttribute Processor:
image::properties-tab.png["Properties Tab"]
This Processor, by default, has only a single property: `Routing Strategy.' The default value is `Route on Property name.' Next to
the name of this property is a small question-mark symbol (
image:iconInfo.png["Question Mark"]
). This help symbol is seen in other places throughout the application, as well, and indicates that more information is available.
Hovering over this symbol with the mouse will provide additional details about the property and the default value, as well as
historical values that have been set for the Property.
Clicking on the value for the property will allow a DFM to change the value. Depending on the values that are allowed for the property,
the user is either provided a drop-down from which to choose a value or is given a text area to type a value:
image::edit-property-dropdown.png["Edit Property with Dropdown"]
In the top-right corner of the tab is a button for adding a New Property. Clicking this button will provide the DFM with a dialog to
enter the name and value of a new property. Not all Processors allow User-Defined properties. If a Processor does not allow them, it will become
invalid when the properties are applied. RouteOnAttribute, for example, does allow User-Defined properties. In fact, this Processor
will not be valid until the user has added a property.
image:edit-property-textarea.png["Edit Property with Text Area"]
Note that after a User-Defined property has been added, an icon will appear on the right-hand side of that row (
image:iconDelete.png["Delete Icon"]
). Clicking this button will remove the User-Defined property from the Processor.
==== Comments Tab
The last tab in the Processor configuration dialog is the Comments tab. This tab simply provides an area for users to provide
whatever comments are appropriate for this component:
image::comments-tab.png["Comments Tab"]
=== Additional Help
Each Processor has the ability to provide additional documentation about its usage. This documentation can be found by right-clicking
on the Processor and then selecting the `Usage' item from the context menu. Alternatively, clicking the `Help' link in the top-right
corner of the application will provide a Help page with all of the Processors that are available. Clicking on the Processor in the list
will then show its usage.
=== Connecting Components
After the appropriate Processors have been added to the graph and configured to meet your needs, they will have to be connected
to one another so that NiFi knows what to do with each FlowFile after it has been processed. This is accomplished by creating a
Connection between two components. When the mouse hovers over a component, a new Connection icon (
image:addConnect.png["Connection Bubble"]
) will appear in the middle of the component:
image:processor-connection-bubble.png["Processor with Connection Bubble"]
This Connection bubble can then be dragged from this component to another component, which will provide to the user a
`Create Connection' dialog. This dialog consists of two tabs: `Details' and `Settings'.
==== Details Tab
The Details Tab provides information about the source and destination components, including the component name, the
component type, and the Process Group in which the component lives:
image::create-connection.png["Create Connection"]
Additionally, this tab provides the ability to choose which Relationships should be included in this Connection. At least one
Relationship must be selected. If only one Relationship is available, it is automatically selected.
*Note*: If multiple Connections are added with the same Relationship, any FlowFile that is routed to that Relationship will
automatically be `cloned', and a copy will be sent to each of those Connections.
==== Settings
The Settings Tab provides the ability to configure the Connection's name, FlowFile expiration, back pressure thresholds, and
Prioritization:
image:connection-settings.png["Connection Settings"]
The Connection name is optional. If not specified, the name shown for the Connection will be the names of the Relationships
that are active for the Connection.
FlowFile expiration is a concept by which data that cannot be processed in a timely fashion can be automatically destroyed.
This is useful, for example, when the volume of data is expected to exceed the volume that can be sent to a remote site.
In this case, the expiration can be used in conjunction with Prioritizers to ensure that the highest priority data is
processed first and then anything that cannot be processed within one hour, for example, can be dropped. The default
value of `0 sec` indicates that the data will never expire.
NiFi provides two different configuration elements for back pressure. These thresholds indicate how much data should be
allowed to exist in the queue before the component that is the source of the Connection is no longer scheduled to run.
This allows the system to avoid being overrun with data. The first option provided is the ``Back pressure object threshold.''
This is the number of FlowFiles that can be in the queue before back pressure is applied. The second configuration option
is the ``Back pressure data size threshold.''
This specifies the maximum amount of data that should be queued up before
applying back pressure. This value is configured by entering a number followed by a data size (`B` for bytes, `KB` for
kilobytes, `MB` for megabytes, `GB` for gigabytes, or `TB` for terabytes).
The right-hand side of the tab provides the ability to prioritize the data in queue so that higher priority data is
processed first. Prioritizers can be dragged from the top (`Available prioritizers') to the bottom (`Selected prioritizers').
Multiple prioritizers can be selected. The prioritizer that is at the top of the `Selected prioritizers' list is the highest
priority. If two FlowFiles have the same value according to this prioritizer, the second prioritizer will determine which
FlowFile to process first, and so on. If a prioritizer is no longer desired, it can then be dragged from the `Selected
prioritizers' list to the `Available prioritizers' list.
=== Processor Validation
Before trying to start a Processor, it's important to make sure that the Processor's configuration is valid.
A status indicator is shown in the top-left of the Processor. If the Processor is invalid, the indicator
will show a yellow Warning indicator with an exclamation mark indicating that there is a problem:
image::invalid-processor.png["Invalid Processor"]
In this case, hovering over the indicator icon with the mouse will provide a tooltip showing all of the validation
failures for the Processor. Once all of the validation errors have been addressed, the status indicator will change
to a Stop icon, indicating that the Processor is valid and ready to be started but currently is not running:
image::valid-processor.png["Valid Processor"]
== Command and Control of DataFlow
When a component is added to the NiFi canvas, it is in the Stopped state. In order to cause the component to
be triggered, the component must be started. Once started, the component can be stopped at any time. From a
Stopped state, the component can then be configured, started, or disabled.
=== Starting a Component
In order to start a component, the following conditions must be met:
- The component's configuration must be valid.
- All defined Relationships for the component must be connected to another component or auto-terminated.
- The component must be stopped.
- The component must be enabled.
- The component must have no active tasks. For more information about active tasks, see the ``Anatomy of ...''
sections under <<monitoring>> (<<processor_anatomy>>, <<process_group_anatomy>>, <<remote_group_anatomy>>).
Components can be started by selecting all of the components to start and then clicking the Start icon (
image:iconRun.png["Start"]
) in the
Actions Toolbar or by right-clicking a single component and choosing Start from the context menu.
If starting a Process Group, all components within that Process Group (including child Process Groups) will
be started, with the exception of those components that are invalid or disabled.
Once started, the status indicator of a Processor will change to a Play symbol (
image:iconRun.png["Run"]
).
=== Stopping a Component
A component can be stopped any time that it is running. A component is stopped by right-clicking on the component
and clicking Stop from the context menu, or by clicking the Stop icon (
image:iconStop.png["Stop"]
) in the Actions Toolbar.
If a Process Group is stopped, all of the components within the Process Group (including child Process Groups)
will be stopped.
Once stopped, the status indicator of a Processor will change to the Stop symbol (
image:iconStop.png["Stop"]
).
Stopping a component does not interrupt its currently running tasks. Rather, it stops scheduling new tasks to
be performed. The number of active tasks is shown in the top-right corner of the Processor (see <<processor_anatomy>>
for more information).
=== Enabling/Disabling a Component
When a component is enabled, it is able to be started. Components may be disabled when part of a
dataflow is still being assembled, for example, and as a result should not be started. Typically,
if a component is not intended to be run, the component is disabled, rather than being left in the
Stopped state. This helps to distinguish between components that are intentionally not running and
those components that may have been stopped temporarily (for instance, to change the component's
configuration) and inadvertently were never restarted.
When it is desirable to re-enable a component, it can be enabled by selecting the component and
clicking the Enable icon (
image:iconEnable.png["Enable"]
) in the Actions Toolbar. This is available only when the selected component or components are disabled.
Alternatively, a component can be enabled by checking the checkbox next to the ``Enabled'' option in
the Settings tab of the Processor configuration dialog or the configuration dialog for a Port.
Once enabled, the component's status indicator will change to either Invalid (
image:iconAlert.png["Invalid"]
) or Stopped (
image:iconStop.png["Stopped"]
), depending on whether or not the component is valid.
A component is then disabled by selecting the component and clicking the Disable icon (
image:iconDisable.png["Disable"]
) in the Actions Toolbar, or by clearing the checkbox next to the ``Enabled'' option in the Settings tab
of the Processor configuration dialog or the configuration dialog for a Port.
Only Ports and Processors can be enabled and disabled.
=== Remote Process Group Transmission
Remote Process Groups provide a mechanism for sending data to or retrieving data from a remote instance
of NiFi. When a Remote Process Group (RPG) is added to the canvas, it is added with Transmission Disabled,
as indicated by the icon (
image:iconTransmissionInactive.png["Transmission Disabled"]
) in the top-left corner. When Transmission is Disabled, it can be enabled by right-clicking on the
RPG and clicking the ``Enable Transmission'' menu item. This will cause all ports for which there is a Connection
to begin transmitting data. This will cause the status indicator to then change to the Transmission Enabled icon (
image:iconTransmissionActive.png["Transmission Enabled"]
).
If there are problems communicating with the Remote Process Group, a Warning indicator (
image:iconAlert.png["Warning"]
) may instead be present in the top-left corner. Hovering over this Warning indicator with the mouse will provide
more information about the problem.
==== Individual Port Transmission
There are times when the DFM may want to either enable or disable transmission for only a specific
Port within the Remote Process Group. This can be accomplished by right-clicking on the Remote Process Group
and choosing the ``Remote ports'' menu item. This provides a configuration dialog from which each Port can be
configured:
image::remote-group-ports-dialog.png["Remote Process Groups"]
The left-hand side lists all of the Input Ports that the remote instance of NiFi allows data to be sent to.
The right-hand side lists all of the Output Ports from which this instance is able to pull data.
If the remote instance is using secure communications (the URL of the NiFi instance begins with `https://`,
rather than `http://`), any Ports that the remote instance has not made available to this instance will not
be shown.
*Note*: If a Port that is expected to be shown is not shown in this dialog, ensure that the instance has proper
permissions and that the Remote Process Group's flow is current. This can be checked by closing the Port
Configuration Dialog and looking at the bottom-right corner of the Remote Process Group. The date at which
the flow was last refreshed is shown. If the flow appears to be outdated, it can be updated by right-clicking
on the Remote Process Group and selecting ``Refresh flow.'' (See <<remote_group_anatomy>> for more information).
Each Port is shown with the Port name, followed by its description, currently configured number of Concurrent
tasks, and whether or not data sent to this port will be compressed. To the left of this information is a switch
to turn the Port on or off. Those Ports that have no Connections attached to them are grayed out:
image::remote-port-connection-status.png["Remote Port Statuses"]
The on/off switch provides a mechanism to enable and disable transmission for each Port in the Remote
Process Group independently. Those Ports that are connected but are not currently transmitting can be
configured by clicking the pencil icon (
image:iconEdit.png["Edit"]
) below the on/off switch. Clicking this icon will allow the DFM to change the number of Concurrent tasks and whether
or not compression should be used when transmitting data to or from this Port.
[[monitoring]]
== Monitoring of DataFlow
NiFi provides a great deal of information about the status of the DataFlow in order to monitor the
health and status. The Status bar provides information about the overall system health
(See <<status_bar>> above for more information). Processors, Process Groups, and Remote Process Groups
provide fine-grained details about their operations. Connections and Process Groups provide information
about the amount of data in their queues. The Summary Page provides information about all of the components
on the graph in a tabular format and also provides System Diagnostics information that includes disk usage,
CPU utilization, and Java Heap and Garbage Collection information. In a clustered environment, this
information is available per-node or as aggregates across the entire cluster. We will explore each of these
monitoring artifacts below.
[[processor_anatomy]]
=== Anatomy of a Processor
NiFi provides a significant amount of information about each Processor on the canvas. The following diagram
is the anatomy of a Processor:
image:processor-anatomy.png["Anatomy of a Processor"]
The image outlines the following elements:
- *Processor Type*: NiFi provides several different types of Processors in order to allow for a wide range
of tasks to be performed. Each type of Processor is designed to perform one specific task. The Processor
type (PutFile, in this example) describes the task that this Processor performs. In this case, the
Processor writes a FlowFile to disk - or ``Puts'' a FlowFile to a File.
- *Bulletin Indicator*: When a Processor logs that some event has occurred, it generates a Bulletin to notify
those who are monitoring NiFi via the User Interface. The DataFlow Manager is able to configure which
bulletins should be displayed in the User Interface by updating the ``Bulletin level'' field in the
``Settings'' tab of the Processor configuration dialog. The default value is `WARN`, which means that only
warnings and errors will be displayed in the UI. This icon is not present unless a Bulletin exists for this
Processor. When it is present, hovering over the icon with the mouse will provide a tooltip explaining the
message provided by the Processor as well as the Bulletin level. If the instance of NiFi is clustered,
it will also show the Node that emitted the Bulletin. Bulletins automatically expire after five minutes.
- *Status Indicator*: Shows the current Status of the Processor. The following indicators are possible:
** image:iconRun.png["Running"]
*Running*: The Processor is currently running.
** image:iconStop.png["Stopped"]
*Stopped*: The Processor is valid and enabled but is not running.
** image:iconAlert.png["Invalid"]
*Invalid*: The Processor is enabled but is not currently valid and cannot be started.
Hovering over this icon will provide a tooltip indicating why the Processor is not valid.
** image:iconDisable.png["Disabled"]
*Disabled*: The Processor is not running and cannot be started until it has been enabled.
This status does not indicate whether or not the Processor is valid.
- *Processor Name*: This is the user-defined name of the Processor. By default, the name of the Processor is
the same as the Processor Type. In the example, this value is "Copy to /review".
- *Active Tasks*: The number of tasks that this Processor is currently executing. This number is constrained
by the ``Concurrent tasks'' setting in the ``Scheduling'' tab of the Processor configuration dialog.
Here, we can see that the Processor is currently performing two tasks. If the NiFi instance is clustered,
this value represents the number of tasks that are currently executing across all nodes in the cluster.
- *5-Minute Statistics*: The Processor shows several different statistics in tabular form. Each of these
statistics represents the amount of work that has been performed in the past five minutes. If the NiFi
instance is clustered, these values indicate how much work has been done by all of the Nodes combined
in the past five minutes. These metrics are:
** *In*: The amount of data that the Processor has pulled from the queues of its incoming Connections.
This value is represented as <count> / <size> where <count> is the number of FlowFiles that have been
pulled from the queues and <size> is the total size of those FlowFiles' content. In this example,
the Processor has pulled 884 FlowFiles from the input queues, for a total of 8.85 megabytes (MB).
** *Read/Write*: The total size of the FlowFile content that the Processor has read from disk and written
to disk. This provides valuable information about the I/O performance that this Processor requires.
Some Processors may only read the data without writing anything while some will not read the data but
will only write data. Others will neither read nor write data, and some Processors will both read
and write data. In this example, we see that in the past five minutes, this Processor has read 4.7
MB of the FlowFile content and has written 4.7 MB as well. This is what we would expect,
since this Processor simply copies the contents of a FlowFile to disk. Note, however, that this is
not the same as the amount of data that it pulled from its input queues. This is because some of
the files that it pulled from the input queues already existed in the output directory, and the
Processor is configured to route FlowFiles to failure when this occurs. Therefore, for those files
which already existed in the output directory, no data was read from or written to disk.
** *Out*: The amount of data that the Processor has transferred to its outbound Connections. This does
not include FlowFiles that the Processor removes itself, or FlowFiles that are routed to connections
that are auto-terminated. Like the ``In'' metric above, this value is represented as <count> / <size>
where <count> is the number of FlowFiles that have been transferred to outbound Connections and <size>
is the total size of those FlowFiles' content. In this example, all of the Relationships are configured to be
auto-terminated, so no FlowFiles are reported as having been transferred Out.
** *Tasks/Time*: The number of times that this Processor has been triggered to run in the past 5 minutes, and
the amount of time taken to perform those tasks. The format of the time is <hour>:<minute>:<second>. Note
that the amount of time taken can exceed five minutes, because many tasks can be executed in parallel. For
instance, if the Processor is scheduled to run with 60 Concurrent tasks, and each of those tasks takes one
second to complete, it is possible that all 60 tasks will be completed in a single second. However, in this
case we will see the Time metric showing that it took 60 seconds, instead of 1 second. This time can be
thought of as ``System Time,'' or said another way, this value is 60 seconds because that's the amount of
time it would have taken to perform the action if only a single concurrent task were used.
[[process_group_anatomy]]
=== Anatomy of a Process Group
[[remote_group_anatomy]]
=== Anatomy of a Remote Process Group
=== Summary Page
=== Historical Statistics of a Component
== Templates
=== Creating a Template
=== Instantiating a Template
=== Managing Templates
==== Importing a Template
==== Exporting a Template
==== Removing a Template
== Data Provenance
=== Searching for Events
=== Details of an Event
=== Viewing FlowFile Content
=== Replaying a FlowFile
=== Viewing FlowFile Lineage
==== Find Parents
==== Expanding an Event

View File

@ -12,8 +12,7 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
--> --><project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>org.apache</groupId> <groupId>org.apache</groupId>
@ -69,7 +68,8 @@
<module>nifi-mock</module> <module>nifi-mock</module>
<module>nar-bundles</module> <module>nar-bundles</module>
<module>assembly</module> <module>assembly</module>
</modules> <module>nifi-docs</module>
</modules>
<scm> <scm>
<connection>scm:git:git://git.apache.org/incubator-nifi.git</connection> <connection>scm:git:git://git.apache.org/incubator-nifi.git</connection>
<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/incubator-nifi.git</developerConnection> <developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/incubator-nifi.git</developerConnection>