HTML Parsing Processors Bundle

NIFI-1156 HTML Parsing Processors Bundle
This commit is contained in:
Jeremy Dyer 2015-11-13 15:01:10 -05:00
parent 56ad22aea6
commit c82fc18f8e
15 changed files with 1604 additions and 2 deletions

View File

@ -162,6 +162,11 @@ language governing permissions and limitations under the License. -->
<artifactId>nifi-http-context-map-nar</artifactId>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-html-nar</artifactId>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-kite-nar</artifactId>

View File

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits build configuration from the nifi-html-bundle parent POM. -->
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-html-bundle</artifactId>
<version>0.4.0-SNAPSHOT</version>
</parent>
<!-- This module produces the NAR archive for the HTML bundle (packaging "nar"). -->
<artifactId>nifi-html-nar</artifactId>
<packaging>nar</packaging>
<dependencies>
<!-- NAR-typed dependency; version is not declared here, so it is presumably managed by a parent POM (confirm). -->
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-standard-services-api-nar</artifactId>
<type>nar</type>
</dependency>
<!-- The processors module whose classes are bundled into this NAR. -->
<!-- NOTE(review): the version literal duplicates the parent version above; keep the two in sync on release. -->
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-html-processors</artifactId>
<version>0.4.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,59 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits build configuration from the nifi-html-bundle parent POM. -->
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-html-bundle</artifactId>
<version>0.4.0-SNAPSHOT</version>
</parent>
<!-- Module containing the HTML parsing processors. -->
<artifactId>nifi-html-processors</artifactId>
<description>Support for parsing HTML documents</description>
<dependencies>
<!-- jsoup is the HTML parser / CSS selector engine; its version is pinned explicitly here. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- NiFi artifacts below declare no version; presumably resolved through parent dependencyManagement (confirm). -->
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-api</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-processor-utils</artifactId>
</dependency>
<!-- NOTE(review): the processors import org.apache.commons.lang3.StringUtils, but commons-lang3 is not -->
<!-- declared in this POM, so it is only available transitively. Consider declaring it explicitly. -->
<!-- Test-scoped dependencies. -->
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<scope>test</scope>
</dependency>
<!-- NOTE(review): junit is the only test dependency with an explicit version; confirm this is intentional. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicReference;
/**
 * Shared base for the HTML processors in this bundle.
 *
 * <p>Declares the property descriptors and relationships that the concrete
 * processors register, and provides a helper that parses the content of a
 * FlowFile into a Jsoup {@link Document}.</p>
 */
public abstract class AbstractHTMLProcessor extends AbstractProcessor {

    /** Allowable "output type" values used by the concrete processors. */
    protected static final String ELEMENT_HTML = "HTML";
    protected static final String ELEMENT_TEXT = "Text";
    protected static final String ELEMENT_DATA = "Data";
    protected static final String ELEMENT_ATTRIBUTE = "Attribute";

    public static final PropertyDescriptor URL = new PropertyDescriptor
            .Builder().name("URL")
            .description("Base URL for the HTML page being parsed.")
            .required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
            .Builder().name("CSS Selector")
            .description("CSS selector syntax string used to extract the desired HTML element(s).")
            .required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    // Validated as a real character set so that a typo is rejected when the
    // processor is configured instead of failing later while parsing content.
    public static final PropertyDescriptor HTML_CHARSET = new PropertyDescriptor
            .Builder().name("HTML character encoding")
            .description("Character encoding of the input HTML")
            .defaultValue("UTF-8")
            .required(true)
            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
            .build();

    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
            .name("original")
            .description("The original HTML input")
            .build();

    public static final Relationship REL_SUCCESS = new Relationship.Builder()
            .name("success")
            .description("Successfully parsed HTML element")
            .build();

    public static final Relationship REL_FAILURE = new Relationship.Builder()
            .name("failure")
            .description("Failed to parse HTML content")
            .build();

    public static final Relationship REL_INVALID_HTML = new Relationship.Builder()
            .name("invalid html")
            .description("The input HTML syntax is invalid")
            .build();

    // The relationship name is referenced explicitly rather than relying on
    // Relationship.toString() when building the description text.
    public static final Relationship REL_NOT_FOUND = new Relationship.Builder()
            .name("element not found")
            .description("Element could not be found in the HTML document. The original HTML input will remain " +
                    "in the flowfile content unchanged. Relationship '" + REL_ORIGINAL.getName() + "' will not be invoked " +
                    "in this scenario.")
            .build();

    /**
     * Parses the Jsoup HTML document from the FlowFile input content.
     *
     * <p>The content is decoded using the {@link #HTML_CHARSET} property and
     * relative links are resolved against the {@link #URL} property.</p>
     *
     * @param inputFlowFile
     *      Input FlowFile containing the HTML
     *
     * @param context
     *      ProcessContext used to read the charset and base URL properties
     *
     * @param session
     *      ProcessSession used to read the FlowFile content
     *
     * @return
     *      Jsoup Document
     */
    protected Document parseHTMLDocumentFromFlowfile(FlowFile inputFlowFile,
                                                     final ProcessContext context,
                                                     final ProcessSession session) {
        final String charsetName = context.getProperty(HTML_CHARSET).getValue();
        final String baseUrl = context.getProperty(URL).getValue();

        // The callback cannot return a value, so the parsed document is
        // handed back through a holder.
        final AtomicReference<Document> doc = new AtomicReference<>();
        session.read(inputFlowFile, new InputStreamCallback() {
            @Override
            public void process(InputStream inputStream) throws IOException {
                doc.set(Jsoup.parse(inputStream, charsetName, baseUrl));
            }
        });
        return doc.get();
    }
}

View File

@ -0,0 +1,243 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.StreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
/**
 * Extracts every element matching a CSS selector from the incoming HTML and
 * emits one new FlowFile per match, carrying the extracted value either as
 * the {@code HTMLElement} attribute or as the FlowFile content.
 */
@Tags({"get", "html", "dom", "css", "element"})
@CapabilityDescription("Parses HTML input using CSS selector syntax and creates a new flowfile containing the extracted" +
        " element content for each matching CSS selector.")
@SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class})
@WritesAttributes({@WritesAttribute(attribute="HTMLElement", description="Flowfile attribute where the element result" +
        " parsed from the HTML using the CSS selector syntax are placed if the destination is a flowfile attribute.")})
public class GetHTMLElement
        extends AbstractHTMLProcessor {

    public static final String HTML_ELEMENT_ATTRIBUTE_NAME = "HTMLElement";
    public static final String DESTINATION_ATTRIBUTE = "flowfile-attribute";
    public static final String DESTINATION_CONTENT = "flowfile-content";

    public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new PropertyDescriptor
            .Builder().name("Prepend Element value")
            .description("Prepends the specified value to the resulting Element")
            .required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new PropertyDescriptor
            .Builder().name("Append Element value")
            .description("Appends the specified value to the resulting Element")
            .required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor
            .Builder().name("Attribute Name")
            .description(("When getting the value of an element attribute this value is used as the key to determine" +
                    " which attribute on the selected element should be retrieved."))
            .required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder()
            .name("Output Type")
            .description("Controls the type of value that is retrieved from the element. " +
                    ELEMENT_HTML + "," + ELEMENT_TEXT + ", " + ELEMENT_ATTRIBUTE + " or " + ELEMENT_DATA)
            .required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE, ELEMENT_DATA)
            .defaultValue(ELEMENT_HTML)
            .build();

    public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder()
            .name("Destination")
            .description("Control if element extracted is written as a flowfile attribute or " +
                    "as flowfile content.")
            .required(true)
            .allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
            .defaultValue(DESTINATION_ATTRIBUTE)
            .build();

    private List<PropertyDescriptor> descriptors;

    private Set<Relationship> relationships;

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> descriptors = new ArrayList<>();
        descriptors.add(URL);
        descriptors.add(CSS_SELECTOR);
        descriptors.add(HTML_CHARSET);
        descriptors.add(OUTPUT_TYPE);
        descriptors.add(DESTINATION);
        descriptors.add(PREPEND_ELEMENT_VALUE);
        descriptors.add(APPEND_ELEMENT_VALUE);
        descriptors.add(ATTRIBUTE_KEY);
        this.descriptors = Collections.unmodifiableList(descriptors);

        final Set<Relationship> relationships = new HashSet<>();
        relationships.add(REL_ORIGINAL);
        relationships.add(REL_SUCCESS);
        relationships.add(REL_FAILURE);
        relationships.add(REL_NOT_FOUND);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return this.relationships;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return descriptors;
    }

    /**
     * Parses the incoming FlowFile as HTML, selects the configured elements
     * and emits one child FlowFile per selected element.
     *
     * <ul>
     *   <li>No match: the input is routed to {@code element not found}.</li>
     *   <li>One or more matches: each child goes to {@code success} and the
     *       input goes to {@code original}.</li>
     *   <li>Any exception: the input is routed to {@code failure}.</li>
     * </ul>
     */
    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        try {
            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);

            // All expression-language properties are evaluated against the
            // incoming FlowFile so that its attributes can be referenced.
            final String selector = context.getProperty(CSS_SELECTOR)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final Elements eles = doc.select(selector);

            if (eles == null || eles.isEmpty()) {
                //No element found
                session.transfer(flowFile, REL_NOT_FOUND);
                return;
            }

            final String prependValue = context.getProperty(PREPEND_ELEMENT_VALUE)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final String appendValue = context.getProperty(APPEND_ELEMENT_VALUE)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final String attributeKey = context.getProperty(ATTRIBUTE_KEY)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
            final String destination = context.getProperty(DESTINATION).getValue();
            final String charsetName = context.getProperty(HTML_CHARSET).getValue();

            // Extract every value before creating any FlowFile: if extraction
            // fails (for example a missing attribute name) nothing has been
            // created or transferred yet and the input can go to failure cleanly.
            final List<String> extractedValues = new ArrayList<>(eles.size());
            for (final Element ele : eles) {
                extractedValues.add(extractElementValue(prependValue, outputType, appendValue, ele, attributeKey));
            }

            for (final String extractedValue : extractedValues) {
                // Creating the child from the input keeps lineage between the
                // original HTML and each extracted element.
                FlowFile child = session.create(flowFile);
                switch (destination) {
                    case DESTINATION_ATTRIBUTE:
                        child = session.putAttribute(child, HTML_ELEMENT_ATTRIBUTE_NAME, extractedValue);
                        break;
                    case DESTINATION_CONTENT:
                        child = session.write(child, new StreamCallback() {
                            @Override
                            public void process(InputStream inputStream, OutputStream outputStream) throws IOException {
                                // Encode with the configured HTML charset rather
                                // than the JVM's platform default.
                                outputStream.write(extractedValue.getBytes(charsetName));
                            }
                        });
                        break;
                }
                session.transfer(child, REL_SUCCESS);
            }

            //Transfer the original HTML
            session.transfer(flowFile, REL_ORIGINAL);
        } catch (Exception ex) {
            getLogger().error("Failed to extract HTML element(s) from {}; routing to '{}'",
                    new Object[]{flowFile, REL_FAILURE.getName()}, ex);
            session.transfer(flowFile, REL_FAILURE);
        }
    }

    /**
     * Extracts the HTML value based on the configuration values.
     *
     * @param prependValue
     *      text placed before the extracted value; null or empty means none
     * @param outputType
     *      one of the element output type constants; unknown values fall back to HTML
     * @param appendValue
     *      text placed after the extracted value; null or empty means none
     * @param ele
     *      element the value is read from
     * @param attrKey
     *      attribute name, only used when the output type is Attribute
     *
     * @return
     *      value from the parsed HTML element
     */
    private String extractElementValue(String prependValue, String outputType, String appendValue, Element ele,
                                       String attrKey) {
        if (StringUtils.isEmpty(prependValue)) {
            prependValue = "";
        }
        if (StringUtils.isEmpty(appendValue)) {
            appendValue = "";
        }

        final String value;
        switch (outputType) {
            case ELEMENT_TEXT:
                value = ele.text();
                break;
            case ELEMENT_DATA:
                value = ele.data();
                break;
            case ELEMENT_ATTRIBUTE:
                value = ele.attr(attrKey);
                break;
            case ELEMENT_HTML:
            default:
                value = ele.html();
                break;
        }
        return prependValue + value + appendValue;
    }
}

View File

@ -0,0 +1,164 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.StreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
/**
 * Rewrites the HTML, text or a named attribute of every element matching a
 * CSS selector and emits the modified document as a new FlowFile.
 */
@Tags({"modify", "html", "dom", "css", "element"})
@CapabilityDescription("Modifies the value of an existing HTML element in the original input HTML")
@SeeAlso({GetHTMLElement.class, PutHTMLElement.class})
@WritesAttributes({@WritesAttribute(attribute="NumElementsModified", description="Total number of HTML " +
        "element modifications made")})
public class ModifyHTMLElement extends AbstractHTMLProcessor {

    public static final String NUM_ELEMENTS_MODIFIED_ATTR = "NumElementsModified";

    // The description lists the values that are actually allowed
    // (HTML, Text, Attribute).
    public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder()
            .name("Output Type")
            .description("Controls whether the HTML element is output as " +
                    ELEMENT_HTML + ", " + ELEMENT_TEXT + " or " + ELEMENT_ATTRIBUTE)
            .required(true)
            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE)
            .defaultValue(ELEMENT_HTML)
            .build();

    public static final PropertyDescriptor MODIFIED_VALUE = new PropertyDescriptor
            .Builder().name("Modified Value")
            .description("Value to update the found HTML elements with")
            .required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor
            .Builder().name("Attribute Name")
            .description(("When modifying the value of an element attribute this value is used as the key to determine" +
                    " which attribute on the selected element will be modified with the new value."))
            .required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    private List<PropertyDescriptor> descriptors;

    private Set<Relationship> relationships;

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> descriptors = new ArrayList<>();
        descriptors.add(URL);
        descriptors.add(CSS_SELECTOR);
        descriptors.add(HTML_CHARSET);
        descriptors.add(OUTPUT_TYPE);
        descriptors.add(MODIFIED_VALUE);
        descriptors.add(ATTRIBUTE_KEY);
        this.descriptors = Collections.unmodifiableList(descriptors);

        final Set<Relationship> relationships = new HashSet<Relationship>();
        relationships.add(REL_ORIGINAL);
        relationships.add(REL_SUCCESS);
        relationships.add(REL_FAILURE);
        relationships.add(REL_INVALID_HTML);
        relationships.add(REL_NOT_FOUND);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return this.relationships;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return descriptors;
    }

    /**
     * Parses the incoming FlowFile as HTML, applies the configured
     * modification to every selected element and writes the resulting
     * document to a new child FlowFile.
     *
     * <ul>
     *   <li>No match: the input is routed to {@code element not found}.</li>
     *   <li>One or more matches: the modified document goes to {@code success}
     *       (with the {@code NumElementsModified} attribute) and the input
     *       goes to {@code original}.</li>
     *   <li>Any exception: the input is routed to {@code failure}.</li>
     * </ul>
     */
    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        try {
            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);

            // Expression-language properties are evaluated against the
            // incoming FlowFile so that its attributes can be referenced.
            final String selector = context.getProperty(CSS_SELECTOR)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final Elements eles = doc.select(selector);

            if (eles == null || eles.isEmpty()) {
                //No element found
                session.transfer(flowFile, REL_NOT_FOUND);
                return;
            }

            // These values do not change per element, so resolve them once.
            final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
            final String modifiedValue = context.getProperty(MODIFIED_VALUE)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final String attributeKey = context.getProperty(ATTRIBUTE_KEY)
                    .evaluateAttributeExpressions(flowFile).getValue();

            for (Element ele : eles) {
                switch (outputType) {
                    case ELEMENT_HTML:
                        ele.html(modifiedValue);
                        break;
                    case ELEMENT_ATTRIBUTE:
                        ele.attr(attributeKey, modifiedValue);
                        break;
                    case ELEMENT_TEXT:
                        ele.text(modifiedValue);
                        break;
                }
            }

            // Serialize before creating the child so a failure here leaves
            // no half-built FlowFile behind. The configured HTML charset is
            // used instead of the JVM's platform default.
            final byte[] modifiedHtml = doc.html().getBytes(context.getProperty(HTML_CHARSET).getValue());

            FlowFile ff = session.write(session.create(flowFile), new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    out.write(modifiedHtml);
                }
            });
            ff = session.putAttribute(ff, NUM_ELEMENTS_MODIFIED_ATTR, Integer.toString(eles.size()));
            session.transfer(ff, REL_SUCCESS);

            //Transfer the original HTML
            session.transfer(flowFile, REL_ORIGINAL);
        } catch (Exception ex) {
            getLogger().error("Failed to modify HTML element(s) in {}; routing to '{}'",
                    new Object[]{flowFile, REL_FAILURE.getName()}, ex);
            session.transfer(flowFile, REL_FAILURE);
        }
    }
}

View File

@ -0,0 +1,150 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.StreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
/**
 * Appends or prepends an HTML fragment to the children of every element
 * matching a CSS selector and emits the modified document as a new FlowFile.
 */
@Tags({"put", "html", "dom", "css", "element"})
@CapabilityDescription("Creates a new HTML element in the input HTML")
@SeeAlso({GetHTMLElement.class, ModifyHTMLElement.class})
public class PutHTMLElement extends AbstractHTMLProcessor {

    public static final String APPEND_ELEMENT = "append-html";
    public static final String PREPEND_ELEMENT = "prepend-html";

    public static final PropertyDescriptor PUT_LOCATION_TYPE = new PropertyDescriptor.Builder()
            .name("Element Insert Location Type")
            .description("Controls whether the new element is prepended or appended to the children of the " +
                    "Element located by the CSS selector. EX: prepended value '<b>Hi</b>' inside of " +
                    "Element (using CSS Selector 'p') '<p>There</p>' would result in " +
                    "'<p><b>Hi</b>There</p>'. Appending the value would result in '<p>There<b>Hi</b></p>'")
            .required(true)
            .allowableValues(APPEND_ELEMENT, PREPEND_ELEMENT)
            .defaultValue(APPEND_ELEMENT)
            .build();

    public static final PropertyDescriptor PUT_VALUE = new PropertyDescriptor.Builder()
            .name("Put Value")
            .description("Value used when creating the new Element. Value should be a valid HTML element. " +
                    "The text should be supplied unencoded: characters like '<', '>', etc will be properly HTML " +
                    "encoded in the output.")
            .required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    private List<PropertyDescriptor> descriptors;

    private Set<Relationship> relationships;

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
        descriptors.add(URL);
        descriptors.add(CSS_SELECTOR);
        descriptors.add(HTML_CHARSET);
        descriptors.add(PUT_LOCATION_TYPE);
        descriptors.add(PUT_VALUE);
        this.descriptors = Collections.unmodifiableList(descriptors);

        final Set<Relationship> relationships = new HashSet<Relationship>();
        relationships.add(REL_ORIGINAL);
        relationships.add(REL_SUCCESS);
        relationships.add(REL_FAILURE);
        relationships.add(REL_INVALID_HTML);
        // onTrigger routes to this relationship when the selector matches
        // nothing, so it has to be registered with the processor.
        relationships.add(REL_NOT_FOUND);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return this.relationships;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return descriptors;
    }

    /**
     * Parses the incoming FlowFile as HTML, inserts the configured value into
     * every selected element and writes the resulting document to a new
     * child FlowFile.
     *
     * <ul>
     *   <li>No match: the input is routed to {@code element not found}.</li>
     *   <li>One or more matches: the modified document goes to {@code success}
     *       and the input goes to {@code original}.</li>
     *   <li>Any exception: the input is routed to {@code failure}.</li>
     * </ul>
     */
    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        try {
            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);

            // Expression-language properties are evaluated against the
            // incoming FlowFile so that its attributes can be referenced.
            final String selector = context.getProperty(CSS_SELECTOR)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final Elements eles = doc.select(selector);

            if (eles == null || eles.isEmpty()) {
                //No element found
                session.transfer(flowFile, REL_NOT_FOUND);
                return;
            }

            // These values do not change per element, so resolve them once.
            final String putValue = context.getProperty(PUT_VALUE)
                    .evaluateAttributeExpressions(flowFile).getValue();
            final String locationType = context.getProperty(PUT_LOCATION_TYPE).getValue();

            for (Element ele : eles) {
                switch (locationType) {
                    case APPEND_ELEMENT:
                        ele.append(putValue);
                        break;
                    case PREPEND_ELEMENT:
                        ele.prepend(putValue);
                        break;
                }
            }

            // Serialize before creating the child so a failure here leaves
            // no half-built FlowFile behind. The configured HTML charset is
            // used instead of the JVM's platform default.
            final byte[] updatedHtml = doc.html().getBytes(context.getProperty(HTML_CHARSET).getValue());

            FlowFile ff = session.write(session.create(flowFile), new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    out.write(updatedHtml);
                }
            });
            session.transfer(ff, REL_SUCCESS);

            //Transfer the original HTML
            session.transfer(flowFile, REL_ORIGINAL);
        } catch (Exception ex) {
            getLogger().error("Failed to put HTML element into {}; routing to '{}'",
                    new Object[]{flowFile, REL_FAILURE.getName()}, ex);
            session.transfer(flowFile, REL_FAILURE);
        }
    }
}

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.GetHTMLElement
org.apache.nifi.ModifyHTMLElement
org.apache.nifi.PutHTMLElement

View File

@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.StreamCallback;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
 * Common fixture for the HTML processor tests: a sample HTML page, the
 * values embedded in it, and a helper for turning raw bytes into a FlowFile.
 */
public class AbstractHTMLTest {

    // Link text and targets of the two anchors in the sample page.
    protected final String ATL_WEATHER_TEXT = "Atlanta Weather";
    protected final String GDR_WEATHER_TEXT = "<i>Grand Rapids Weather</i>";
    protected final String ATL_WEATHER_LINK = "http://w1.weather.gov/obhistory/KPDK.html";
    protected final String GR_WEATHER_LINK = "http://w1.weather.gov/obhistory/KGRR.html";

    // Content of the "author" meta tag.
    protected final String AUTHOR_NAME = "Jeremy Dyer";

    // Element ids of the two anchors.
    protected final String ATL_ID = "ATL";
    protected final String GDR_ID = "GDR";

    // Sample document assembled from the values above.
    protected final String HTML = "<!doctype html>\n"
            + "\n"
            + "<html lang=\"en\">\n"
            + "<head>\n"
            + " <meta charset=\"utf-8\">\n"
            + "\n"
            + " <title>NiFi HTML Parsing Demo</title>\n"
            + " <meta name=\"description\" content=\"NiFi HTML Parsing Demo\">\n"
            + " <meta name=\"author\" content=\"" + AUTHOR_NAME + "\">\n"
            + "\n"
            + " <link rel=\"stylesheet\" href=\"css/styles.css?v=1.0\">\n"
            + "\n"
            + " <!--[if lt IE 9]>\n"
            + " <script src=\"http://html5shiv.googlecode.com/svn/trunk/html5.js\"></script>\n"
            + " <![endif]-->\n"
            + "</head>\n"
            + "\n"
            + "<body>\n"
            + " <script src=\"js/scripts.js\"></script>\n"
            + " <p>Check out this weather! <a id=\"" + ATL_ID + "\" href=\"" + ATL_WEATHER_LINK + "\">"
            + ATL_WEATHER_TEXT + "</a></p>\n"
            + " <p>I guess it could be colder ... <a id=\"" + GDR_ID + "\" href=\"" + GR_WEATHER_LINK + "\">"
            + GDR_WEATHER_TEXT + "</a></p>\n"
            + " <div id=\"put\"><a href=\"httpd://localhost\" /></div>\n"
            + "</body>\n"
            + "</html>";

    /**
     * Creates a new FlowFile in the given session and writes the supplied
     * bytes as its content.
     *
     * @param content bytes to write
     * @param session session used to create and write the FlowFile
     * @return the FlowFile returned by the write
     */
    protected FlowFile writeContentToNewFlowFile(final byte[] content, ProcessSession session) {
        final FlowFile created = session.create();
        final StreamCallback contentWriter = new StreamCallback() {
            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                out.write(content);
            }
        };
        return session.write(created, contentWriter);
    }
}

View File

@ -0,0 +1,319 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Before;
import org.junit.Test;
import java.lang.Exception;
import java.util.List;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link GetHTMLElement}, run against the sample document inherited from
 * {@link AbstractHTMLTest}.
 */
public class TestGetHTMLElement extends AbstractHTMLTest {

    /** Charset used to encode the sample HTML and decode results; matches the HTML_CHARSET set in {@link #init()}. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    private TestRunner testRunner;

    /** Creates a fresh runner with the defaults shared by every test; individual tests override them as needed. */
    @Before
    public void init() {
        testRunner = TestRunners.newTestRunner(GetHTMLElement.class);
        testRunner.setProperty(GetHTMLElement.URL, "http://localhost");
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.HTML_CHARSET, "UTF-8");
    }

    @Test
    public void testNoElementFound() throws Exception {
        // The sample HTML contains no bold element.
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b");

        runWithSampleHtml();

        assertRoutedToNotFound();
    }

    @Test
    public void testInvalidSelector() throws Exception {
        // NOTE(review): the expected outcome is "not found", not "failure" - presumably this string is
        // treated as a tag selector that simply matches nothing; confirm against the processor.
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "InvalidCSSSelectorSyntax");

        runWithSampleHtml();

        assertRoutedToNotFound();
    }

    @Test
    public void testSingleElementFound() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "head");

        runWithSampleHtml();

        assertElementsFound(1);
    }

    @Test
    public void testMultipleElementFound() throws Exception {
        // The sample HTML contains three anchors.
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "a");

        runWithSampleHtml();

        assertElementsFound(3);
    }

    @Test
    public void testElementFoundWriteToAttribute() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_ATTRIBUTE);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href");

        runWithSampleHtml();

        assertElementsFound(1);
        final MockFlowFile fff = singleSuccessFlowFile();
        final String atValue = fff.getAttribute(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME);
        assertTrue(StringUtils.equals(ATL_WEATHER_LINK, atValue));
    }

    @Test
    public void testElementFoundWriteToContent() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href");

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(ATL_WEATHER_LINK, singleSuccessContent()));
    }

    @Test
    public void testValidPrependValueToFoundElement() throws Exception {
        final String PREPEND_VALUE = "TestPrepend";
        testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE);
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href");

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(PREPEND_VALUE + ATL_WEATHER_LINK, singleSuccessContent()));
    }

    @Test
    public void testValidPrependValueToNotFoundElement() throws Exception {
        final String PREPEND_VALUE = "TestPrepend";
        testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE);
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b");
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT);

        runWithSampleHtml();

        assertRoutedToNotFound();
        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
    }

    @Test
    public void testValidAppendValueToFoundElement() throws Exception {
        final String APPEND_VALUE = "TestAppend";
        testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE);
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href");

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(ATL_WEATHER_LINK + APPEND_VALUE, singleSuccessContent()));
    }

    @Test
    public void testValidAppendValueToNotFoundElement() throws Exception {
        final String APPEND_VALUE = "TestAppend";
        testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE);
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b");
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT);

        runWithSampleHtml();

        assertRoutedToNotFound();
        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
    }

    @Test
    public void testExtractAttributeFromElement() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "meta[name=author]");
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
        // The key is deliberately capitalised differently from the "content" attribute in the sample HTML.
        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "Content");

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(AUTHOR_NAME, singleSuccessContent()));
    }

    @Test
    public void testExtractTextFromElement() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT);

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(ATL_WEATHER_TEXT, singleSuccessContent()));
    }

    @Test
    public void testExtractHTMLFromElement() throws Exception {
        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + GDR_ID);
        testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML);

        runWithSampleHtml();

        assertElementsFound(1);
        assertTrue(StringUtils.equals(GDR_WEATHER_TEXT, singleSuccessContent()));
    }

    /** Enqueues the sample HTML, encoded as UTF-8, and runs the processor once. */
    private void runWithSampleHtml() {
        final ProcessSession session = testRunner.getProcessSessionFactory().createSession();
        final FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(UTF_8), session);
        testRunner.enqueue(ff);
        testRunner.run();
    }

    /** Asserts the routing expected when the selector matched: N successes, the original passed on, nothing else. */
    private void assertElementsFound(final int expectedMatches) {
        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, expectedMatches);
        testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0);
        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
    }

    /** Asserts that exactly one FlowFile went to "not found" and none to success or failure. */
    private void assertRoutedToNotFound() {
        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0);
        testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0);
        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1);
    }

    /** Returns the only FlowFile on the success relationship, asserting that there is exactly one. */
    private MockFlowFile singleSuccessFlowFile() {
        final List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
        assertTrue(ffs.size() == 1);
        return ffs.get(0);
    }

    /** Returns the content of the only success FlowFile, decoded as UTF-8. */
    private String singleSuccessContent() {
        return new String(testRunner.getContentAsByteArray(singleSuccessFlowFile()), UTF_8);
    }
}

View File

@ -0,0 +1,223 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Before;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link ModifyHTMLElement}, run against the sample document inherited from
 * {@link AbstractHTMLTest}.
 */
public class TestModifyHTMLElement extends AbstractHTMLTest {

    /** Charset used to encode the sample HTML and decode results; matches the HTML_CHARSET set in {@link #init()}. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    private TestRunner testRunner;

    /** Creates a fresh runner with the defaults shared by every test; individual tests override them as needed. */
    @Before
    public void init() {
        testRunner = TestRunners.newTestRunner(ModifyHTMLElement.class);
        testRunner.setProperty(ModifyHTMLElement.URL, "http://localhost");
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML);
        testRunner.setProperty(ModifyHTMLElement.HTML_CHARSET, "UTF-8");
    }

    @Test
    public void testModifyText() throws Exception {
        final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT;
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT);
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementModified();
        final Element ele = firstElementInResult("#" + ATL_ID);
        assertTrue(StringUtils.equals(MOD_VALUE, ele.text()));
    }

    @Test
    public void testModifyHTMLWithExpressionLanguage() throws Exception {
        final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT;
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT);
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + MOD_VALUE + " \":trim()}");

        runWithSampleHtml();

        assertElementModified();
        final Element ele = firstElementInResult("#" + ATL_ID);
        // Only checks that the element still yields text; the evaluated expression value is not asserted here.
        assertNotNull(ele.text());
    }

    @Test
    public void testModifyHTML() throws Exception {
        final String MOD_VALUE = "Newly modified HTML to replace " + GDR_WEATHER_TEXT;
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID);
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML);
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementModified();
        final Element ele = firstElementInResult("#" + GDR_ID);
        assertTrue(StringUtils.equals(MOD_VALUE, ele.html()));
    }

    @Test
    public void testModifyAttribute() throws Exception {
        final String MOD_VALUE = "http://localhost/newlink";
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID);
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_ATTRIBUTE);
        testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href");
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementModified();
        final Element ele = firstElementInResult("#" + GDR_ID);
        assertTrue(StringUtils.equals(MOD_VALUE, ele.attr("href")));
    }

    @Test
    public void testModifyElementNotFound() throws Exception {
        final String MOD_VALUE = "http://localhost/newlink";
        // The sample HTML contains no bold element.
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "b");
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML);
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE);

        runWithSampleHtml();

        testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 0);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 0);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 1);
    }

    @Test
    public void testModifyValueContainsHTMLCharacters() throws Exception {
        final String MOD_VALUE = "Text that contains > and < characters";
        testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID);
        testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML);
        testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementModified();
        final Element ele = firstElementInResult("#" + GDR_ID);
        // The text view shows the raw characters, while the HTML view shows them escaped as entities.
        assertTrue(StringUtils.equals(MOD_VALUE, ele.text()));
        assertTrue(StringUtils.equals(MOD_VALUE.replace(">", "&gt;").replace("<", "&lt;"), ele.html()));
    }

    /** Enqueues the sample HTML, encoded as UTF-8, and runs the processor once. */
    private void runWithSampleHtml() {
        final ProcessSession session = testRunner.getProcessSessionFactory().createSession();
        final FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(UTF_8), session);
        testRunner.enqueue(ff);
        testRunner.run();
    }

    /** Asserts the routing expected after a modification: one success, the original passed on, nothing else. */
    private void assertElementModified() {
        testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1);
        testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0);
    }

    /**
     * Parses the content of the only success FlowFile and returns the first element matching the selector.
     *
     * @param cssSelector selector identifying the element under test
     * @return the first matching element of the output document
     */
    private Element firstElementInResult(final String cssSelector) {
        final List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS);
        assertTrue(ffs.size() == 1);
        final String data = new String(testRunner.getContentAsByteArray(ffs.get(0)), UTF_8);

        // The content is the entire HTML document, so use Jsoup again just to grab the element we want.
        final Document doc = Jsoup.parse(data);
        final Elements eles = doc.select(cssSelector);
        return eles.get(0);
    }
}

View File

@ -0,0 +1,137 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Before;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link PutHTMLElement}, run against the sample document inherited from
 * {@link AbstractHTMLTest}.
 */
public class TestPutHTMLElement extends AbstractHTMLTest {

    /** Charset used to encode the sample HTML and to decode the processor output. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    private TestRunner testRunner;

    /** Creates a fresh runner with the URL property every test needs. */
    @Before
    public void init() {
        testRunner = TestRunners.newTestRunner(PutHTMLElement.class);
        testRunner.setProperty(PutHTMLElement.URL, "http://localhost");
    }

    @Test
    public void testAddNewElementToRoot() throws Exception {
        final String MOD_VALUE = "<p>modified value</p>";
        testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "body");
        testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT);
        testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementPut();
        // The value was prepended, so the first paragraph directly under body is the new one.
        final Element ele = firstElementInResult("body > p");
        assertTrue(StringUtils.equals(MOD_VALUE.replace("<p>", "").replace("</p>", ""), ele.html()));
    }

    @Test
    public void testPrependPElementToDiv() throws Exception {
        final String MOD_VALUE = "<p>modified value</p>";
        testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put");
        testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT);
        testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementPut();
        final Element ele = firstElementInResult("#put");
        assertTrue(StringUtils.equals("<p>modified value</p> \n<a href=\"httpd://localhost\"></a>", ele.html()));
    }

    @Test
    public void testAppendPElementToDiv() throws Exception {
        final String MOD_VALUE = "<p>modified value</p>";
        testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put");
        testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.APPEND_ELEMENT);
        testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE);

        runWithSampleHtml();

        assertElementPut();
        final Element ele = firstElementInResult("#put");
        assertTrue(StringUtils.equals("<a href=\"httpd://localhost\"></a> \n" +
                "<p>modified value</p>", ele.html()));
    }

    /** Enqueues the sample HTML, encoded as UTF-8, and runs the processor once. */
    private void runWithSampleHtml() {
        final ProcessSession session = testRunner.getProcessSessionFactory().createSession();
        final FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(UTF_8), session);
        testRunner.enqueue(ff);
        testRunner.run();
    }

    /** Asserts the routing expected after a put: one success, the original passed on, nothing else. */
    private void assertElementPut() {
        testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1);
        testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0);
        testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1);
        testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0);
    }

    /**
     * Parses the content of the only success FlowFile and returns the first element matching the selector.
     *
     * @param cssSelector selector identifying the element under test
     * @return the first matching element of the output document
     */
    private Element firstElementInResult(final String cssSelector) {
        final List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS);
        assertTrue(ffs.size() == 1);
        final String data = new String(testRunner.getContentAsByteArray(ffs.get(0)), UTF_8);

        // The content is the entire HTML document, so use Jsoup again just to grab the element we want.
        final Document doc = Jsoup.parse(data);
        final Elements eles = doc.select(cssSelector);
        return eles.get(0);
    }
}

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.apache.nifi</groupId>
        <artifactId>nifi-nar-bundles</artifactId>
        <version>0.4.0-SNAPSHOT</version>
    </parent>

    <!-- Aggregator for the HTML parsing processors and the NAR that packages them. -->
    <artifactId>nifi-html-bundle</artifactId>
    <packaging>pom</packaging>

    <modules>
        <module>nifi-html-processors</module>
        <module>nifi-html-nar</module>
    </modules>

    <dependencyManagement>
        <dependencies>
            <!-- Managed entry for the processors module. A managed dependency needs an explicit version;
                 no <type> is given, so the default (jar) applies. The NAR artifact is the separate
                 nifi-html-nar module. -->
            <dependency>
                <groupId>org.apache.nifi</groupId>
                <artifactId>nifi-html-processors</artifactId>
                <version>0.4.0-SNAPSHOT</version>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>

View File

@ -42,12 +42,13 @@
<module>nifi-language-translation-bundle</module>
<module>nifi-mongodb-bundle</module>
<module>nifi-flume-bundle</module>
<module>nifi-hbase-bundle</module>
<module>nifi-hbase-bundle</module>
<module>nifi-ambari-bundle</module>
<module>nifi-image-bundle</module>
<module>nifi-avro-bundle</module>
<module>nifi-couchbase-bundle</module>
<module>nifi-azure-bundle</module>
<module>nifi-html-bundle</module>
</modules>
<dependencyManagement>
<dependencies>

View File

@ -823,6 +823,12 @@
<version>0.4.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-html-nar</artifactId>
<version>0.4.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-kite-nar</artifactId>
@ -1375,4 +1381,4 @@
</build>
</profile>
</profiles>
</project>
</project>