From c82fc18f8e306c5a31345856e529cfd9fe4c81ef Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 13 Nov 2015 15:01:10 -0500 Subject: [PATCH 1/3] HTML Parsing Processors Bundle NIFI-1156 HTML Parsing Processors Bundle --- nifi-assembly/pom.xml | 5 + .../nifi-html-bundle/nifi-html-nar/pom.xml | 41 +++ .../nifi-html-processors/pom.xml | 59 ++++ .../apache/nifi/AbstractHTMLProcessor.java | 120 +++++++ .../java/org/apache/nifi/GetHTMLElement.java | 243 +++++++++++++ .../org/apache/nifi/ModifyHTMLElement.java | 164 +++++++++ .../java/org/apache/nifi/PutHTMLElement.java | 150 ++++++++ .../org.apache.nifi.processor.Processor | 17 + .../org/apache/nifi/AbstractHTMLTest.java | 74 ++++ .../org/apache/nifi/TestGetHTMLElement.java | 319 ++++++++++++++++++ .../apache/nifi/TestModifyHTMLElement.java | 223 ++++++++++++ .../org/apache/nifi/TestPutHTMLElement.java | 137 ++++++++ nifi-nar-bundles/nifi-html-bundle/pom.xml | 43 +++ nifi-nar-bundles/pom.xml | 3 +- pom.xml | 8 +- 15 files changed, 1604 insertions(+), 2 deletions(-) create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java create mode 100644 
nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java create mode 100644 nifi-nar-bundles/nifi-html-bundle/pom.xml diff --git a/nifi-assembly/pom.xml b/nifi-assembly/pom.xml index 961349f1ff..593de9914c 100644 --- a/nifi-assembly/pom.xml +++ b/nifi-assembly/pom.xml @@ -162,6 +162,11 @@ language governing permissions and limitations under the License. --> nifi-http-context-map-nar nar + + org.apache.nifi + nifi-html-nar + nar + org.apache.nifi nifi-kite-nar diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml new file mode 100644 index 0000000000..fd23f7b91d --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml @@ -0,0 +1,41 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-html-bundle + 0.4.0-SNAPSHOT + + + nifi-html-nar + nar + + + + org.apache.nifi + nifi-standard-services-api-nar + nar + + + org.apache.nifi + nifi-html-processors + 0.4.0-SNAPSHOT + + + + diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml new file mode 100644 index 0000000000..609d679af8 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml @@ -0,0 +1,59 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-html-bundle + 0.4.0-SNAPSHOT + + + nifi-html-processors + Support for parsing HTML documents + + + + org.jsoup + jsoup + 1.8.3 + + + org.apache.nifi + nifi-api + + + org.apache.nifi + nifi-processor-utils + + + org.apache.nifi + nifi-mock + test + + + org.slf4j + slf4j-simple + test + + + junit + junit + 4.11 + test + + + diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java new file mode 100644 index 0000000000..49b4ffbaa1 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nifi;
+
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.io.InputStreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicReference;
+
+/**
+ * Common base for the HTML processors of this bundle. It declares the output
+ * type names, property descriptors and relationships the subclasses share, and
+ * a helper that parses a FlowFile's content into a Jsoup {@link Document}.
+ */
+public abstract class AbstractHTMLProcessor extends AbstractProcessor {
+
+    // Names of the kinds of element value a processor can read or modify.
+    protected static final String ELEMENT_HTML = "HTML";
+    protected static final String ELEMENT_TEXT = "Text";
+    protected static final String ELEMENT_DATA = "Data";
+    protected static final String ELEMENT_ATTRIBUTE = "Attribute";
+
+    /** Base URI handed to Jsoup when the document is parsed. */
+    public static final PropertyDescriptor URL = new PropertyDescriptor
+            .Builder().name("URL")
+            .description("Base URL for the HTML page being parsed.")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    /** CSS selector used by the subclasses to locate elements. */
+    public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
+            .Builder().name("CSS Selector")
+            .description("CSS selector syntax string used to extract the desired HTML element(s).")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    /** Name of the charset handed to Jsoup when the document is parsed. */
+    public static final PropertyDescriptor HTML_CHARSET = new PropertyDescriptor
+            .Builder().name("HTML character encoding")
+            .description("Character encoding of the input HTML")
+            .defaultValue("UTF-8")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
+            .name("original")
+            .description("The original HTML input")
+            .build();
+
+    public static final Relationship REL_SUCCESS = new Relationship.Builder()
+            .name("success")
+            .description("Successfully parsed HTML element")
+            .build();
+
+    public static final Relationship REL_FAILURE = new Relationship.Builder()
+            .name("failure")
+            .description("Failed to parse HTML content")
+            .build();
+
+    public static final Relationship REL_INVALID_HTML = new Relationship.Builder()
+            .name("invalid html")
+            .description("The input HTML syntax is invalid")
+            .build();
+
+    public static final Relationship REL_NOT_FOUND = new Relationship.Builder()
+            .name("element not found")
+            .description("Element could not be found in the HTML document. The original HTML input will remain " +
+                    "in the flowfile content unchanged. Relationship '" + REL_ORIGINAL + "' will not be invoked " +
+                    "in this scenario.")
+            .build();
+
+    /**
+     * Parses the Jsoup HTML document from the FlowFile input content. The
+     * content stream is parsed with the configured {@link #HTML_CHARSET} and
+     * {@link #URL} property values.
+     *
+     * @param inputFlowFile
+     *            Input FlowFile containing the HTML
+     *
+     * @param context
+     *            ProcessContext the charset and base URL are read from
+     *
+     * @param session
+     *            ProcessSession used to read the FlowFile content
+     *
+     * @return
+     *            Jsoup Document
+     */
+    protected Document parseHTMLDocumentFromFlowfile(FlowFile inputFlowFile,
+                                                     final ProcessContext context,
+                                                     final ProcessSession session) {
+        // Typed holder: the callback cannot return a value, and a raw
+        // AtomicReference would hand back Object instead of Document.
+        final AtomicReference<Document> doc = new AtomicReference<>();
+        session.read(inputFlowFile, new InputStreamCallback() {
+            @Override
+            public void process(InputStream inputStream) throws IOException {
+                doc.set(Jsoup.parse(inputStream,
+                        context.getProperty(HTML_CHARSET).getValue(),
+                        context.getProperty(URL).getValue()));
+            }
+        });
+        return doc.get();
+    }
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
new file mode 100644
index 0000000000..63d457c253
--- /dev/null
+++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software
Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.io.StreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Selects elements from the incoming HTML with a CSS selector and emits one new
+ * FlowFile per matching element, carrying the extracted value either as the
+ * {@link #HTML_ELEMENT_ATTRIBUTE_NAME} attribute or as the FlowFile content.
+ */
+@Tags({"get", "html", "dom", "css", "element"})
+@CapabilityDescription("Parses HTML input using CSS selector syntax and creates a new flowfile containing the extracted"
+        + " element content for each matching CSS selector.")
+@SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class})
+@WritesAttributes({@WritesAttribute(attribute="HTMLElement", description="Flowfile attribute where the element result"
+        + " parsed from the HTML using the CSS selector syntax are placed if the destination is a flowfile attribute.")})
+public class GetHTMLElement
+        extends AbstractHTMLProcessor {
+
+    public static final String HTML_ELEMENT_ATTRIBUTE_NAME = "HTMLElement";
+    public static final String DESTINATION_ATTRIBUTE = "flowfile-attribute";
+    public static final String DESTINATION_CONTENT = "flowfile-content";
+
+    public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new PropertyDescriptor
+            .Builder().name("Prepend Element value")
+            .description("Prepends the specified value to the resulting Element")
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new PropertyDescriptor
+            .Builder().name("Append Element value")
+            .description("Appends the specified value to the resulting Element")
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor
+            .Builder().name("Attribute Name")
+            .description(("When getting the value of an element attribute this value is used as the key to determine" +
+                    " which attribute on the selected element should be retrieved."))
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+
+    public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder()
+            .name("Output Type")
+            .description("Controls the type of value that is retrieved from the element. "
+                    + ELEMENT_HTML + "," + ELEMENT_TEXT + ", " + ELEMENT_ATTRIBUTE + " or " + ELEMENT_DATA)
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE, ELEMENT_DATA)
+            .defaultValue(ELEMENT_HTML)
+            .build();
+
+    public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder()
+            .name("Destination")
+            .description("Control if element extracted is written as a flowfile attribute or " +
+                    "as flowfile content.")
+            .required(true)
+            .allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
+            .defaultValue(DESTINATION_ATTRIBUTE)
+            .build();
+
+    private List<PropertyDescriptor> descriptors;
+
+    private Set<Relationship> relationships;
+
+    /**
+     * Builds the unmodifiable property and relationship collections returned
+     * by the accessors below.
+     */
+    @Override
+    protected void init(final ProcessorInitializationContext context) {
+        final List<PropertyDescriptor> descriptors = new ArrayList<>();
+        descriptors.add(URL);
+        descriptors.add(CSS_SELECTOR);
+        descriptors.add(HTML_CHARSET);
+        descriptors.add(OUTPUT_TYPE);
+        descriptors.add(DESTINATION);
+        descriptors.add(PREPEND_ELEMENT_VALUE);
+        descriptors.add(APPEND_ELEMENT_VALUE);
+        descriptors.add(ATTRIBUTE_KEY);
+        this.descriptors = Collections.unmodifiableList(descriptors);
+
+        final Set<Relationship> relationships = new HashSet<>();
+        relationships.add(REL_ORIGINAL);
+        relationships.add(REL_SUCCESS);
+        relationships.add(REL_FAILURE);
+        relationships.add(REL_NOT_FOUND);
+        this.relationships = Collections.unmodifiableSet(relationships);
+    }
+
+    @Override
+    public Set<Relationship> getRelationships() {
+        return this.relationships;
+    }
+
+    @Override
+    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
+        return descriptors;
+    }
+
+    /**
+     * Parses the incoming FlowFile and emits one new FlowFile per element
+     * matched by the CSS selector.
+     * <p>
+     * The input goes to {@link #REL_NOT_FOUND} when nothing matches, to
+     * {@link #REL_ORIGINAL} once every match has been emitted to
+     * {@link #REL_SUCCESS}, and to {@link #REL_FAILURE} when any step throws.
+     * Children already transferred before a failure are left where they are.
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        try {
+            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);
+            // Expression-language properties are evaluated against the
+            // incoming FlowFile so attribute references resolve.
+            final Elements eles = doc.select(context.getProperty(CSS_SELECTOR)
+                    .evaluateAttributeExpressions(flowFile).getValue());
+
+            if (eles == null || eles.isEmpty()) {
+                // No element found
+                session.transfer(flowFile, REL_NOT_FOUND);
+                return;
+            }
+
+            // Loop-invariant configuration, read once.
+            final String prependValue = context.getProperty(PREPEND_ELEMENT_VALUE)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String appendValue = context.getProperty(APPEND_ELEMENT_VALUE)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String attributeKey = context.getProperty(ATTRIBUTE_KEY)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
+            final String destination = context.getProperty(DESTINATION).getValue();
+            final String charsetName = context.getProperty(HTML_CHARSET).getValue();
+
+            for (final Element ele : eles) {
+                // Extract before creating the child so a failure here cannot
+                // leave a created FlowFile without a destination.
+                final String value = extractElementValue(prependValue, outputType, appendValue, ele, attributeKey);
+
+                FlowFile child = session.create();
+                try {
+                    if (DESTINATION_CONTENT.equals(destination)) {
+                        child = session.write(child, new StreamCallback() {
+                            @Override
+                            public void process(InputStream inputStream, OutputStream outputStream) throws IOException {
+                                // Encode with the configured charset rather
+                                // than the platform default.
+                                outputStream.write(value.getBytes(charsetName));
+                            }
+                        });
+                    } else {
+                        child = session.putAttribute(child, HTML_ELEMENT_ATTRIBUTE_NAME, value);
+                    }
+                    session.getProvenanceReporter().create(child);
+                    session.transfer(child, REL_SUCCESS);
+                } catch (final RuntimeException e) {
+                    // Drop the half-built child; the outer handler routes the
+                    // input to failure.
+                    session.remove(child);
+                    throw e;
+                }
+            }
+
+            //Transfer the original HTML
+            session.transfer(flowFile, REL_ORIGINAL);
+
+        } catch (final Exception ex) {
+            getLogger().error("Failed to extract HTML element(s) from " + flowFile, ex);
+            session.transfer(flowFile, REL_FAILURE);
+        }
+
+    }
+
+
+    /**
+     * Extracts the HTML value based on the configuration values.
+     *
+     * @param prependValue text placed before the extracted value; null or empty adds nothing
+     * @param outputType which value to read; an unrecognised type falls back to the element's HTML
+     * @param appendValue text placed after the extracted value; null or empty adds nothing
+     * @param ele element to read from
+     * @param attrKey attribute name, only used for the attribute output type
+     * @return
+     *            value from the parsed HTML element
+     */
+    private String extractElementValue(String prependValue, String outputType, String appendValue, Element ele,
+                                       String attrKey) {
+        if (StringUtils.isEmpty(prependValue)) {
+            prependValue = "";
+        }
+        if (StringUtils.isEmpty(appendValue)) {
+            appendValue = "";
+        }
+
+        switch (outputType) {
+            case ELEMENT_TEXT:
+                return prependValue + ele.text() + appendValue;
+            case ELEMENT_DATA:
+                return prependValue + ele.data() + appendValue;
+            case ELEMENT_ATTRIBUTE:
+                return prependValue + ele.attr(attrKey) + appendValue;
+            case ELEMENT_HTML:
+            default:
+                return prependValue + ele.html() + appendValue;
+        }
+    }
+
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
new file mode 100644
index 0000000000..425d8fa366
--- /dev/null
+++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi;
+
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.io.StreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Replaces the HTML, text or one attribute of every element matched by the CSS
+ * selector and writes the modified document to a new child FlowFile that
+ * carries the {@link #NUM_ELEMENTS_MODIFIED_ATTR} attribute.
+ */
+@Tags({"modify", "html", "dom", "css", "element"})
+@CapabilityDescription("Modifies the value of an existing HTML element in the original input HTML")
+@SeeAlso({GetHTMLElement.class, PutHTMLElement.class})
+@WritesAttributes({@WritesAttribute(attribute="NumElementsModified", description="Total number of HTML " +
+        "element modifications made")})
+public class ModifyHTMLElement extends AbstractHTMLProcessor {
+
+    public static final String NUM_ELEMENTS_MODIFIED_ATTR = "NumElementsModified";
+
+    // The description names the values that are actually allowed; it used to
+    // advertise "Data", which is not an allowable value of this property.
+    public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder()
+            .name("Output Type")
+            .description("Controls whether the HTML element is output as " +
+                    ELEMENT_HTML + "," + ELEMENT_TEXT + " or " + ELEMENT_ATTRIBUTE)
+            .required(true)
+            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE)
+            .defaultValue(ELEMENT_HTML)
+            .build();
+
+    public static final PropertyDescriptor MODIFIED_VALUE = new PropertyDescriptor
+            .Builder().name("Modified Value")
+            .description("Value to update the found HTML elements with")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor
+            .Builder().name("Attribute Name")
+            .description(("When modifying the value of an element attribute this value is used as the key to determine" +
+                    " which attribute on the selected element will be modified with the new value."))
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    private List<PropertyDescriptor> descriptors;
+
+    private Set<Relationship> relationships;
+
+    /**
+     * Builds the unmodifiable property and relationship collections returned
+     * by the accessors below.
+     */
+    @Override
+    protected void init(final ProcessorInitializationContext context) {
+        final List<PropertyDescriptor> descriptors = new ArrayList<>();
+        descriptors.add(URL);
+        descriptors.add(CSS_SELECTOR);
+        descriptors.add(HTML_CHARSET);
+        descriptors.add(OUTPUT_TYPE);
+        descriptors.add(MODIFIED_VALUE);
+        descriptors.add(ATTRIBUTE_KEY);
+        this.descriptors = Collections.unmodifiableList(descriptors);
+
+        final Set<Relationship> relationships = new HashSet<>();
+        relationships.add(REL_ORIGINAL);
+        relationships.add(REL_SUCCESS);
+        relationships.add(REL_FAILURE);
+        relationships.add(REL_INVALID_HTML);
+        relationships.add(REL_NOT_FOUND);
+        this.relationships = Collections.unmodifiableSet(relationships);
+    }
+
+    @Override
+    public Set<Relationship> getRelationships() {
+        return this.relationships;
+    }
+
+    @Override
+    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
+        return descriptors;
+    }
+
+    /**
+     * Applies the configured modification to every element matched by the CSS
+     * selector.
+     * <p>
+     * The input goes to {@link #REL_NOT_FOUND} when nothing matches, to
+     * {@link #REL_ORIGINAL} after the modified copy has been sent to
+     * {@link #REL_SUCCESS}, and to {@link #REL_FAILURE} when any step throws.
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        try {
+            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);
+            // Expression-language properties are evaluated against the
+            // incoming FlowFile so attribute references resolve.
+            final Elements eles = doc.select(
+                    context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue());
+
+            if (eles == null || eles.isEmpty()) {
+                //No element found
+                session.transfer(flowFile, REL_NOT_FOUND);
+                return;
+            }
+
+            // Loop-invariant configuration, read once instead of per element.
+            final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
+            final String modifiedValue = context.getProperty(MODIFIED_VALUE)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String attributeKey = context.getProperty(ATTRIBUTE_KEY)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String charsetName = context.getProperty(HTML_CHARSET).getValue();
+
+            for (final Element ele : eles) {
+                switch (outputType) {
+                    case ELEMENT_HTML:
+                        ele.html(modifiedValue);
+                        break;
+                    case ELEMENT_ATTRIBUTE:
+                        ele.attr(attributeKey, modifiedValue);
+                        break;
+                    case ELEMENT_TEXT:
+                        ele.text(modifiedValue);
+                        break;
+                }
+            }
+
+            FlowFile ff = session.create(flowFile);
+            try {
+                ff = session.write(ff, new StreamCallback() {
+                    @Override
+                    public void process(InputStream in, OutputStream out) throws IOException {
+                        // Encode with the configured charset rather than the
+                        // platform default.
+                        out.write(doc.html().getBytes(charsetName));
+                    }
+                });
+                ff = session.putAttribute(ff, NUM_ELEMENTS_MODIFIED_ATTR, Integer.toString(eles.size()));
+                session.transfer(ff, REL_SUCCESS);
+            } catch (final RuntimeException e) {
+                // Drop the half-built child; the outer handler routes the
+                // input to failure.
+                session.remove(ff);
+                throw e;
+            }
+
+            //Transfer the original HTML
+            session.transfer(flowFile, REL_ORIGINAL);
+
+        } catch (final Exception ex) {
+            getLogger().error("Failed to modify HTML element(s) in " + flowFile, ex);
+            session.transfer(flowFile, REL_FAILURE);
+        }
+    }
+
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
new file mode 100644
index 0000000000..f0a8c398c4
--- /dev/null
+++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi;
+
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.io.StreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Appends or prepends the configured HTML to the children of every element
+ * matched by the CSS selector and writes the resulting document to a new child
+ * FlowFile.
+ */
+@Tags({"put", "html", "dom", "css", "element"})
+@CapabilityDescription("Creates a new HTML element in the input HTML")
+@SeeAlso({GetHTMLElement.class, ModifyHTMLElement.class})
+public class PutHTMLElement extends AbstractHTMLProcessor {
+
+    public static final String APPEND_ELEMENT = "append-html";
+    public static final String PREPEND_ELEMENT = "prepend-html";
+
+    // The example in the description shows the <p> markup of the selected
+    // element (CSS selector 'p') so the before/after strings read correctly.
+    public static final PropertyDescriptor PUT_LOCATION_TYPE = new PropertyDescriptor.Builder()
+            .name("Element Insert Location Type")
+            .description("Controls whether the new element is prepended or appended to the children of the " +
+                    "Element located by the CSS selector. EX: prepended value 'Hi' inside of " +
+                    "Element (using CSS Selector 'p') '<p>There</p>' would result in " +
+                    "'<p>HiThere</p>'. Appending the value would result in '<p>ThereHi</p>'")
+            .required(true)
+            .allowableValues(APPEND_ELEMENT, PREPEND_ELEMENT)
+            .defaultValue(APPEND_ELEMENT)
+            .build();
+
+    public static final PropertyDescriptor PUT_VALUE = new PropertyDescriptor.Builder()
+            .name("Put Value")
+            .description("Value used when creating the new Element. Value should be a valid HTML element. " +
+                    "The text should be supplied unencoded: characters like '<', '>', etc will be properly HTML " +
+                    "encoded in the output.")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    private List<PropertyDescriptor> descriptors;
+
+    private Set<Relationship> relationships;
+
+    /**
+     * Builds the unmodifiable property and relationship collections returned
+     * by the accessors below.
+     */
+    @Override
+    protected void init(final ProcessorInitializationContext context) {
+        final List<PropertyDescriptor> descriptors = new ArrayList<>();
+        descriptors.add(URL);
+        descriptors.add(CSS_SELECTOR);
+        descriptors.add(HTML_CHARSET);
+        descriptors.add(PUT_LOCATION_TYPE);
+        descriptors.add(PUT_VALUE);
+        this.descriptors = Collections.unmodifiableList(descriptors);
+
+        final Set<Relationship> relationships = new HashSet<>();
+        relationships.add(REL_ORIGINAL);
+        relationships.add(REL_SUCCESS);
+        relationships.add(REL_FAILURE);
+        relationships.add(REL_INVALID_HTML);
+        // onTrigger routes unmatched input here, so the relationship has to be
+        // one the processor declares.
+        relationships.add(REL_NOT_FOUND);
+        this.relationships = Collections.unmodifiableSet(relationships);
+    }
+
+    @Override
+    public Set<Relationship> getRelationships() {
+        return this.relationships;
+    }
+
+    @Override
+    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
+        return descriptors;
+    }
+
+    /**
+     * Inserts the configured value into every element matched by the CSS
+     * selector.
+     * <p>
+     * The input goes to {@link #REL_NOT_FOUND} when nothing matches, to
+     * {@link #REL_ORIGINAL} after the modified copy has been sent to
+     * {@link #REL_SUCCESS}, and to {@link #REL_FAILURE} when any step throws.
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        try {
+            final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);
+            // Expression-language properties are evaluated against the
+            // incoming FlowFile so attribute references resolve.
+            final Elements eles = doc.select(
+                    context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue());
+
+            if (eles == null || eles.isEmpty()) {
+                //No element found
+                session.transfer(flowFile, REL_NOT_FOUND);
+                return;
+            }
+
+            // Loop-invariant configuration, read once instead of per element.
+            final String locationType = context.getProperty(PUT_LOCATION_TYPE).getValue();
+            final String putValue = context.getProperty(PUT_VALUE)
+                    .evaluateAttributeExpressions(flowFile).getValue();
+            final String charsetName = context.getProperty(HTML_CHARSET).getValue();
+
+            for (final Element ele : eles) {
+                switch (locationType) {
+                    case APPEND_ELEMENT:
+                        ele.append(putValue);
+                        break;
+                    case PREPEND_ELEMENT:
+                        ele.prepend(putValue);
+                        break;
+                }
+            }
+
+            FlowFile ff = session.create(flowFile);
+            try {
+                ff = session.write(ff, new StreamCallback() {
+                    @Override
+                    public void process(InputStream in, OutputStream out) throws IOException {
+                        // Encode with the configured charset rather than the
+                        // platform default.
+                        out.write(doc.html().getBytes(charsetName));
+                    }
+                });
+                session.transfer(ff, REL_SUCCESS);
+            } catch (final RuntimeException e) {
+                // Drop the half-built child; the outer handler routes the
+                // input to failure.
+                session.remove(ff);
+                throw e;
+            }
+
+            //Transfer the original HTML
+            session.transfer(flowFile, REL_ORIGINAL);
+
+        } catch (final Exception ex) {
+            getLogger().error("Failed to put HTML element into " + flowFile, ex);
+            session.transfer(flowFile, REL_FAILURE);
+        }
+
+    }
+
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
new file mode 100644
index 0000000000..aea106050c
--- /dev/null
+++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +org.apache.nifi.GetHTMLElement +org.apache.nifi.ModifyHTMLElement +org.apache.nifi.PutHTMLElement \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java new file mode 100644 index 0000000000..88f4c632e4 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi; + +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.processor.io.StreamCallback; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +public class AbstractHTMLTest { + + protected final String ATL_WEATHER_TEXT = "Atlanta Weather"; + protected final String GDR_WEATHER_TEXT = "Grand Rapids Weather"; + protected final String ATL_WEATHER_LINK = "http://w1.weather.gov/obhistory/KPDK.html"; + protected final String GR_WEATHER_LINK = "http://w1.weather.gov/obhistory/KGRR.html"; + protected final String AUTHOR_NAME = "Jeremy Dyer"; + protected final String ATL_ID = "ATL"; + protected final String GDR_ID = "GDR"; + + protected final String HTML = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + " NiFi HTML Parsing Demo\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "

Check out this weather! " + ATL_WEATHER_TEXT + "

\n" + + "

I guess it could be colder ... " + GDR_WEATHER_TEXT + "

\n" + + "
\n" + + "\n" + + ""; + + + protected FlowFile writeContentToNewFlowFile(final byte[] content, ProcessSession session) { + FlowFile ff = session.write(session.create(), new StreamCallback() { + @Override + public void process(InputStream in, OutputStream out) throws IOException { + out.write(content); + } + }); + return ff; + } +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java new file mode 100644 index 0000000000..ae117683e3 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.junit.Before; +import org.junit.Test; + +import java.lang.Exception; +import java.util.List; + +import static org.junit.Assert.assertTrue; + +public class TestGetHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(GetHTMLElement.class); + testRunner.setProperty(GetHTMLElement.URL, "http://localhost"); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.HTML_CHARSET, "UTF-8"); + } + + @Test + public void testNoElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); //Bold element is not present in sample HTML +// testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, ""); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testInvalidSelector() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "InvalidCSSSelectorSyntax"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + 
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testSingleElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "head"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + } + + @Test + public void testMultipleElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "a"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 3); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + } + + @Test + public void testElementFoundWriteToAttribute() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + 
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + MockFlowFile fff = ffs.get(0); + String atValue = fff.getAttribute(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME); + assertTrue(StringUtils.equals(ATL_WEATHER_LINK, atValue)); + } + + @Test + public void testElementFoundWriteToContent() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(ATL_WEATHER_LINK, data)); + } + + @Test + public void testValidPrependValueToFoundElement() throws Exception { + final String PREPEND_VALUE = "TestPrepend"; + testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(PREPEND_VALUE + ATL_WEATHER_LINK, data)); + } + + @Test + public void testValidPrependValueToNotFoundElement() throws Exception { + final String PREPEND_VALUE = "TestPrepend"; + testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testValidAppendValueToFoundElement() throws Exception { + final String APPEND_VALUE = "TestAppend"; + 
testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(ATL_WEATHER_LINK + APPEND_VALUE, data)); + } + + @Test + public void testValidAppendValueToNotFoundElement() throws Exception { + final String APPEND_VALUE = "TestAppend"; + testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); + 
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testExtractAttributeFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "meta[name=author]"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "Content"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(AUTHOR_NAME, data)); + } + + @Test + public void testExtractTextFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(ATL_WEATHER_TEXT, data)); + } + + @Test + public void testExtractHTMLFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + assertTrue(StringUtils.equals(GDR_WEATHER_TEXT, data)); + } +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java new file mode 100644 index 0000000000..010107f819 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nifi; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +public class TestModifyHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(ModifyHTMLElement.class); + testRunner = TestRunners.newTestRunner(ModifyHTMLElement.class); + testRunner.setProperty(ModifyHTMLElement.URL, "http://localhost"); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.HTML_CHARSET, "UTF-8"); + } + + @Test + public void testModifyText() throws Exception { + final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); + 
testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + ATL_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); + } + + @Test + public void testModifyHTMLWithExpressionLanguage() throws Exception { + + final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; + + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + MOD_VALUE + " \":trim()}"); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = 
testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + ATL_ID); + Element ele = eles.get(0); + + assertNotNull(ele.text()); + } + + @Test + public void testModifyHTML() throws Exception { + final String MOD_VALUE = "Newly modified HTML to replace " + GDR_WEATHER_TEXT; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.html())); + } + + @Test + public void testModifyAttribute() throws Exception { + final String MOD_VALUE = "http://localhost/newlink"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href"); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.attr("href"))); + } + + @Test + public void testModifyElementNotFound() throws Exception { + final String MOD_VALUE = "http://localhost/newlink"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testModifyValueContainsHTMLCharacters() throws Exception { + final String MOD_VALUE = "Text that contains > and < characters"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String 
data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); + assertTrue(StringUtils.equals(MOD_VALUE.replace(">", ">").replace("<", "<"), ele.html())); + } + +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java new file mode 100644 index 0000000000..1dcc085b87 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; +import static org.junit.Assert.assertTrue; + + +public class TestPutHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(PutHTMLElement.class); + testRunner.setProperty(PutHTMLElement.URL, "http://localhost"); + } + + @Test + public void testAddNewElementToRoot() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "body"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("body > p"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE.replace("

", "").replace("

", ""), ele.html())); + } + + @Test + public void testPrependPElementToDiv() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#put"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals("

modified value

\n
", ele.html())); + } + + @Test + public void testAppendPElementToDiv() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.APPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + ProcessSession session = testRunner.getProcessSessionFactory().createSession(); + FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); + + testRunner.enqueue(ff); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#put"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(" \n" + + "

modified value

", ele.html())); + } + +} diff --git a/nifi-nar-bundles/nifi-html-bundle/pom.xml b/nifi-nar-bundles/nifi-html-bundle/pom.xml new file mode 100644 index 0000000000..186fef3b8f --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/pom.xml @@ -0,0 +1,43 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-nar-bundles + 0.4.0-SNAPSHOT + + + nifi-html-bundle + pom + + + nifi-html-processors + nifi-html-nar + + + + + + org.apache.nifi + nifi-html-processors + nar + + + + + diff --git a/nifi-nar-bundles/pom.xml b/nifi-nar-bundles/pom.xml index 4c0925f957..5e3a97ca93 100644 --- a/nifi-nar-bundles/pom.xml +++ b/nifi-nar-bundles/pom.xml @@ -42,12 +42,13 @@ nifi-language-translation-bundle nifi-mongodb-bundle nifi-flume-bundle - nifi-hbase-bundle + nifi-hbase-bundle nifi-ambari-bundle nifi-image-bundle nifi-avro-bundle nifi-couchbase-bundle nifi-azure-bundle + nifi-html-bundle diff --git a/pom.xml b/pom.xml index 8a8cdb09a7..5efc0c6089 100644 --- a/pom.xml +++ b/pom.xml @@ -823,6 +823,12 @@ 0.4.0-SNAPSHOT nar
+ + org.apache.nifi + nifi-html-nar + 0.4.0-SNAPSHOT + nar + org.apache.nifi nifi-kite-nar @@ -1375,4 +1381,4 @@ - + \ No newline at end of file From c9d59fa8192c8791dd55344ab90f7eddefec2215 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 30 Nov 2015 10:13:52 -0500 Subject: [PATCH 2/3] Changed JUnit version from 4.11 to 4.12 as suggested --- nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml index 609d679af8..a2e8fe20bf 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml @@ -52,7 +52,7 @@ junit junit - 4.11 + 4.12 test From ee7400ef5389fd177c570f524ff29b51d667b09c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 13 Jan 2016 15:59:47 -0500 Subject: [PATCH 3/3] NIFI-1156 --- nifi-assembly/LICENSE | 28 +++- .../nifi-html-processors/pom.xml | 14 ++ .../apache/nifi/AbstractHTMLProcessor.java | 34 ++++- .../java/org/apache/nifi/GetHTMLElement.java | 41 +++-- .../org/apache/nifi/ModifyHTMLElement.java | 17 ++- .../java/org/apache/nifi/PutHTMLElement.java | 22 ++- .../org/apache/nifi/AbstractHTMLTest.java | 48 +----- .../org/apache/nifi/TestGetHTMLElement.java | 143 ++++++------------ .../apache/nifi/TestModifyHTMLElement.java | 45 ++---- .../org/apache/nifi/TestPutHTMLElement.java | 24 +-- .../src/test/resources/Weather.html | 25 +++ 11 files changed, 219 insertions(+), 222 deletions(-) create mode 100644 nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html diff --git a/nifi-assembly/LICENSE b/nifi-assembly/LICENSE index 5c499e3ec6..d12843b49f 100644 --- a/nifi-assembly/LICENSE +++ b/nifi-assembly/LICENSE @@ -1030,4 +1030,30 @@ information can be found here: http://www.adobe.com/devnet/xmp/library/eula-xmp- OR PROFITS; OR BUSINESS 
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file + THE POSSIBILITY OF SUCH DAMAGE. + + +This product bundles 'Jsoup' which is available under "The MIT license". More +information can be found here: http://jsoup.org/license + + The MIT License + + Copyright (c) 2009-2015, Jonathan Hedley + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
\ No newline at end of file diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml index a2e8fe20bf..25e19b93fb 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml @@ -56,4 +56,18 @@ test + + + + + org.apache.rat + apache-rat-plugin + + + src/test/resources/Weather.html + + + + + diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java index 49b4ffbaa1..42467d9de5 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java @@ -17,6 +17,9 @@ package org.apache.nifi; import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.components.Validator; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.processor.AbstractProcessor; import org.apache.nifi.processor.ProcessContext; @@ -26,6 +29,7 @@ import org.apache.nifi.processor.io.InputStreamCallback; import org.apache.nifi.processor.util.StandardValidators; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.select.Selector; import java.io.IOException; import java.io.InputStream; @@ -38,6 +42,25 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor { protected static final String ELEMENT_DATA = "Data"; protected static final String ELEMENT_ATTRIBUTE = "Attribute"; + protected static final Validator CSS_SELECTOR_VALIDATOR = new Validator() { + @Override + public ValidationResult validate(final String subject, 
final String value, final ValidationContext context) { + if (context.isExpressionLanguageSupported(subject) && context.isExpressionLanguagePresent(value)) { + return new ValidationResult.Builder().subject(subject).input(value).explanation("Expression Language Present").valid(true).build(); + } + + String reason = null; + try { + Document doc = Jsoup.parse(""); + doc.select(value); + } catch (final Selector.SelectorParseException e) { + reason = "\"" + value + "\" is an invalid CSS selector"; + } + + return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build(); + } + }; + public static final PropertyDescriptor URL = new PropertyDescriptor .Builder().name("URL") .description("Base URL for the HTML page being parsed.") @@ -49,16 +72,16 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor { .Builder().name("CSS Selector") .description("CSS selector syntax string used to extract the desired HTML element(s).") .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .addValidator(CSS_SELECTOR_VALIDATOR) .expressionLanguageSupported(true) .build(); public static final PropertyDescriptor HTML_CHARSET = new PropertyDescriptor - .Builder().name("HTML character encoding") + .Builder().name("HTML Character Encoding") .description("Character encoding of the input HTML") .defaultValue("UTF-8") .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR) .build(); public static final Relationship REL_ORIGINAL = new Relationship.Builder() @@ -71,11 +94,6 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor { .description("Successfully parsed HTML element") .build(); - public static final Relationship REL_FAILURE = new Relationship.Builder() - .name("failure") - .description("Failed to parse HTML content") - .build(); - public static final Relationship REL_INVALID_HTML = new Relationship.Builder() 
.name("invalid html") .description("The input HTML syntax is invalid") diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java index 63d457c253..feda16c1d9 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java @@ -17,6 +17,7 @@ package org.apache.nifi; import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.annotation.behavior.InputRequirement; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.processor.ProcessContext; @@ -45,8 +46,19 @@ import java.util.HashSet; import java.util.Collections; @Tags({"get", "html", "dom", "css", "element"}) -@CapabilityDescription("Parses HTML input using CSS selector syntax and creates a new flowfile containing the extracted" + - " element content for each matching CSS selector.") +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Extracts HTML element values from the incoming flowfile's content using a CSS selector." + + " The incoming HTML is first converted into a HTML Document Object Model so that HTML elements may be selected" + + " in the similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\"" + + " using the user defined CSS selector string. The result of \"querying\" the HTML DOM may produce 0-N results." + + " If no results are found the flowfile will be transferred to the \"element not found\" relationship to indicate" + + " so to the end user. If N results are found a new flowfile will be created and emitted for each result. 
The query result will" + + " either be placed in the content of the new flowfile or as an attribute of the new flowfile. By default the result is written to an" + + " attribute. This can be controlled by the \"Destination\" property. Resulting query values may also have data" + + " prepended or appended to them by setting the value of property \"Prepend Element Value\" or \"Append Element Value\"." + + " Prepended and appended values are treated as string values and concatenated to the result retrieved from the" + + " HTML DOM query operation. A more thorough reference for the CSS selector syntax can be found at" + + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"") @SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class}) @WritesAttributes({@WritesAttribute(attribute="HTMLElement", description="Flowfile attribute where the element result" + " parsed from the HTML using the CSS selector syntax are placed if the destination is a flowfile attribute.")}) @@ -58,7 +70,7 @@ public class GetHTMLElement public static final String DESTINATION_CONTENT = "flowfile-content"; public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new PropertyDescriptor - .Builder().name("Prepend Element value") + .Builder().name("Prepend Element Value") .description("Prepends the specified value to the resulting Element") .required(false) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) @@ -66,7 +78,7 @@ public class GetHTMLElement .build(); public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new PropertyDescriptor - .Builder().name("Append Element value") + .Builder().name("Append Element Value") .description("Appends the specified value to the resulting Element") .required(false) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) @@ -75,8 +87,9 @@ public class GetHTMLElement public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor .Builder().name("Attribute Name") - .description(("When getting the value of an element attribute this 
value is used as the key to determine" + - " which attribute on the selected element should be retrieved.")) + .description(("When getting the value of a HTML element attribute this value is used as the key to determine" + + " which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" + + " is set to \"" + ELEMENT_ATTRIBUTE + "\"")) .required(false) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(true) @@ -85,8 +98,7 @@ public class GetHTMLElement public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder() .name("Output Type") - .description("Controls the type of value that is retrieved from the element. " + - ELEMENT_HTML + "," + ELEMENT_TEXT + ", " + ELEMENT_ATTRIBUTE + " or " + ELEMENT_DATA) + .description("Controls the type of DOM value that is retrieved from the HTML element.") .required(true) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE, ELEMENT_DATA) @@ -122,7 +134,7 @@ public class GetHTMLElement final Set relationships = new HashSet<>(); relationships.add(REL_ORIGINAL); relationships.add(REL_SUCCESS); - relationships.add(REL_FAILURE); + relationships.add(REL_INVALID_HTML); relationships.add(REL_NOT_FOUND); this.relationships = Collections.unmodifiableSet(relationships); } @@ -148,7 +160,7 @@ public class GetHTMLElement final Document doc = parseHTMLDocumentFromFlowfile(flowFile, context, session); final Elements eles = doc.select(context.getProperty(CSS_SELECTOR) - .evaluateAttributeExpressions().getValue()); + .evaluateAttributeExpressions(flowFile).getValue()); final String prependValue = context.getProperty(PREPEND_ELEMENT_VALUE) .evaluateAttributeExpressions(flowFile).getValue(); final String appendValue = context.getProperty(APPEND_ELEMENT_VALUE) @@ -159,7 +171,7 @@ public class GetHTMLElement session.transfer(flowFile, REL_NOT_FOUND); } else { for (final Element ele : eles) 
{ - final FlowFile ff = session.create(); + final FlowFile ff = session.create(flowFile); switch (context.getProperty(DESTINATION).getValue()) { case DESTINATION_ATTRIBUTE: @@ -171,7 +183,6 @@ public class GetHTMLElement ele, context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions() .getValue())); - session.getProvenanceReporter().create(atFlowfile); session.transfer(atFlowfile, REL_SUCCESS); break; case DESTINATION_CONTENT: @@ -187,12 +198,12 @@ public class GetHTMLElement context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions() .getValue()).getBytes()); } catch (Exception ex) { - session.transfer(ff, REL_FAILURE); + getLogger().error(ex.getMessage()); + session.transfer(ff, REL_INVALID_HTML); } } }); - session.getProvenanceReporter().create(conFlowfile); session.transfer(conFlowfile, REL_SUCCESS); break; } @@ -205,7 +216,7 @@ public class GetHTMLElement } catch (Exception ex) { getLogger().error(ex.getMessage()); - session.transfer(flowFile, REL_FAILURE); + session.transfer(flowFile, REL_INVALID_HTML); } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java index 425d8fa366..0829158c49 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java @@ -16,6 +16,8 @@ */ package org.apache.nifi; +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.behavior.WritesAttribute; import org.apache.nifi.annotation.behavior.WritesAttributes; import org.apache.nifi.annotation.documentation.CapabilityDescription; @@ -44,7 +46,17 @@ import java.util.HashSet; import java.util.Collections; @Tags({"modify", "html", 
"dom", "css", "element"}) -@CapabilityDescription("Modifies the value of an existing HTML element in the original input HTML") +@SupportsBatching +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Modifies the value of an existing HTML element. The desired element to be modified is located by" + + " using CSS selector syntax. The incoming HTML is first converted into a HTML Document Object Model so that HTML elements may be selected" + + " in the similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\"" + + " using the user defined CSS selector string to find the element the user desires to modify. If the HTML element is found" + + " the element's value is updated in the DOM using the value specified \"Modified Value\" property. All DOM elements" + + " that match the CSS selector will be updated. Once all of the DOM elements have been updated the DOM is rendered" + + " to HTML and the result replaces the flowfile content with the updated HTML. 
A more thorough reference for the" + + " CSS selector syntax can be found at" + + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"") @SeeAlso({GetHTMLElement.class, PutHTMLElement.class}) @WritesAttributes({@WritesAttribute(attribute="NumElementsModified", description="Total number of HTML " + "element modifications made")}) @@ -96,7 +108,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor { final Set relationships = new HashSet(); relationships.add(REL_ORIGINAL); relationships.add(REL_SUCCESS); - relationships.add(REL_FAILURE); relationships.add(REL_INVALID_HTML); relationships.add(REL_NOT_FOUND); this.relationships = Collections.unmodifiableSet(relationships); @@ -157,7 +168,7 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor { } catch (Exception ex) { getLogger().error(ex.getMessage()); - session.transfer(flowFile, REL_FAILURE); + session.transfer(flowFile, REL_INVALID_HTML); } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java index f0a8c398c4..2af31d5593 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java @@ -16,6 +16,8 @@ */ package org.apache.nifi; +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.SeeAlso; import org.apache.nifi.annotation.documentation.Tags; @@ -42,7 +44,15 @@ import java.util.HashSet; import java.util.Collections; @Tags({"put", "html", "dom", "css", "element"}) -@CapabilityDescription("Creates a new HTML element in the input HTML") +@SupportsBatching 
+@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Places a new HTML element in the existing HTML DOM. The desired position for the new HTML element is specified by" + + " using CSS selector syntax. The incoming HTML is first converted into a HTML Document Object Model so that HTML DOM location may be located" + + " in a similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\"" + + " using the user defined CSS selector string to find the position where the user desires to add the new HTML element." + + " Once the new HTML element is added to the DOM it is rendered to HTML and the result replaces the flowfile" + + " content with the updated HTML. A more thorough reference for the CSS selector syntax can be found at" + + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"") @SeeAlso({GetHTMLElement.class, ModifyHTMLElement.class}) public class PutHTMLElement extends AbstractHTMLProcessor { @@ -64,7 +74,7 @@ public class PutHTMLElement extends AbstractHTMLProcessor { .name("Put Value") .description("Value used when creating the new Element. Value should be a valid HTML element. 
" + "The text should be supplied unencoded: characters like '<', '>', etc will be properly HTML " + - "encoded in the output.") + "encoded in the resulting output.") .required(true) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(true) @@ -87,8 +97,8 @@ public class PutHTMLElement extends AbstractHTMLProcessor { final Set relationships = new HashSet(); relationships.add(REL_ORIGINAL); relationships.add(REL_SUCCESS); - relationships.add(REL_FAILURE); relationships.add(REL_INVALID_HTML); + relationships.add(REL_NOT_FOUND); this.relationships = Collections.unmodifiableSet(relationships); } @@ -120,10 +130,10 @@ public class PutHTMLElement extends AbstractHTMLProcessor { for (Element ele : eles) { switch (context.getProperty(PUT_LOCATION_TYPE).getValue()) { case APPEND_ELEMENT: - ele.append(context.getProperty(PUT_VALUE).evaluateAttributeExpressions().getValue()); + ele.append(context.getProperty(PUT_VALUE).evaluateAttributeExpressions(flowFile).getValue()); break; case PREPEND_ELEMENT: - ele.prepend(context.getProperty(PUT_VALUE).evaluateAttributeExpressions().getValue()); + ele.prepend(context.getProperty(PUT_VALUE).evaluateAttributeExpressions(flowFile).getValue()); break; } } @@ -142,7 +152,7 @@ public class PutHTMLElement extends AbstractHTMLProcessor { } catch (Exception ex) { getLogger().error(ex.getMessage()); - session.transfer(flowFile, REL_FAILURE); + session.transfer(flowFile, REL_INVALID_HTML); } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java index 88f4c632e4..10bc33ea16 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java @@ -16,15 +16,7 @@ */ package 
org.apache.nifi; -import org.apache.nifi.flowfile.FlowFile; -import org.apache.nifi.processor.ProcessSession; -import org.apache.nifi.processor.io.StreamCallback; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -public class AbstractHTMLTest { +public abstract class AbstractHTMLTest { protected final String ATL_WEATHER_TEXT = "Atlanta Weather"; protected final String GDR_WEATHER_TEXT = "Grand Rapids Weather"; @@ -33,42 +25,4 @@ public class AbstractHTMLTest { protected final String AUTHOR_NAME = "Jeremy Dyer"; protected final String ATL_ID = "ATL"; protected final String GDR_ID = "GDR"; - - protected final String HTML = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - " NiFi HTML Parsing Demo\n" + - " \n" + - " \n" + - "\n" + - " \n" + - "\n" + - " \n" + - "\n" + - "\n" + - "\n" + - " \n" + - "

Check out this weather! " + ATL_WEATHER_TEXT + "

\n" + - "

I guess it could be colder ... " + GDR_WEATHER_TEXT + "

\n" + - "
\n" + - "\n" + - ""; - - - protected FlowFile writeContentToNewFlowFile(final byte[] content, ProcessSession session) { - FlowFile ff = session.write(session.create(), new StreamCallback() { - @Override - public void process(InputStream in, OutputStream out) throws IOException { - out.write(content); - } - }); - return ff; - } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java index ae117683e3..1cc5d73ee6 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java @@ -16,20 +16,21 @@ */ package org.apache.nifi; -import org.apache.commons.lang3.StringUtils; -import org.apache.nifi.flowfile.FlowFile; -import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Selector; import org.junit.Before; import org.junit.Test; +import java.io.File; +import java.io.IOException; import java.lang.Exception; +import java.net.URL; import java.util.List; -import static org.junit.Assert.assertTrue; - public class TestGetHTMLElement extends AbstractHTMLTest { private TestRunner testRunner; @@ -43,19 +44,26 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.HTML_CHARSET, "UTF-8"); } + @Test + public void testCSSSelectorSyntaxValidator() throws IOException { + Document doc = Jsoup.parse(new URL("http://www.google.com"), 5000); + try { + doc.select("---jeremy"); + } catch (Selector.SelectorParseException ex) { + String mes = ex.getMessage(); + ex.printStackTrace(); + } + } + @Test public void 
testNoElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); //Bold element is not present in sample HTML -// testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, ""); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @@ -63,14 +71,11 @@ public class TestGetHTMLElement extends AbstractHTMLTest { public void testInvalidSelector() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "InvalidCSSSelectorSyntax"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @@ -78,14 +83,11 @@ public class TestGetHTMLElement extends AbstractHTMLTest { public void testSingleElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "head"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); 
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); } @@ -94,14 +96,11 @@ public class TestGetHTMLElement extends AbstractHTMLTest { public void testMultipleElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "a"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 3); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); } @@ -113,22 +112,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - MockFlowFile fff = ffs.get(0); - String atValue = fff.getAttribute(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME); - assertTrue(StringUtils.equals(ATL_WEATHER_LINK, atValue)); + ffs.get(0).assertAttributeEquals(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME, ATL_WEATHER_LINK); } @Test @@ -138,21 +131,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(ATL_WEATHER_LINK, data)); + ffs.get(0).assertContentEquals(ATL_WEATHER_LINK); } @Test @@ -164,21 +152,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); 
testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(PREPEND_VALUE + ATL_WEATHER_LINK, data)); + ffs.get(0).assertContentEquals(PREPEND_VALUE + ATL_WEATHER_LINK); } @Test @@ -189,14 +172,11 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @@ -210,21 +190,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(ATL_WEATHER_LINK + APPEND_VALUE, data)); + ffs.get(0).assertContentEquals(ATL_WEATHER_LINK + APPEND_VALUE); } @Test @@ -235,14 +210,11 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @@ -254,21 +226,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "Content"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + 
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(AUTHOR_NAME, data)); + ffs.get(0).assertContentEquals(AUTHOR_NAME); } @Test @@ -277,21 +244,16 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(ATL_WEATHER_TEXT, data)); + ffs.get(0).assertContentEquals(ATL_WEATHER_TEXT); } @Test @@ -300,20 +262,15 @@ public class TestGetHTMLElement extends AbstractHTMLTest { testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(GetHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); - assertTrue(ffs.size() == 1); - String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); - assertTrue(StringUtils.equals(GDR_WEATHER_TEXT, data)); + ffs.get(0).assertContentEquals(GDR_WEATHER_TEXT); } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java index 010107f819..a45feb3a34 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java @@ -17,8 +17,6 @@ package org.apache.nifi; import org.apache.commons.lang3.StringUtils; -import org.apache.nifi.flowfile.FlowFile; -import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunners; @@ -29,6 +27,7 @@ import org.jsoup.select.Elements; import org.junit.Before; import org.junit.Test; +import java.io.File; import java.util.List; import static 
org.junit.Assert.assertNotNull; @@ -54,14 +53,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); @@ -86,14 +82,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + MOD_VALUE + " \":trim()}"); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); @@ -116,14 +109,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, 
MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); @@ -147,14 +137,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); @@ -177,14 +164,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); 
testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 0); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 1); } @@ -196,14 +180,11 @@ public class TestModifyHTMLElement extends AbstractHTMLTest { testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(ModifyHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java index 1dcc085b87..b842ccb44d 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java @@ -17,8 +17,6 @@ package org.apache.nifi; import org.apache.commons.lang3.StringUtils; -import org.apache.nifi.flowfile.FlowFile; -import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import 
org.apache.nifi.util.TestRunners; @@ -29,6 +27,7 @@ import org.jsoup.select.Elements; import org.junit.Before; import org.junit.Test; +import java.io.File; import java.util.List; import static org.junit.Assert.assertTrue; @@ -50,14 +49,11 @@ public class TestPutHTMLElement extends AbstractHTMLTest { testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); @@ -80,14 +76,11 @@ public class TestPutHTMLElement extends AbstractHTMLTest { testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); @@ -110,14 +103,11 @@ public class TestPutHTMLElement extends AbstractHTMLTest { 
testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.APPEND_ELEMENT); testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); - ProcessSession session = testRunner.getProcessSessionFactory().createSession(); - FlowFile ff = writeContentToNewFlowFile(HTML.getBytes(), session); - - testRunner.enqueue(ff); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); - testRunner.assertTransferCount(PutHTMLElement.REL_FAILURE, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html new file mode 100644 index 0000000000..673f7cb055 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html @@ -0,0 +1,25 @@ + + + + NiFi HTML Parsing Demo + + + + + + + + + + +

Check out this weather! + Atlanta Weather +

+

I guess it could be colder ... + Grand Rapids Weather +

+
+ + \ No newline at end of file