diff --git a/nifi-assembly/LICENSE b/nifi-assembly/LICENSE index e21ecc95ad..ca4a8805a0 100644 --- a/nifi-assembly/LICENSE +++ b/nifi-assembly/LICENSE @@ -471,6 +471,38 @@ For details see http://www.abeautifulsite.net/ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +This product bundles 'jsoup' which is available under the MIT License. +For details see http://jsoup.org/ + + jsoup License + The jsoup code-base (include source and compiled packages) are distributed under the open source MIT license as described below. + + The MIT License + Copyright © 2009 - 2013 Jonathan Hedley (jonathan@hedley.net) + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + + This product bundles 'json2.js' which is available in the 'public domain'. 
For details see https://github.com/douglascrockford/JSON-js @@ -1052,4 +1084,30 @@ information can be found here: http://www.adobe.com/devnet/xmp/library/eula-xmp- OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file + THE POSSIBILITY OF SUCH DAMAGE. + + +This product bundles 'Jsoup' which is available under "The MIT license". More +information can be found here: http://jsoup.org/license + + The MIT License + + Copyright (c) 2009-2015, Jonathan Hedley + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
\ No newline at end of file diff --git a/nifi-assembly/pom.xml b/nifi-assembly/pom.xml index b8e83bd076..d649317fa2 100644 --- a/nifi-assembly/pom.xml +++ b/nifi-assembly/pom.xml @@ -162,6 +162,11 @@ language governing permissions and limitations under the License. --> nifi-http-context-map-nar nar + + org.apache.nifi + nifi-html-nar + nar + org.apache.nifi nifi-kite-nar diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml new file mode 100644 index 0000000000..97b432280b --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/pom.xml @@ -0,0 +1,40 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-html-bundle + 0.4.2-SNAPSHOT + + + nifi-html-nar + nar + + + + org.apache.nifi + nifi-standard-services-api-nar + nar + + + org.apache.nifi + nifi-html-processors + + + + diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/LICENSE b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/LICENSE new file mode 100644 index 0000000000..c62123ee47 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/LICENSE @@ -0,0 +1,240 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +APACHE NIFI SUBCOMPONENTS: + +The Apache NiFi project contains subcomponents with separate copyright +notices and license terms. Your use of the source code for these +subcomponents is subject to the terms and conditions of the following +licenses. + +This product bundles 'jsoup' which is available under the MIT License. +For details see http://jsoup.org/ + + jsoup License + The jsoup code-base (include source and compiled packages) are distributed under the open source MIT license as described below. + + The MIT License + Copyright © 2009 - 2013 Jonathan Hedley (jonathan@hedley.net) + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/NOTICE b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000000..894d3de784 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-nar/src/main/resources/META-INF/NOTICE @@ -0,0 +1,19 @@ +nifi-html-nar +Copyright 2015 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +****************** +Apache Software License v2 +****************** + +The following binary components are provided under the Apache Software License v2 + + (ASLv2) Apache Commons Lang + The following NOTICE information applies: + Apache Commons Lang + Copyright 2001-2014 The Apache Software Foundation + + This product includes software from the Spring Framework, + under the Apache License 2.0 (see: StringUtils.containsWhitespace()) diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml new file mode 100644 index 0000000000..77b769f4ee --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/pom.xml @@ -0,0 +1,71 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-html-bundle + 0.4.2-SNAPSHOT + + + nifi-html-processors + Support for parsing HTML documents + + + + org.jsoup + jsoup + + + org.apache.nifi + nifi-api + + + org.apache.nifi + nifi-processor-utils + + + org.apache.nifi + nifi-mock + test + + + org.slf4j + slf4j-simple + test + + + junit + junit + test + + + + + + + org.apache.rat + apache-rat-plugin + + + src/test/resources/Weather.html + + + + + + diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java new file mode 100644 index 0000000000..20dca29219 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nifi;
+
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.components.ValidationContext;
+import org.apache.nifi.components.ValidationResult;
+import org.apache.nifi.components.Validator;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.io.InputStreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Selector;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicReference;
+
+/**
+ * Base class for processors that work on HTML FlowFile content. It declares the property
+ * descriptors, relationships and CSS selector validator that subclasses register, and provides a
+ * helper that parses FlowFile content into a Jsoup {@link Document}.
+ */
+public abstract class AbstractHTMLProcessor extends AbstractProcessor {
+
+    // Labels for the kinds of value that can be taken from a selected element.
+    protected static final String ELEMENT_HTML = "HTML";
+    protected static final String ELEMENT_TEXT = "Text";
+    protected static final String ELEMENT_DATA = "Data";
+    protected static final String ELEMENT_ATTRIBUTE = "Attribute";
+
+    /**
+     * Validates that a property value is a CSS selector that jsoup can parse. A value containing
+     * Expression Language is accepted without parsing when the property supports it, because the
+     * final selector is only known once the expression has been evaluated.
+     */
+    protected static final Validator CSS_SELECTOR_VALIDATOR = new Validator() {
+        @Override
+        public ValidationResult validate(final String subject, final String value, final ValidationContext context) {
+            if (context.isExpressionLanguageSupported(subject) && context.isExpressionLanguagePresent(value)) {
+                return new ValidationResult.Builder().subject(subject).input(value).explanation("Expression Language Present").valid(true).build();
+            }
+
+            String reason = null;
+            try {
+                // Selecting against an empty document is enough to make jsoup parse the selector.
+                Document doc = Jsoup.parse("");
+                doc.select(value);
+            } catch (final Selector.SelectorParseException | IllegalArgumentException e) {
+                // jsoup signals a null or blank query with IllegalArgumentException rather than
+                // SelectorParseException; catching it keeps the validator from throwing and lets it
+                // report the value as invalid instead.
+                reason = "\"" + value + "\" is an invalid CSS selector";
+            }
+
+            return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build();
+        }
+    };
+
+    /** Base URL handed to jsoup when the document is parsed. */
+    public static final PropertyDescriptor URL = new PropertyDescriptor
+            .Builder().name("URL")
+            .description("Base URL for the HTML page being parsed.")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    /** CSS selector used to pick elements; supports Expression Language. */
+    public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
+            .Builder().name("CSS Selector")
+            .description("CSS selector syntax string used to extract the desired HTML element(s).")
+            .required(true)
+            .addValidator(CSS_SELECTOR_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    /** Character set name handed to jsoup when the FlowFile content is decoded. */
+    public static final PropertyDescriptor HTML_CHARSET = new PropertyDescriptor
+            .Builder().name("HTML Character Encoding")
+            .description("Character encoding of the input HTML")
+            .defaultValue("UTF-8")
+            .required(true)
+            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
+            .build();
+
+    public static final Relationship REL_ORIGINAL = new Relationship.Builder()
+            .name("original")
+            .description("The original HTML input")
+            .build();
+
+    public static final Relationship REL_SUCCESS = new Relationship.Builder()
+            .name("success")
+            .description("Successfully parsed HTML element")
+            .build();
+
+    public static final Relationship REL_INVALID_HTML = new Relationship.Builder()
+            .name("invalid html")
+            .description("The input HTML syntax is invalid")
+            .build();
+
+    // The relationship name is taken explicitly instead of relying on Relationship.toString().
+    public static final Relationship REL_NOT_FOUND = new Relationship.Builder()
+            .name("element not found")
+            .description("Element could not be found in the HTML document. The original HTML input will remain "
+                    + "in the flowfile content unchanged. Relationship '" + REL_ORIGINAL.getName() + "' will not be invoked "
+                    + "in this scenario.")
+            .build();
+
+    /**
+     * Parses the Jsoup HTML document from the FlowFile input content.
+     *
+     * @param inputFlowFile Input FlowFile containing the HTML
+     * @param context ProcessContext supplying the {@link #HTML_CHARSET} and {@link #URL} property values
+     * @param session ProcessSession used to read the FlowFile content
+     *
+     * @return Jsoup Document
+     */
+    protected Document parseHTMLDocumentFromFlowfile(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
+        // The callback is an anonymous class, so the parsed document is passed back through a holder.
+        final AtomicReference<Document> doc = new AtomicReference<>();
+        session.read(inputFlowFile, new InputStreamCallback() {
+            @Override
+            public void process(InputStream inputStream) throws IOException {
+                doc.set(Jsoup.parse(inputStream,
+                        context.getProperty(HTML_CHARSET).getValue(),
+                        context.getProperty(URL).getValue()));
+            }
+        });
+        return doc.get();
+    }
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
new file mode 100644
index 0000000000..1d421a0f16
--- /dev/null
+++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nifi.annotation.behavior.InputRequirement;
+import org.apache.nifi.annotation.behavior.SupportsBatching;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.io.StreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Selects elements from the HTML in a FlowFile with a CSS selector and emits one new FlowFile per
+ * matching element, carrying the extracted value either as an attribute or as content.
+ */
+@SupportsBatching
+@Tags({"get", "html", "dom", "css", "element"})
+@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
+@CapabilityDescription("Extracts HTML element values from the incoming flowfile's content using a CSS selector."
+        + " The incoming HTML is first converted into a HTML Document Object Model so that HTML elements may be selected"
+        + " in the similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\""
+        + " using the user defined CSS selector string. The result of \"querying\" the HTML DOM may produce 0-N results."
+        + " If no results are found the flowfile will be transferred to the \"element not found\" relationship to indicate"
+        + " so to the end user. If N results are found a new flowfile will be created and emitted for each result. The query result will"
+        + " either be placed in the content of the new flowfile or as an attribute of the new flowfile. By default the result is written to an"
+        + " attribute. This can be controlled by the \"Destination\" property. Resulting query values may also have data"
+        + " prepended or appended to them by setting the value of property \"Prepend Element Value\" or \"Append Element Value\"."
+        + " Prepended and appended values are treated as string values and concatenated to the result retrieved from the"
+        + " HTML DOM query operation. A more thorough reference for the CSS selector syntax can be found at"
+        + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"")
+@SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class})
+@WritesAttributes({@WritesAttribute(attribute="HTMLElement", description="Flowfile attribute where the element result"
+        + " parsed from the HTML using the CSS selector syntax are placed if the destination is a flowfile attribute.")})
+public class GetHTMLElement
+        extends AbstractHTMLProcessor {
+
+    /** Name of the FlowFile attribute that receives the extracted value when the destination is an attribute. */
+    public static final String HTML_ELEMENT_ATTRIBUTE_NAME = "HTMLElement";
+    /** "Destination" value: write the extracted value to the {@link #HTML_ELEMENT_ATTRIBUTE_NAME} attribute. */
+    public static final String DESTINATION_ATTRIBUTE = "flowfile-attribute";
+    /** "Destination" value: write the extracted value to the content of the new FlowFile. */
+    public static final String DESTINATION_CONTENT = "flowfile-content";
+
+    public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new PropertyDescriptor
+            .Builder().name("Prepend Element Value")
+            .description("Prepends the specified value to the resulting Element")
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new PropertyDescriptor
+            .Builder().name("Append Element Value")
+            .description("Appends the specified value to the resulting Element")
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+    public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor
+            .Builder().name("Attribute Name")
+            .description(("When getting the value of a HTML element attribute this value is used as the key to determine"
+                    + " which attribute on the selected element should be retrieved. This value is used when the \"Output Type\""
+                    + " is set to \"" + ELEMENT_ATTRIBUTE + "\""))
+            .required(false)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
+            .build();
+
+
+    public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder()
+            .name("Output Type")
+            .description("Controls the type of DOM value that is retrieved from the HTML element.")
+            .required(true)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE, ELEMENT_DATA)
+            .defaultValue(ELEMENT_HTML)
+            .build();
+
+    public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder()
+            .name("Destination")
+            .description("Control if element extracted is written as a flowfile attribute or "
+                    + "as flowfile content.")
+            .required(true)
+            .allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
+            .defaultValue(DESTINATION_ATTRIBUTE)
+            .build();
+
+    // Both collections are built once in init() and exposed as unmodifiable views.
+    private List<PropertyDescriptor> descriptors;
+
+    private Set<Relationship> relationships;
+
+    /**
+     * Registers the supported property descriptors and relationships.
+     *
+     * @param context initialization context (not used here)
+     */
+    @Override
+    protected void init(final ProcessorInitializationContext context) {
+        final List<PropertyDescriptor> descriptors = new ArrayList<>();
+        descriptors.add(URL);
+        descriptors.add(CSS_SELECTOR);
+        descriptors.add(HTML_CHARSET);
+        descriptors.add(OUTPUT_TYPE);
+        descriptors.add(DESTINATION);
+        descriptors.add(PREPEND_ELEMENT_VALUE);
+        descriptors.add(APPEND_ELEMENT_VALUE);
+        descriptors.add(ATTRIBUTE_KEY);
+        this.descriptors = Collections.unmodifiableList(descriptors);
+
+        final Set<Relationship> relationships = new HashSet<>();
+        relationships.add(REL_ORIGINAL);
+        relationships.add(REL_SUCCESS);
+        relationships.add(REL_INVALID_HTML);
+        relationships.add(REL_NOT_FOUND);
+        this.relationships = Collections.unmodifiableSet(relationships);
+    }
+
+    @Override
+    public Set<Relationship> getRelationships() {
+        return this.relationships;
+    }
+
+    @Override
+    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
+        return descriptors;
+    }
+
+    /**
+     * Parses the incoming FlowFile as HTML, applies the configured CSS selector and routes the result:
+     * a parse or select failure goes to {@link #REL_INVALID_HTML}, no match goes to
+     * {@link #REL_NOT_FOUND}, otherwise one child FlowFile per matching element goes to
+     * {@link #REL_SUCCESS} and the input FlowFile goes to {@link #REL_ORIGINAL}.
+     */
+    @Override
+    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
+        final FlowFile flowFile = session.get();
+        if ( flowFile == null ) {
+            return;
+        }
+
+        final Document doc;
+        final Elements eles;
+
+        try {
+            doc = parseHTMLDocumentFromFlowfile(flowFile, context, session);
+            eles = doc.select(context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue());
+        } catch (final Exception ex) {
+            getLogger().error("Failed to extract HTML from {} due to {}; routing to {}", new Object[] {flowFile, ex, REL_INVALID_HTML}, ex);
+            session.transfer(flowFile, REL_INVALID_HTML);
+            return;
+        }
+
+        final String prependValue = context.getProperty(PREPEND_ELEMENT_VALUE).evaluateAttributeExpressions(flowFile).getValue();
+        final String appendValue = context.getProperty(APPEND_ELEMENT_VALUE).evaluateAttributeExpressions(flowFile).getValue();
+        final String outputType = context.getProperty(OUTPUT_TYPE).getValue();
+        final String attributeKey = context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions(flowFile).getValue();
+        // The destination does not depend on the element, so it is read once rather than per match.
+        final String destination = context.getProperty(DESTINATION).getValue();
+
+        if (eles == null || eles.isEmpty()) {
+            // No element found
+            session.transfer(flowFile, REL_NOT_FOUND);
+        } else {
+            // Create a new FlowFile for each matching element.
+            for (final Element ele : eles) {
+                final String extractedElementValue = extractElementValue(prependValue, outputType, appendValue, ele, attributeKey);
+
+                final FlowFile ff = session.create(flowFile);
+                FlowFile updatedFF = ff;
+
+                switch (destination) {
+                    case DESTINATION_ATTRIBUTE:
+                        updatedFF = session.putAttribute(ff, HTML_ELEMENT_ATTRIBUTE_NAME, extractedElementValue);
+                        break;
+                    case DESTINATION_CONTENT:
+                        updatedFF = session.write(ff, new StreamCallback() {
+                            @Override
+                            public void process(final InputStream inputStream, final OutputStream outputStream) throws IOException {
+                                outputStream.write(extractedElementValue.getBytes(StandardCharsets.UTF_8));
+                            }
+                        });
+
+                        break;
+                }
+
+                session.transfer(updatedFF, REL_SUCCESS);
+            }
+
+            // Transfer the original HTML
+            session.transfer(flowFile, REL_ORIGINAL);
+        }
+    }
+
+
+    /**
+     * Extracts the HTML value based on the configuration values.
+     *
+     * @param prependValue text placed before the extracted value; null or empty means none
+     * @param outputType one of the "Output Type" allowable values; any other value is treated as HTML
+     * @param appendValue text placed after the extracted value; null or empty means none
+     * @param ele the selected element
+     * @param attrKey attribute to read when the output type is "Attribute"; may be null
+     *
+     * @return value from the parsed HTML element
+     */
+    private String extractElementValue(String prependValue, final String outputType, String appendValue, final Element ele, final String attrKey) {
+        if (StringUtils.isEmpty(prependValue)) {
+            prependValue = "";
+        }
+        if (StringUtils.isEmpty(appendValue)) {
+            appendValue = "";
+        }
+
+        switch (outputType) {
+            case ELEMENT_HTML:
+                return prependValue + ele.html() + appendValue;
+            case ELEMENT_TEXT:
+                return prependValue + ele.text() + appendValue;
+            case ELEMENT_DATA:
+                return prependValue + ele.data() + appendValue;
+            case ELEMENT_ATTRIBUTE:
+                // "Attribute Name" is optional, so the key can be null here, and jsoup's attr(String)
+                // rejects a null key. Fall back to the empty string jsoup returns for an attribute
+                // that is not present instead of letting the exception escape onTrigger.
+                return prependValue + (attrKey == null ? "" : ele.attr(attrKey)) + appendValue;
+            default:
+                return prependValue + ele.html() + appendValue;
+        }
+    }
+
+}
diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
new file mode 100644
index 0000000000..e84d4edd53
--- /dev/null
+++
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nifi; + +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.SupportsBatching; +import org.apache.nifi.annotation.behavior.WritesAttribute; +import org.apache.nifi.annotation.behavior.WritesAttributes; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.SeeAlso; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessContext; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.processor.Relationship; +import org.apache.nifi.processor.ProcessorInitializationContext; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.io.StreamCallback; +import org.apache.nifi.processor.util.StandardValidators; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import 
java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; +import java.util.Collections; + +@Tags({"modify", "html", "dom", "css", "element"}) +@SupportsBatching +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Modifies the value of an existing HTML element. The desired element to be modified is located by" + + " using CSS selector syntax. The incoming HTML is first converted into a HTML Document Object Model so that HTML elements may be selected" + + " in the similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\"" + + " using the user defined CSS selector string to find the element the user desires to modify. If the HTML element is found" + + " the element's value is updated in the DOM using the value specified \"Modified Value\" property. All DOM elements" + + " that match the CSS selector will be updated. Once all of the DOM elements have been updated the DOM is rendered" + + " to HTML and the result replaces the flowfile content with the updated HTML. 
A more thorough reference for the" + + " CSS selector syntax can be found at" + + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"") +@SeeAlso({GetHTMLElement.class, PutHTMLElement.class}) +@WritesAttributes({@WritesAttribute(attribute="NumElementsModified", description="Total number of HTML " + + "element modifications made")}) +public class ModifyHTMLElement extends AbstractHTMLProcessor { + + public static final String NUM_ELEMENTS_MODIFIED_ATTR = "NumElementsModified"; + + public static final PropertyDescriptor OUTPUT_TYPE = new PropertyDescriptor.Builder() + .name("Output Type") + .description("Controls whether the HTML element is output as " + + ELEMENT_HTML + "," + ELEMENT_TEXT + " or " + ELEMENT_DATA) + .required(true) + .allowableValues(ELEMENT_HTML, ELEMENT_TEXT, ELEMENT_ATTRIBUTE) + .defaultValue(ELEMENT_HTML) + .build(); + + public static final PropertyDescriptor MODIFIED_VALUE = new PropertyDescriptor + .Builder().name("Modified Value") + .description("Value to update the found HTML elements with") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .expressionLanguageSupported(true) + .build(); + + public static final PropertyDescriptor ATTRIBUTE_KEY = new PropertyDescriptor + .Builder().name("Attribute Name") + .description(("When modifying the value of an element attribute this value is used as the key to determine" + + " which attribute on the selected element will be modified with the new value.")) + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .expressionLanguageSupported(true) + .build(); + + private List descriptors; + + private Set relationships; + + @Override + protected void init(final ProcessorInitializationContext context) { + final List descriptors = new ArrayList<>(); + descriptors.add(URL); + descriptors.add(CSS_SELECTOR); + descriptors.add(HTML_CHARSET); + descriptors.add(OUTPUT_TYPE); + descriptors.add(MODIFIED_VALUE); + descriptors.add(ATTRIBUTE_KEY); + 
this.descriptors = Collections.unmodifiableList(descriptors); + + final Set relationships = new HashSet(); + relationships.add(REL_ORIGINAL); + relationships.add(REL_SUCCESS); + relationships.add(REL_INVALID_HTML); + relationships.add(REL_NOT_FOUND); + this.relationships = Collections.unmodifiableSet(relationships); + } + + @Override + public Set getRelationships() { + return this.relationships; + } + + @Override + public final List getSupportedPropertyDescriptors() { + return descriptors; + } + + @Override + public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException { + final FlowFile flowFile = session.get(); + if (flowFile == null) { + return; + } + + final Document doc; + final Elements eles; + try { + doc = parseHTMLDocumentFromFlowfile(flowFile, context, session); + eles = doc.select(context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue()); + } catch (Exception ex) { + getLogger().error("Failed to extract HTML from {} due to {}; routing to {}", new Object[] {flowFile, ex.toString(), REL_INVALID_HTML.getName()}, ex); + session.transfer(flowFile, REL_INVALID_HTML); + return; + } + + final String modifiedValue = context.getProperty(MODIFIED_VALUE).evaluateAttributeExpressions(flowFile).getValue(); + + if (eles == null || eles.size() == 0) { + // No element found + session.transfer(flowFile, REL_NOT_FOUND); + } else { + for (Element ele : eles) { + switch (context.getProperty(OUTPUT_TYPE).getValue()) { + case ELEMENT_HTML: + ele.html(modifiedValue); + break; + case ELEMENT_ATTRIBUTE: + ele.attr(context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions(flowFile).getValue(), modifiedValue); + break; + case ELEMENT_TEXT: + ele.text(modifiedValue); + break; + } + } + + FlowFile ff = session.write(session.create(flowFile), new StreamCallback() { + @Override + public void process(InputStream in, OutputStream out) throws IOException { + 
out.write(doc.html().getBytes(StandardCharsets.UTF_8)); + } + }); + ff = session.putAttribute(ff, NUM_ELEMENTS_MODIFIED_ATTR, new Integer(eles.size()).toString()); + session.transfer(ff, REL_SUCCESS); + + // Transfer the original HTML + session.transfer(flowFile, REL_ORIGINAL); + } + } + +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java new file mode 100644 index 0000000000..995fc9953b --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi; + +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.SupportsBatching; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.SeeAlso; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.ProcessContext; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.processor.Relationship; +import org.apache.nifi.processor.ProcessorInitializationContext; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.io.StreamCallback; +import org.apache.nifi.processor.util.StandardValidators; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.HashSet; +import java.util.Collections; + +@Tags({"put", "html", "dom", "css", "element"}) +@SupportsBatching +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Places a new HTML element in the existing HTML DOM. The desired position for the new HTML element is specified by" + + " using CSS selector syntax. The incoming HTML is first converted into a HTML Document Object Model so that HTML DOM location may be located" + + " in a similar manner that CSS selectors are used to apply styles to HTML. The resulting HTML DOM is then \"queried\"" + + " using the user defined CSS selector string to find the position where the user desires to add the new HTML element." 
+ + " Once the new HTML element is added to the DOM it is rendered to HTML and the result replaces the flowfile" + + " content with the updated HTML. A more thorough reference for the CSS selector syntax can be found at" + + " \"http://jsoup.org/apidocs/org/jsoup/select/Selector.html\"") +@SeeAlso({GetHTMLElement.class, ModifyHTMLElement.class}) +public class PutHTMLElement extends AbstractHTMLProcessor { + + public static final String APPEND_ELEMENT = "append-html"; + public static final String PREPEND_ELEMENT = "prepend-html"; + + public static final PropertyDescriptor PUT_LOCATION_TYPE = new PropertyDescriptor.Builder() + .name("Element Insert Location Type") + .description("Controls whether the new element is prepended or appended to the children of the " + + "Element located by the CSS selector. EX: prepended value 'Hi' inside of " + + "Element (using CSS Selector 'p') '

There

' would result in " + + "'

HiThere

'. Appending the value would result in '

ThereHi

'") + .required(true) + .allowableValues(APPEND_ELEMENT, PREPEND_ELEMENT) + .defaultValue(APPEND_ELEMENT) + .build(); + + public static final PropertyDescriptor PUT_VALUE = new PropertyDescriptor.Builder() + .name("Put Value") + .description("Value used when creating the new Element. Value should be a valid HTML element. " + + "The text should be supplied unencoded: characters like '<', '>', etc will be properly HTML " + + "encoded in the resulting output.") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .expressionLanguageSupported(true) + .build(); + + private List descriptors; + + private Set relationships; + + @Override + protected void init(final ProcessorInitializationContext context) { + final List descriptors = new ArrayList(); + descriptors.add(URL); + descriptors.add(CSS_SELECTOR); + descriptors.add(HTML_CHARSET); + descriptors.add(PUT_LOCATION_TYPE); + descriptors.add(PUT_VALUE); + this.descriptors = Collections.unmodifiableList(descriptors); + + final Set relationships = new HashSet(); + relationships.add(REL_ORIGINAL); + relationships.add(REL_SUCCESS); + relationships.add(REL_INVALID_HTML); + relationships.add(REL_NOT_FOUND); + this.relationships = Collections.unmodifiableSet(relationships); + } + + @Override + public Set getRelationships() { + return this.relationships; + } + + @Override + public final List getSupportedPropertyDescriptors() { + return descriptors; + } + + @Override + public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException { + final FlowFile flowFile = session.get(); + if (flowFile == null) { + return; + } + + final Document doc; + final Elements eles; + try { + doc = parseHTMLDocumentFromFlowfile(flowFile, context, session); + eles = doc.select(context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions().getValue()); + } catch (Exception ex) { + getLogger().error("Failed to extract HTML from {} due to {}; routing to {}", new Object[] {flowFile, 
ex.toString(), REL_INVALID_HTML.getName()}, ex); + session.transfer(flowFile, REL_INVALID_HTML); + return; + } + + + if (eles == null || eles.isEmpty()) { + // No element found + session.transfer(flowFile, REL_NOT_FOUND); + } else { + final String putValue = context.getProperty(PUT_VALUE).evaluateAttributeExpressions(flowFile).getValue(); + + for (final Element ele : eles) { + switch (context.getProperty(PUT_LOCATION_TYPE).getValue()) { + case APPEND_ELEMENT: + ele.append(putValue); + break; + case PREPEND_ELEMENT: + ele.prepend(putValue); + break; + } + } + + FlowFile ff = session.write(session.create(flowFile), new StreamCallback() { + @Override + public void process(final InputStream in, final OutputStream out) throws IOException { + out.write(doc.html().getBytes(StandardCharsets.UTF_8)); + } + }); + + session.transfer(ff, REL_SUCCESS); + + // Transfer the original HTML + session.transfer(flowFile, REL_ORIGINAL); + } + } +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor new file mode 100644 index 0000000000..aea106050c --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +org.apache.nifi.GetHTMLElement +org.apache.nifi.ModifyHTMLElement +org.apache.nifi.PutHTMLElement \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java new file mode 100644 index 0000000000..10bc33ea16 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/AbstractHTMLTest.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Shared expected values for the HTML processor tests.
 *
 * <p>The values are constants, so they are declared {@code static final} rather than being
 * re-created as instance fields for every test object. Subclasses keep referring to them by
 * their unqualified names.
 */
public abstract class AbstractHTMLTest {

    /** Expected text of the Atlanta weather link. */
    protected static final String ATL_WEATHER_TEXT = "Atlanta Weather";
    /** Expected text of the Grand Rapids weather link. */
    protected static final String GDR_WEATHER_TEXT = "Grand Rapids Weather";
    /** Expected href of the Atlanta weather link. */
    protected static final String ATL_WEATHER_LINK = "http://w1.weather.gov/obhistory/KPDK.html";
    /** Expected href of the Grand Rapids weather link. */
    protected static final String GR_WEATHER_LINK = "http://w1.weather.gov/obhistory/KGRR.html";
    /** Expected author name. */
    protected static final String AUTHOR_NAME = "Jeremy Dyer";
    /** Element id used to select the Atlanta link. */
    protected static final String ATL_ID = "ATL";
    /** Element id used to select the Grand Rapids link. */
    protected static final String GDR_ID = "GDR";
}
+ */ +package org.apache.nifi; + +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Selector; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.lang.Exception; +import java.net.URL; +import java.util.List; + +public class TestGetHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(GetHTMLElement.class); + testRunner.setProperty(GetHTMLElement.URL, "http://localhost"); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.HTML_CHARSET, "UTF-8"); + } + + @Test + public void testCSSSelectorSyntaxValidator() throws IOException { + Document doc = Jsoup.parse(new URL("http://www.google.com"), 5000); + try { + doc.select("---jeremy"); + } catch (Selector.SelectorParseException ex) { + ex.printStackTrace(); + } + } + + @Test + public void testNoElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); //Bold element is not present in sample HTML + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testInvalidSelector() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "InvalidCSSSelectorSyntax"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + 
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testSingleElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "head"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + } + + @Test + public void testMultipleElementFound() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "a"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 3); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + } + + @Test + public void testElementFoundWriteToAttribute() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + 
ffs.get(0).assertAttributeEquals(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME, ATL_WEATHER_LINK); + } + + @Test + public void testElementFoundWriteToContent() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals(ATL_WEATHER_LINK); + } + + @Test + public void testValidPrependValueToFoundElement() throws Exception { + final String PREPEND_VALUE = "TestPrepend"; + testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + 
ffs.get(0).assertContentEquals(PREPEND_VALUE + ATL_WEATHER_LINK); + } + + @Test + public void testValidPrependValueToNotFoundElement() throws Exception { + final String PREPEND_VALUE = "TestPrepend"; + testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testValidAppendValueToFoundElement() throws Exception { + final String APPEND_VALUE = "TestAppend"; + testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals(ATL_WEATHER_LINK + APPEND_VALUE); + } + + @Test + public void testValidAppendValueToNotFoundElement() throws Exception 
{ + final String APPEND_VALUE = "TestAppend"; + testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testExtractAttributeFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "meta[name=author]"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "Content"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals(AUTHOR_NAME); + } + + @Test + public void testExtractTextFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); + + testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals(ATL_WEATHER_TEXT); + } + + @Test + public void testExtractHTMLFromElement() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals(GDR_WEATHER_TEXT); + } +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java new file mode 100644 index 0000000000..a45feb3a34 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestModifyHTMLElement.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nifi; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.List; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +public class TestModifyHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(ModifyHTMLElement.class); + testRunner = TestRunners.newTestRunner(ModifyHTMLElement.class); + testRunner.setProperty(ModifyHTMLElement.URL, "http://localhost"); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.HTML_CHARSET, "UTF-8"); + } + + @Test + public void testModifyText() throws Exception { + final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + 
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + ATL_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); + } + + @Test + public void testModifyHTMLWithExpressionLanguage() throws Exception { + + final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; + + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + MOD_VALUE + " \":trim()}"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + ATL_ID); + Element ele = eles.get(0); + + assertNotNull(ele.text()); + } + + @Test + public void testModifyHTML() throws Exception { + final String MOD_VALUE = "Newly modified HTML to replace " + GDR_WEATHER_TEXT; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.html())); + } + + @Test + public void testModifyAttribute() throws Exception { + final String MOD_VALUE = "http://localhost/newlink"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href"); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.attr("href"))); + } + + @Test + public void testModifyElementNotFound() throws Exception { + final String MOD_VALUE = "http://localhost/newlink"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "b"); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 1); + } + + @Test + public void testModifyValueContainsHTMLCharacters() throws Exception { + final String MOD_VALUE = "Text that contains > and < characters"; + testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); + testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); + testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. 
+ Document doc = Jsoup.parse(data); + Elements eles = doc.select("#" + GDR_ID); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); + assertTrue(StringUtils.equals(MOD_VALUE.replace(">", ">").replace("<", "<"), ele.html())); + } + +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java new file mode 100644 index 0000000000..b842ccb44d --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestPutHTMLElement.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.List; +import static org.junit.Assert.assertTrue; + + +public class TestPutHTMLElement extends AbstractHTMLTest { + + private TestRunner testRunner; + + @Before + public void init() { + testRunner = TestRunners.newTestRunner(PutHTMLElement.class); + testRunner.setProperty(PutHTMLElement.URL, "http://localhost"); + } + + @Test + public void testAddNewElementToRoot() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "body"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("body > p"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(MOD_VALUE.replace("

", "").replace("

", ""), ele.html())); + } + + @Test + public void testPrependPElementToDiv() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.PREPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#put"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals("

modified value

\n", ele.html())); + } + + @Test + public void testAppendPElementToDiv() throws Exception { + final String MOD_VALUE = "

modified value

"; + testRunner.setProperty(PutHTMLElement.CSS_SELECTOR, "#put"); + testRunner.setProperty(PutHTMLElement.PUT_LOCATION_TYPE, PutHTMLElement.APPEND_ELEMENT); + testRunner.setProperty(PutHTMLElement.PUT_VALUE, MOD_VALUE); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(PutHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(PutHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(PutHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(PutHTMLElement.REL_SUCCESS); + assertTrue(ffs.size() == 1); + String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); + + //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. + Document doc = Jsoup.parse(data); + Elements eles = doc.select("#put"); + Element ele = eles.get(0); + + assertTrue(StringUtils.equals(" \n" + + "

modified value

", ele.html())); + } + +} diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html new file mode 100644 index 0000000000..673f7cb055 --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/resources/Weather.html @@ -0,0 +1,25 @@ + + + + NiFi HTML Parsing Demo + + + + + + + + + + +

Check out this weather! + Atlanta Weather +

+

I guess it could be colder ... + Grand Rapids Weather +

+
+ + \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-html-bundle/pom.xml b/nifi-nar-bundles/nifi-html-bundle/pom.xml new file mode 100644 index 0000000000..95d61ded0e --- /dev/null +++ b/nifi-nar-bundles/nifi-html-bundle/pom.xml @@ -0,0 +1,43 @@ + + + + 4.0.0 + + + org.apache.nifi + nifi-nar-bundles + 0.4.2-SNAPSHOT + + + nifi-html-bundle + pom + + + nifi-html-processors + nifi-html-nar + + + + + + org.apache.nifi + nifi-html-processors + 0.4.2-SNAPSHOT + + + + + diff --git a/nifi-nar-bundles/pom.xml b/nifi-nar-bundles/pom.xml index 96ab012dd9..3bc915b7f5 100644 --- a/nifi-nar-bundles/pom.xml +++ b/nifi-nar-bundles/pom.xml @@ -50,6 +50,7 @@ nifi-azure-bundle nifi-ldap-iaa-providers-bundle nifi-riemann-bundle + nifi-html-bundle diff --git a/pom.xml b/pom.xml index 213888b65b..501593fa63 100644 --- a/pom.xml +++ b/pom.xml @@ -753,6 +753,11 @@ language governing permissions and limitations under the License. --> spark-streaming_2.10 1.3.1
+ + org.jsoup + jsoup + 1.8.3 + org.apache.nifi nifi-api @@ -889,6 +894,12 @@ language governing permissions and limitations under the License. --> 0.4.2-SNAPSHOT nar + + org.apache.nifi + nifi-html-nar + 0.4.2-SNAPSHOT + nar + org.apache.nifi nifi-kite-nar @@ -1481,4 +1492,4 @@ language governing permissions and limitations under the License. --> - + \ No newline at end of file