From 4bf267c8bb11939d715c02c3ca818dec372bec26 Mon Sep 17 00:00:00 2001 From: Koji Kawamura Date: Fri, 25 Nov 2016 17:58:32 +0900 Subject: [PATCH] NIFI-3101: This closes #1271. Improve Get/Modify/PutHTMLElement URL - Added detailed description about how the URL property works with GetHTMLElement - Added Expression Language support for URL - Made URL property dynamic with ModifyHTMLElement and PutHTMLElement, since it is not used to alter HTML elements and need not be specified. Making it a dynamic property lets existing processor configurations stay valid --- .../apache/nifi/AbstractHTMLProcessor.java | 16 +++- .../java/org/apache/nifi/GetHTMLElement.java | 8 +- .../org/apache/nifi/ModifyHTMLElement.java | 11 ++- .../java/org/apache/nifi/PutHTMLElement.java | 11 ++- .../org/apache/nifi/TestGetHTMLElement.java | 80 +++++++++++++++++++ 5 files changed, 121 insertions(+), 5 deletions(-) diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java index 127f0d86a0..8ad6f8a44e 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java @@ -63,9 +63,12 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor { public static final PropertyDescriptor URL = new PropertyDescriptor .Builder().name("URL") - .description("Base URL for the HTML page being parsed.") + .description("Base URL for the HTML page being parsed." 
+ + " This URL will be used to resolve an absolute URL" + + " when an attribute value is extracted from a HTML element.") .required(true) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .expressionLanguageSupported(true) .build(); public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor @@ -120,11 +123,20 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor { session.read(inputFlowFile, new InputStreamCallback() { @Override public void process(InputStream inputStream) throws IOException { + final String baseUrl = getBaseUrl(inputFlowFile, context); + if (baseUrl == null || baseUrl.isEmpty()) { + throw new RuntimeException("Base URL was empty."); + } doc.set(Jsoup.parse(inputStream, context.getProperty(HTML_CHARSET).getValue(), - context.getProperty(URL).getValue())); + baseUrl)); } }); return doc.get(); } + + + protected String getBaseUrl(final FlowFile inputFlowFile, final ProcessContext context) { + return "http://localhost/"; + } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java index 1d421a0f16..713fabd0d7 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java @@ -92,7 +92,9 @@ public class GetHTMLElement .Builder().name("Attribute Name") .description(("When getting the value of a HTML element attribute this value is used as the key to determine" + " which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" + - " is set to \"" + ELEMENT_ATTRIBUTE + "\"")) + " is set to \"" + ELEMENT_ATTRIBUTE + "\"." 
+ + " If this value is prefixed with 'abs:', then the extracted attribute value will be converted into" + + " an absolute URL form using the specified base URL.")) .required(false) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(true) @@ -238,4 +240,8 @@ public class GetHTMLElement } } + @Override + protected String getBaseUrl(FlowFile inputFlowFile, ProcessContext context) { + return context.getProperty(URL).evaluateAttributeExpressions(inputFlowFile).getValue(); + } } diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java index e84d4edd53..7f6e12e837 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java @@ -98,7 +98,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor { @Override protected void init(final ProcessorInitializationContext context) { final List descriptors = new ArrayList<>(); - descriptors.add(URL); descriptors.add(CSS_SELECTOR); descriptors.add(HTML_CHARSET); descriptors.add(OUTPUT_TYPE); @@ -124,6 +123,16 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor { return descriptors; } + /** + * This processor used to support URL property, but it has been removed + * since it's not required when altering HTML elements. + * Support URL as dynamic property so that existing data flow can stay in valid state without modification. 
+ */ + @Override + protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) { + return URL; + } + @Override public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException { final FlowFile flowFile = session.get(); diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java index 995fc9953b..bc9b70c7bc 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java @@ -88,7 +88,6 @@ public class PutHTMLElement extends AbstractHTMLProcessor { @Override protected void init(final ProcessorInitializationContext context) { final List descriptors = new ArrayList(); - descriptors.add(URL); descriptors.add(CSS_SELECTOR); descriptors.add(HTML_CHARSET); descriptors.add(PUT_LOCATION_TYPE); @@ -113,6 +112,16 @@ public class PutHTMLElement extends AbstractHTMLProcessor { return descriptors; } + /** + * This processor used to support URL property, but it has been removed + * since it's not required when altering HTML elements. + * Support URL as dynamic property so that existing data flow can stay in valid state without modification. 
+ */ + @Override + protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) { + return URL; + } + @Override public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException { final FlowFile flowFile = session.get(); diff --git a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java index 4b215fda8e..2b2706d060 100644 --- a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java +++ b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java @@ -19,7 +19,9 @@ package org.apache.nifi; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; @@ -232,6 +234,84 @@ public class TestGetHTMLElement extends AbstractHTMLTest { ffs.get(0).assertContentEquals(AUTHOR_NAME); } + @Test + public void testExtractAttributeFromElementRelativeUrl() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals("js/scripts.js"); + } + + @Test + public void testExtractAttributeFromElementAbsoluteUrl() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals("http://localhost/js/scripts.js"); + } + + @Test + public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); + testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}"); + + final Map attributes = new HashMap<>(); + attributes.put("contentUrl", "https://example.com/a/b/c/Weather.html"); + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath(), attributes); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); + 
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + + List ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); + ffs.get(0).assertContentEquals("https://example.com/a/b/c/js/scripts.js"); + } + + @Test + public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() throws Exception { + testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); + testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); + testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); + testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); + // Expression Language returns empty string because flow-file doesn't have contentUrl attribute. + testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}"); + + testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1); + testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); + testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); + } + @Test public void testExtractTextFromElement() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);