NIFI-3101: This closes #1271. Improve Get/Modify/PutHTMLElement URL

- Added detailed description about how the URL property works with
  GetHTMLElement
- Added Expression Language support to the URL property
- Made the URL property dynamic in ModifyHTMLElement and PutHTMLElement,
  since it is not used when altering HTML elements and need not be
  specified. Making it a dynamic property lets existing processor configurations stay valid
This commit is contained in:
Koji Kawamura 2016-11-25 17:58:32 +09:00 committed by joewitt
parent 3dc7a160ce
commit 4bf267c8bb
5 changed files with 121 additions and 5 deletions

View File

@ -63,9 +63,12 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
/**
 * Property holding the base URL of the HTML page being parsed.
 * It is declared as required, validated as non-empty, and supports Expression Language.
 */
public static final PropertyDescriptor URL = new PropertyDescriptor
.Builder().name("URL")
// NOTE(review): two consecutive description(...) calls follow; this looks like the removed
// and the added line of a diff shown together -- confirm which one the real file keeps.
.description("Base URL for the HTML page being parsed.")
.description("Base URL for the HTML page being parsed." +
" This URL will be used to resolve an absolute URL" +
" when an attribute value is extracted from a HTML element.")
.required(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
@ -120,11 +123,20 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
session.read(inputFlowFile, new InputStreamCallback() {
@Override
public void process(InputStream inputStream) throws IOException {
final String baseUrl = getBaseUrl(inputFlowFile, context);
if (baseUrl == null || baseUrl.isEmpty()) {
throw new RuntimeException("Base URL was empty.");
}
doc.set(Jsoup.parse(inputStream,
context.getProperty(HTML_CHARSET).getValue(),
context.getProperty(URL).getValue()));
baseUrl));
}
});
return doc.get();
}
/**
 * Supplies the base URL handed to the HTML parser for the given FlowFile.
 * This default implementation ignores both arguments and always answers with a
 * fixed localhost URL; subclasses may override it to provide a different value.
 *
 * @param inputFlowFile the FlowFile whose content is being parsed (unused here)
 * @param context the process context (unused here)
 * @return the constant {@code "http://localhost/"}
 */
protected String getBaseUrl(final FlowFile inputFlowFile, final ProcessContext context) {
    final String fallbackBaseUrl = "http://localhost/";
    return fallbackBaseUrl;
}
}

View File

@ -92,7 +92,9 @@ public class GetHTMLElement
.Builder().name("Attribute Name")
.description(("When getting the value of a HTML element attribute this value is used as the key to determine" +
" which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" +
" is set to \"" + ELEMENT_ATTRIBUTE + "\""))
" is set to \"" + ELEMENT_ATTRIBUTE + "\"." +
" If this value is prefixed with 'abs:', then the extracted attribute value will be converted into" +
" an absolute URL form using the specified base URL."))
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
@ -238,4 +240,8 @@ public class GetHTMLElement
}
}
/**
 * Reads the base URL from the URL property, evaluating any Expression Language
 * in it against the attributes of the incoming FlowFile.
 *
 * @param inputFlowFile the FlowFile whose attributes feed the expression evaluation
 * @param context the process context the URL property is read from
 * @return the evaluated value of the URL property
 */
@Override
protected String getBaseUrl(FlowFile inputFlowFile, ProcessContext context) {
    final String resolvedUrl = context.getProperty(URL)
            .evaluateAttributeExpressions(inputFlowFile)
            .getValue();
    return resolvedUrl;
}
}

View File

@ -98,7 +98,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
@Override
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> descriptors = new ArrayList<>();
descriptors.add(URL);
descriptors.add(CSS_SELECTOR);
descriptors.add(HTML_CHARSET);
descriptors.add(OUTPUT_TYPE);
@ -124,6 +123,16 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
return descriptors;
}
/**
 * This processor used to expose URL as a regular property, but it has been removed
 * since it is not required when altering HTML elements.
 * URL is still accepted as a dynamic property so that existing data flows which
 * set it can stay in a valid state without modification.
 *
 * @param propertyDescriptorName name of the dynamic property being looked up
 * @return the URL descriptor when the name matches it; {@code null} otherwise, so that
 *         unrelated or misspelled property names are not silently accepted as "URL"
 */
@Override
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
    // Only the legacy URL property is tolerated; every other name stays unsupported.
    return URL.getName().equals(propertyDescriptorName) ? URL : null;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final FlowFile flowFile = session.get();

View File

@ -88,7 +88,6 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
@Override
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
descriptors.add(URL);
descriptors.add(CSS_SELECTOR);
descriptors.add(HTML_CHARSET);
descriptors.add(PUT_LOCATION_TYPE);
@ -113,6 +112,16 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
return descriptors;
}
/**
 * This processor used to expose URL as a regular property, but it has been removed
 * since it is not required when altering HTML elements.
 * URL is still accepted as a dynamic property so that existing data flows which
 * set it can stay in a valid state without modification.
 *
 * @param propertyDescriptorName name of the dynamic property being looked up
 * @return the URL descriptor when the name matches it; {@code null} otherwise, so that
 *         unrelated or misspelled property names are not silently accepted as "URL"
 */
@Override
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
    // Only the legacy URL property is tolerated; every other name stays unsupported.
    return URL.getName().equals(propertyDescriptorName) ? URL : null;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final FlowFile flowFile = session.get();

View File

@ -19,7 +19,9 @@ package org.apache.nifi;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
@ -232,6 +234,84 @@ public class TestGetHTMLElement extends AbstractHTMLTest {
ffs.get(0).assertContentEquals(AUTHOR_NAME);
}
/**
 * Extracts the plain "src" attribute of the selected script element and expects
 * the relative value "js/scripts.js" as the content of the success FlowFile.
 */
@Test
public void testExtractAttributeFromElementRelativeUrl() throws Exception {
    // Configure the processor to write the "src" attribute of <script> into the content.
    testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
    testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
    testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
    testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src");

    final File weatherPage = new File("src/test/resources/Weather.html");
    testRunner.enqueue(weatherPage.toPath());
    testRunner.run();

    // One extracted FlowFile plus the original; nothing invalid or not-found.
    testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);

    final List<MockFlowFile> successFlowFiles =
            testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
    final MockFlowFile extracted = successFlowFiles.get(0);
    extracted.assertContentEquals("js/scripts.js");
}
/**
 * Extracts the "abs:src" attribute of the selected script element and expects the
 * absolute form "http://localhost/js/scripts.js" as the content of the success FlowFile.
 */
@Test
public void testExtractAttributeFromElementAbsoluteUrl() throws Exception {
    // Same setup as the relative case, but the attribute key carries the "abs:" prefix.
    testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
    testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
    testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
    testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");

    final File weatherPage = new File("src/test/resources/Weather.html");
    testRunner.enqueue(weatherPage.toPath());
    testRunner.run();

    // One extracted FlowFile plus the original; nothing invalid or not-found.
    testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);

    final List<MockFlowFile> successFlowFiles =
            testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
    final MockFlowFile extracted = successFlowFiles.get(0);
    extracted.assertContentEquals("http://localhost/js/scripts.js");
}
/**
 * Sets the URL property to an Expression Language reference that is resolved from a
 * FlowFile attribute, and expects the extracted "abs:src" value to be made absolute
 * against that attribute's URL.
 */
@Test
public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws Exception {
    testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
    testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
    testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
    testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
    // The base URL comes from the "contentUrl" attribute of the FlowFile.
    testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");

    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("contentUrl", "https://example.com/a/b/c/Weather.html");

    final File weatherPage = new File("src/test/resources/Weather.html");
    testRunner.enqueue(weatherPage.toPath(), flowFileAttributes);
    testRunner.run();

    // One extracted FlowFile plus the original; nothing invalid or not-found.
    testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);

    final List<MockFlowFile> successFlowFiles =
            testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
    final MockFlowFile extracted = successFlowFiles.get(0);
    extracted.assertContentEquals("https://example.com/a/b/c/js/scripts.js");
}
/**
 * Sets the URL property to an Expression Language reference whose attribute is absent
 * from the FlowFile, and expects the FlowFile to be routed to the invalid-HTML
 * relationship only.
 */
@Test
public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() throws Exception {
    testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
    testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
    testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
    testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
    // The FlowFile carries no "contentUrl" attribute, so the expression yields an empty string.
    testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");

    final File weatherPage = new File("src/test/resources/Weather.html");
    testRunner.enqueue(weatherPage.toPath());
    testRunner.run();

    // Only the invalid-HTML relationship receives a FlowFile.
    testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0);
    testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1);
    testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
    testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
}
@Test
public void testExtractTextFromElement() throws Exception {
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);