mirror of https://github.com/apache/nifi.git
NIFI-3101: This closes #1271. Improve Get/Modify/PutHTMLElement URL
- Added a detailed description of how the URL property works with GetHTMLElement - Added Expression Language support for URL - Made URL a dynamic property in ModifyHTMLElement and PutHTMLElement, since it is not used to alter HTML elements and need not be specified. Making it a dynamic property lets existing processor configurations stay valid
This commit is contained in:
parent
3dc7a160ce
commit
4bf267c8bb
|
@ -63,9 +63,12 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
|
|||
|
||||
public static final PropertyDescriptor URL = new PropertyDescriptor
|
||||
.Builder().name("URL")
|
||||
.description("Base URL for the HTML page being parsed.")
|
||||
.description("Base URL for the HTML page being parsed." +
|
||||
" This URL will be used to resolve an absolute URL" +
|
||||
" when an attribute value is extracted from a HTML element.")
|
||||
.required(true)
|
||||
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||
.expressionLanguageSupported(true)
|
||||
.build();
|
||||
|
||||
public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
|
||||
|
@ -120,11 +123,20 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
|
|||
session.read(inputFlowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(InputStream inputStream) throws IOException {
|
||||
final String baseUrl = getBaseUrl(inputFlowFile, context);
|
||||
if (baseUrl == null || baseUrl.isEmpty()) {
|
||||
throw new RuntimeException("Base URL was empty.");
|
||||
}
|
||||
doc.set(Jsoup.parse(inputStream,
|
||||
context.getProperty(HTML_CHARSET).getValue(),
|
||||
context.getProperty(URL).getValue()));
|
||||
baseUrl));
|
||||
}
|
||||
});
|
||||
return doc.get();
|
||||
}
|
||||
|
||||
|
||||
protected String getBaseUrl(final FlowFile inputFlowFile, final ProcessContext context) {
|
||||
return "http://localhost/";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -92,7 +92,9 @@ public class GetHTMLElement
|
|||
.Builder().name("Attribute Name")
|
||||
.description(("When getting the value of a HTML element attribute this value is used as the key to determine" +
|
||||
" which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" +
|
||||
" is set to \"" + ELEMENT_ATTRIBUTE + "\""))
|
||||
" is set to \"" + ELEMENT_ATTRIBUTE + "\"." +
|
||||
" If this value is prefixed with 'abs:', then the extracted attribute value will be converted into" +
|
||||
" an absolute URL form using the specified base URL."))
|
||||
.required(false)
|
||||
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||
.expressionLanguageSupported(true)
|
||||
|
@ -238,4 +240,8 @@ public class GetHTMLElement
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getBaseUrl(FlowFile inputFlowFile, ProcessContext context) {
|
||||
return context.getProperty(URL).evaluateAttributeExpressions(inputFlowFile).getValue();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -98,7 +98,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
|
|||
@Override
|
||||
protected void init(final ProcessorInitializationContext context) {
|
||||
final List<PropertyDescriptor> descriptors = new ArrayList<>();
|
||||
descriptors.add(URL);
|
||||
descriptors.add(CSS_SELECTOR);
|
||||
descriptors.add(HTML_CHARSET);
|
||||
descriptors.add(OUTPUT_TYPE);
|
||||
|
@ -124,6 +123,16 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
|
|||
return descriptors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This processor used to support URL property, but it has been removed
|
||||
* since it's not required when altering HTML elements.
|
||||
* Support URL as dynamic property so that existing data flow can stay in valid state without modification.
|
||||
*/
|
||||
@Override
|
||||
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
|
||||
return URL;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||
final FlowFile flowFile = session.get();
|
||||
|
|
|
@ -88,7 +88,6 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
|
|||
@Override
|
||||
protected void init(final ProcessorInitializationContext context) {
|
||||
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
|
||||
descriptors.add(URL);
|
||||
descriptors.add(CSS_SELECTOR);
|
||||
descriptors.add(HTML_CHARSET);
|
||||
descriptors.add(PUT_LOCATION_TYPE);
|
||||
|
@ -113,6 +112,16 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
|
|||
return descriptors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This processor used to support URL property, but it has been removed
|
||||
* since it's not required when altering HTML elements.
|
||||
* Support URL as dynamic property so that existing data flow can stay in valid state without modification.
|
||||
*/
|
||||
@Override
|
||||
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
|
||||
return URL;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||
final FlowFile flowFile = session.get();
|
||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.nifi;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.nifi.util.MockFlowFile;
|
||||
import org.apache.nifi.util.TestRunner;
|
||||
|
@ -232,6 +234,84 @@ public class TestGetHTMLElement extends AbstractHTMLTest {
|
|||
ffs.get(0).assertContentEquals(AUTHOR_NAME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractAttributeFromElementRelativeUrl() throws Exception {
|
||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src");
|
||||
|
||||
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||
testRunner.run();
|
||||
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||
|
||||
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||
ffs.get(0).assertContentEquals("js/scripts.js");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractAttributeFromElementAbsoluteUrl() throws Exception {
|
||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||
|
||||
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||
testRunner.run();
|
||||
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||
|
||||
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||
ffs.get(0).assertContentEquals("http://localhost/js/scripts.js");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws Exception {
|
||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||
testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
|
||||
|
||||
final Map<String, String> attributes = new HashMap<>();
|
||||
attributes.put("contentUrl", "https://example.com/a/b/c/Weather.html");
|
||||
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath(), attributes);
|
||||
testRunner.run();
|
||||
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||
|
||||
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||
ffs.get(0).assertContentEquals("https://example.com/a/b/c/js/scripts.js");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() throws Exception {
|
||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||
// Expression Language returns empty string because flow-file doesn't have contentUrl attribute.
|
||||
testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
|
||||
|
||||
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||
testRunner.run();
|
||||
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
|
||||
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractTextFromElement() throws Exception {
|
||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
|
||||
|
|
Loading…
Reference in New Issue