mirror of https://github.com/apache/nifi.git
NIFI-3101: This closes #1271. Improve Get/Modify/PutHTMLElement URL
- Added a detailed description of how the URL property works with GetHTMLElement - Added Expression Language support for the URL property - Made the URL property dynamic in ModifyHTMLElement and PutHTMLElement, since it is not used when altering HTML elements and need not be specified. Making it a dynamic property lets existing processor configurations stay valid
This commit is contained in:
parent
3dc7a160ce
commit
4bf267c8bb
|
@ -63,9 +63,12 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
|
||||||
|
|
||||||
public static final PropertyDescriptor URL = new PropertyDescriptor
|
public static final PropertyDescriptor URL = new PropertyDescriptor
|
||||||
.Builder().name("URL")
|
.Builder().name("URL")
|
||||||
.description("Base URL for the HTML page being parsed.")
|
.description("Base URL for the HTML page being parsed." +
|
||||||
|
" This URL will be used to resolve an absolute URL" +
|
||||||
|
" when an attribute value is extracted from a HTML element.")
|
||||||
.required(true)
|
.required(true)
|
||||||
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.expressionLanguageSupported(true)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
|
public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor
|
||||||
|
@ -120,11 +123,20 @@ public abstract class AbstractHTMLProcessor extends AbstractProcessor {
|
||||||
session.read(inputFlowFile, new InputStreamCallback() {
|
session.read(inputFlowFile, new InputStreamCallback() {
|
||||||
@Override
|
@Override
|
||||||
public void process(InputStream inputStream) throws IOException {
|
public void process(InputStream inputStream) throws IOException {
|
||||||
|
final String baseUrl = getBaseUrl(inputFlowFile, context);
|
||||||
|
if (baseUrl == null || baseUrl.isEmpty()) {
|
||||||
|
throw new RuntimeException("Base URL was empty.");
|
||||||
|
}
|
||||||
doc.set(Jsoup.parse(inputStream,
|
doc.set(Jsoup.parse(inputStream,
|
||||||
context.getProperty(HTML_CHARSET).getValue(),
|
context.getProperty(HTML_CHARSET).getValue(),
|
||||||
context.getProperty(URL).getValue()));
|
baseUrl));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return doc.get();
|
return doc.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected String getBaseUrl(final FlowFile inputFlowFile, final ProcessContext context) {
|
||||||
|
return "http://localhost/";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,7 +92,9 @@ public class GetHTMLElement
|
||||||
.Builder().name("Attribute Name")
|
.Builder().name("Attribute Name")
|
||||||
.description(("When getting the value of a HTML element attribute this value is used as the key to determine" +
|
.description(("When getting the value of a HTML element attribute this value is used as the key to determine" +
|
||||||
" which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" +
|
" which attribute on the selected element should be retrieved. This value is used when the \"Output Type\"" +
|
||||||
" is set to \"" + ELEMENT_ATTRIBUTE + "\""))
|
" is set to \"" + ELEMENT_ATTRIBUTE + "\"." +
|
||||||
|
" If this value is prefixed with 'abs:', then the extracted attribute value will be converted into" +
|
||||||
|
" an absolute URL form using the specified base URL."))
|
||||||
.required(false)
|
.required(false)
|
||||||
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
.expressionLanguageSupported(true)
|
.expressionLanguageSupported(true)
|
||||||
|
@ -238,4 +240,8 @@ public class GetHTMLElement
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String getBaseUrl(FlowFile inputFlowFile, ProcessContext context) {
|
||||||
|
return context.getProperty(URL).evaluateAttributeExpressions(inputFlowFile).getValue();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -98,7 +98,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
|
||||||
@Override
|
@Override
|
||||||
protected void init(final ProcessorInitializationContext context) {
|
protected void init(final ProcessorInitializationContext context) {
|
||||||
final List<PropertyDescriptor> descriptors = new ArrayList<>();
|
final List<PropertyDescriptor> descriptors = new ArrayList<>();
|
||||||
descriptors.add(URL);
|
|
||||||
descriptors.add(CSS_SELECTOR);
|
descriptors.add(CSS_SELECTOR);
|
||||||
descriptors.add(HTML_CHARSET);
|
descriptors.add(HTML_CHARSET);
|
||||||
descriptors.add(OUTPUT_TYPE);
|
descriptors.add(OUTPUT_TYPE);
|
||||||
|
@ -124,6 +123,16 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
|
||||||
return descriptors;
|
return descriptors;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This processor used to support URL property, but it has been removed
|
||||||
|
* since it's not required when altering HTML elements.
|
||||||
|
* Support URL as dynamic property so that existing data flow can stay in valid state without modification.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
|
||||||
|
return URL;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||||
final FlowFile flowFile = session.get();
|
final FlowFile flowFile = session.get();
|
||||||
|
|
|
@ -88,7 +88,6 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
|
||||||
@Override
|
@Override
|
||||||
protected void init(final ProcessorInitializationContext context) {
|
protected void init(final ProcessorInitializationContext context) {
|
||||||
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
|
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
|
||||||
descriptors.add(URL);
|
|
||||||
descriptors.add(CSS_SELECTOR);
|
descriptors.add(CSS_SELECTOR);
|
||||||
descriptors.add(HTML_CHARSET);
|
descriptors.add(HTML_CHARSET);
|
||||||
descriptors.add(PUT_LOCATION_TYPE);
|
descriptors.add(PUT_LOCATION_TYPE);
|
||||||
|
@ -113,6 +112,16 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
|
||||||
return descriptors;
|
return descriptors;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This processor used to support URL property, but it has been removed
|
||||||
|
* since it's not required when altering HTML elements.
|
||||||
|
* Support URL as dynamic property so that existing data flow can stay in valid state without modification.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
|
||||||
|
return URL;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||||
final FlowFile flowFile = session.get();
|
final FlowFile flowFile = session.get();
|
||||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.nifi;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.nifi.util.MockFlowFile;
|
import org.apache.nifi.util.MockFlowFile;
|
||||||
import org.apache.nifi.util.TestRunner;
|
import org.apache.nifi.util.TestRunner;
|
||||||
|
@ -232,6 +234,84 @@ public class TestGetHTMLElement extends AbstractHTMLTest {
|
||||||
ffs.get(0).assertContentEquals(AUTHOR_NAME);
|
ffs.get(0).assertContentEquals(AUTHOR_NAME);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExtractAttributeFromElementRelativeUrl() throws Exception {
|
||||||
|
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||||
|
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||||
|
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||||
|
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src");
|
||||||
|
|
||||||
|
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||||
|
ffs.get(0).assertContentEquals("js/scripts.js");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExtractAttributeFromElementAbsoluteUrl() throws Exception {
|
||||||
|
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||||
|
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||||
|
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||||
|
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||||
|
|
||||||
|
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||||
|
ffs.get(0).assertContentEquals("http://localhost/js/scripts.js");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws Exception {
|
||||||
|
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||||
|
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||||
|
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||||
|
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||||
|
testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
|
||||||
|
|
||||||
|
final Map<String, String> attributes = new HashMap<>();
|
||||||
|
attributes.put("contentUrl", "https://example.com/a/b/c/Weather.html");
|
||||||
|
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath(), attributes);
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
|
||||||
|
ffs.get(0).assertContentEquals("https://example.com/a/b/c/js/scripts.js");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() throws Exception {
|
||||||
|
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
|
||||||
|
testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT);
|
||||||
|
testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE);
|
||||||
|
testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
|
||||||
|
// Expression Language returns empty string because flow-file doesn't have contentUrl attribute.
|
||||||
|
testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
|
||||||
|
|
||||||
|
testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
|
||||||
|
testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractTextFromElement() throws Exception {
|
public void testExtractTextFromElement() throws Exception {
|
||||||
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
|
testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
|
||||||
|
|
Loading…
Reference in New Issue