BAEL-5194 rm html tags (#11404)
This commit is contained in:
parent
7f97bc1b98
commit
ddf531faa7
19
xml/pom.xml
19
xml/pom.xml
|
@ -14,6 +14,22 @@
|
|||
</parent>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>${jsoup.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>${htmlcleaner.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.htmlparser.jericho</groupId>
|
||||
<artifactId>jericho-html</artifactId>
|
||||
<version>${jericho.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- xml libraries -->
|
||||
<dependency>
|
||||
<groupId>org.dom4j</groupId>
|
||||
|
@ -361,6 +377,9 @@
|
|||
<!-- maven plugins -->
|
||||
<maven-jibx-plugin.version>1.3.1</maven-jibx-plugin.version>
|
||||
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
|
||||
<jsoup.version>1.14.3</jsoup.version>
|
||||
<htmlcleaner.version>2.25</htmlcleaner.version>
|
||||
<jericho.version>3.4</jericho.version>
|
||||
</properties>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,61 @@
|
|||
package com.baeldung.xmlhtml.delhtmltags;
|
||||
|
||||
import net.htmlparser.jericho.Renderer;
|
||||
import net.htmlparser.jericho.Segment;
|
||||
import net.htmlparser.jericho.Source;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
class RemoveHtmlTagsLiveTest {
|
||||
|
||||
@Test
|
||||
void givenHtml1_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
|
||||
String html = new String(Files.readAllBytes(
|
||||
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example1.html").toURI()))));
|
||||
String result = html.replaceAll("<[^>]*>", "")
|
||||
.replaceAll("(?m)^\\s*$", ""); // remove empty and blank lines
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenHtml2_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
|
||||
String html = new String(Files.readAllBytes(
|
||||
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||
String result = html.replaceAll("<[^>]*>", "");
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenHtml2_whenRemoveTagsByJsoup_thenPrintText() throws IOException, URISyntaxException {
|
||||
String html = new String(Files.readAllBytes(
|
||||
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||
System.out.println(Jsoup.parse(html).text());
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenHtml2_whenRemoveTagsByHtmlCleaner_thenPrintText() throws IOException, URISyntaxException {
|
||||
String html = new String(Files.readAllBytes(
|
||||
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setPruneTags("script");
|
||||
String result = new HtmlCleaner(props).clean(html).getText().toString();
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenHtml2_whenRemoveTagsByJericho_thenPrintText() throws IOException, URISyntaxException {
|
||||
String html = new String(Files.readAllBytes(
|
||||
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||
Source htmlSource = new Source(html);
|
||||
Segment segment = new Segment(htmlSource, 0, htmlSource.length());
|
||||
Renderer htmlRender = new Renderer(segment).setIncludeHyperlinkURLs(true);
|
||||
System.out.println(htmlRender);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>This is the page title</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
If the application X doesn't start, the possible causes could be:<br/>
|
||||
1. <a href="maven.com">Maven</a> is not installed.<br/>
|
||||
2. Not enough disk space.<br/>
|
||||
3. Not enough memory.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,22 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>This is the page title</title>
|
||||
</head>
|
||||
<script>
|
||||
// some interesting script functions
|
||||
</script>
|
||||
<body>
|
||||
<p>
|
||||
If the application X doesn't start, the possible causes could be:<br/>
|
||||
1. <a
|
||||
id="link"
|
||||
href="http://maven.apache.org/">
|
||||
Maven
|
||||
</a> is not installed.<br/>
|
||||
2. Not enough (<1G) disk space.<br/>
|
||||
3. Not enough (<64MB) memory.<br/>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue