BAEL-5194 rm html tags (#11404)

This commit is contained in:
Kai Yuan 2021-11-11 23:19:32 +01:00 committed by GitHub
parent 7f97bc1b98
commit ddf531faa7
4 changed files with 117 additions and 0 deletions

View File

@ -14,6 +14,22 @@
</parent>
<dependencies>
<!-- html libraries: jsoup, HtmlCleaner and Jericho; versions are defined in <properties> -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>${htmlcleaner.version}</version>
</dependency>
<dependency>
<groupId>net.htmlparser.jericho</groupId>
<artifactId>jericho-html</artifactId>
<version>${jericho.version}</version>
</dependency>
<!-- xml libraries -->
<dependency>
<groupId>org.dom4j</groupId>
@ -361,6 +377,9 @@
<!-- maven plugins -->
<maven-jibx-plugin.version>1.3.1</maven-jibx-plugin.version>
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
<!-- html libraries (these are dependency versions, not maven plugins) -->
<jsoup.version>1.14.3</jsoup.version>
<htmlcleaner.version>2.25</htmlcleaner.version>
<jericho.version>3.4</jericho.version>
</properties>
</project>

View File

@ -0,0 +1,61 @@
package com.baeldung.xmlhtml.delhtmltags;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
/**
 * Live tests that show several ways of removing HTML tags from a document: a plain regular
 * expression, Jsoup, HtmlCleaner and Jericho.
 *
 * <p>None of the tests assert anything; each one prints the extracted text to standard output
 * so the results of the different approaches can be compared by eye.
 */
class RemoveHtmlTagsLiveTest {

    private static final String EXAMPLE_1 = "/xmlhtml/delhtmltags/example1.html";
    private static final String EXAMPLE_2 = "/xmlhtml/delhtmltags/example2.html";

    /** Strips the tags of example1.html with a regular expression and prints the remaining text. */
    @Test
    void givenHtml1_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
        String html = readResource(EXAMPLE_1);
        String result = html.replaceAll("<[^>]*>", "")
            .replaceAll("(?m)^\\s*$", ""); // remove empty and blank lines
        System.out.println(result);
    }

    /**
     * Applies the same tag-stripping regular expression to example2.html and prints the result.
     *
     * <p>That document contains literal {@code <} characters inside its text (for example
     * {@code (<1G)}); the pattern matches from such a character up to the next {@code >}, so
     * more than just the markup is removed.
     */
    @Test
    void givenHtml2_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
        String html = readResource(EXAMPLE_2);
        String result = html.replaceAll("<[^>]*>", "");
        System.out.println(result);
    }

    /** Parses example2.html with Jsoup and prints the text of the parsed document. */
    @Test
    void givenHtml2_whenRemoveTagsByJsoup_thenPrintText() throws IOException, URISyntaxException {
        String html = readResource(EXAMPLE_2);
        System.out.println(Jsoup.parse(html).text());
    }

    /** Cleans example2.html with HtmlCleaner, configured to prune {@code script} tags, and prints its text. */
    @Test
    void givenHtml2_whenRemoveTagsByHtmlCleaner_thenPrintText() throws IOException, URISyntaxException {
        String html = readResource(EXAMPLE_2);
        CleanerProperties props = new CleanerProperties();
        props.setPruneTags("script");
        String result = new HtmlCleaner(props).clean(html).getText().toString();
        System.out.println(result);
    }

    /** Renders all of example2.html with Jericho's {@code Renderer}, hyperlink URLs included, and prints it. */
    @Test
    void givenHtml2_whenRemoveTagsByJericho_thenPrintText() throws IOException, URISyntaxException {
        String html = readResource(EXAMPLE_2);
        Source htmlSource = new Source(html);
        // A segment spanning the whole source, so the complete document is rendered.
        Segment segment = new Segment(htmlSource, 0, htmlSource.length());
        Renderer htmlRender = new Renderer(segment).setIncludeHyperlinkURLs(true);
        System.out.println(htmlRender);
    }

    /**
     * Reads a classpath resource completely and decodes it as UTF-8.
     *
     * <p>The charset is given explicitly so the decoded text does not depend on the platform
     * default charset of the machine running the tests.
     *
     * @param resourcePath path passed to {@link Class#getResource(String)}
     * @return the content of the resource as a string
     * @throws NullPointerException if no resource exists at {@code resourcePath}
     * @throws IOException if the resource cannot be read
     * @throws URISyntaxException if the resource URL cannot be converted to a URI
     */
    private String readResource(String resourcePath) throws IOException, URISyntaxException {
        byte[] content = Files.readAllBytes(Paths.get(
            Objects.requireNonNull(
                getClass().getResource(resourcePath),
                "classpath resource not found: " + resourcePath)
                .toURI()));
        return new String(content, StandardCharsets.UTF_8);
    }
}

View File

@ -0,0 +1,15 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>This is the page title</title>
</head>
<body>
<p>
If the application X doesn't start, the possible causes could be:<br/>
1. <a href="maven.com">Maven</a> is not installed.<br/>
2. Not enough disk space.<br/>
3. Not enough memory.
</p>
</body>
</html>

View File

@ -0,0 +1,22 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>This is the page title</title>
</head>
<script>
// some interesting script functions
</script>
<body>
<p>
If the application X doesn't start, the possible causes could be:<br/>
1. <a
id="link"
href="http://maven.apache.org/">
Maven
</a> is not installed.<br/>
2. Not enough (<1G) disk space.<br/>
3. Not enough (<64MB) memory.<br/>
</p>
</body>
</html>