BAEL-5194 rm html tags (#11404)
This commit is contained in:
parent
7f97bc1b98
commit
ddf531faa7
19
xml/pom.xml
19
xml/pom.xml
|
@ -14,6 +14,22 @@
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>${jsoup.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
|
<artifactId>htmlcleaner</artifactId>
|
||||||
|
<version>${htmlcleaner.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.htmlparser.jericho</groupId>
|
||||||
|
<artifactId>jericho-html</artifactId>
|
||||||
|
<version>${jericho.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- xml libraries -->
|
<!-- xml libraries -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.dom4j</groupId>
|
<groupId>org.dom4j</groupId>
|
||||||
|
@ -361,6 +377,9 @@
|
||||||
<!-- maven plugins -->
|
<!-- maven plugins -->
|
||||||
<maven-jibx-plugin.version>1.3.1</maven-jibx-plugin.version>
|
<maven-jibx-plugin.version>1.3.1</maven-jibx-plugin.version>
|
||||||
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
|
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
|
||||||
|
<jsoup.version>1.14.3</jsoup.version>
|
||||||
|
<htmlcleaner.version>2.25</htmlcleaner.version>
|
||||||
|
<jericho.version>3.4</jericho.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -0,0 +1,61 @@
|
||||||
|
package com.baeldung.xmlhtml.delhtmltags;
|
||||||
|
|
||||||
|
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
class RemoveHtmlTagsLiveTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenHtml1_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
|
||||||
|
String html = new String(Files.readAllBytes(
|
||||||
|
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example1.html").toURI()))));
|
||||||
|
String result = html.replaceAll("<[^>]*>", "")
|
||||||
|
.replaceAll("(?m)^\\s*$", ""); // remove empty and blank lines
|
||||||
|
System.out.println(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenHtml2_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
|
||||||
|
String html = new String(Files.readAllBytes(
|
||||||
|
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||||
|
String result = html.replaceAll("<[^>]*>", "");
|
||||||
|
System.out.println(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenHtml2_whenRemoveTagsByJsoup_thenPrintText() throws IOException, URISyntaxException {
|
||||||
|
String html = new String(Files.readAllBytes(
|
||||||
|
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||||
|
System.out.println(Jsoup.parse(html).text());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenHtml2_whenRemoveTagsByHtmlCleaner_thenPrintText() throws IOException, URISyntaxException {
|
||||||
|
String html = new String(Files.readAllBytes(
|
||||||
|
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||||
|
CleanerProperties props = new CleanerProperties();
|
||||||
|
props.setPruneTags("script");
|
||||||
|
String result = new HtmlCleaner(props).clean(html).getText().toString();
|
||||||
|
System.out.println(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenHtml2_whenRemoveTagsByJericho_thenPrintText() throws IOException, URISyntaxException {
|
||||||
|
String html = new String(Files.readAllBytes(
|
||||||
|
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
|
||||||
|
Source htmlSource = new Source(html);
|
||||||
|
Segment segment = new Segment(htmlSource, 0, htmlSource.length());
|
||||||
|
Renderer htmlRender = new Renderer(segment).setIncludeHyperlinkURLs(true);
|
||||||
|
System.out.println(htmlRender);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>This is the page title</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
If the application X doesn't start, the possible causes could be:<br/>
|
||||||
|
1. <a href="maven.com">Maven</a> is not installed.<br/>
|
||||||
|
2. Not enough disk space.<br/>
|
||||||
|
3. Not enough memory.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>This is the page title</title>
|
||||||
|
</head>
|
||||||
|
<script>
|
||||||
|
// some interesting script functions
|
||||||
|
</script>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
If the application X doesn't start, the possible causes could be:<br/>
|
||||||
|
1. <a
|
||||||
|
id="link"
|
||||||
|
href="http://maven.apache.org/">
|
||||||
|
Maven
|
||||||
|
</a> is not installed.<br/>
|
||||||
|
2. Not enough (<1G) disk space.<br/>
|
||||||
|
3. Not enough (<64MB) memory.<br/>
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in New Issue