From ddf531faa7b99d06eae049fd6058fe72e32d4aaa Mon Sep 17 00:00:00 2001 From: Kai Yuan Date: Thu, 11 Nov 2021 23:19:32 +0100 Subject: [PATCH] BAEL-5194 rm html tags (#11404) --- xml/pom.xml | 19 ++++++ .../delhtmltags/RemoveHtmlTagsLiveTest.java | 61 +++++++++++++++++++ .../xmlhtml/delhtmltags/example1.html | 15 +++++ .../xmlhtml/delhtmltags/example2.html | 22 +++++++ 4 files changed, 117 insertions(+) create mode 100644 xml/src/test/java/com/baeldung/xmlhtml/delhtmltags/RemoveHtmlTagsLiveTest.java create mode 100644 xml/src/test/resources/xmlhtml/delhtmltags/example1.html create mode 100644 xml/src/test/resources/xmlhtml/delhtmltags/example2.html diff --git a/xml/pom.xml b/xml/pom.xml index d8aa77583b..d05f2401e3 100644 --- a/xml/pom.xml +++ b/xml/pom.xml @@ -14,6 +14,22 @@ + + org.jsoup + jsoup + ${jsoup.version} + + + net.sourceforge.htmlcleaner + htmlcleaner + ${htmlcleaner.version} + + + net.htmlparser.jericho + jericho-html + ${jericho.version} + + org.dom4j @@ -361,6 +377,9 @@ 1.3.1 3.8.0 + 1.14.3 + 2.25 + 3.4 \ No newline at end of file diff --git a/xml/src/test/java/com/baeldung/xmlhtml/delhtmltags/RemoveHtmlTagsLiveTest.java b/xml/src/test/java/com/baeldung/xmlhtml/delhtmltags/RemoveHtmlTagsLiveTest.java new file mode 100644 index 0000000000..b05123cbdc --- /dev/null +++ b/xml/src/test/java/com/baeldung/xmlhtml/delhtmltags/RemoveHtmlTagsLiveTest.java @@ -0,0 +1,61 @@ +package com.baeldung.xmlhtml.delhtmltags; + +import net.htmlparser.jericho.Renderer; +import net.htmlparser.jericho.Segment; +import net.htmlparser.jericho.Source; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.HtmlCleaner; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Paths; + +class RemoveHtmlTagsLiveTest { + + @Test + void givenHtml1_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException { + String html = new String(Files.readAllBytes( + (Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example1.html").toURI())))); + String result = html.replaceAll("<[^>]*>", "") + .replaceAll("(?m)^\\s*$", ""); // remove empty and blank lines + System.out.println(result); + } + + @Test + void givenHtml2_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException { + String html = new String(Files.readAllBytes( + (Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI())))); + String result = html.replaceAll("<[^>]*>", ""); + System.out.println(result); + } + + @Test + void givenHtml2_whenRemoveTagsByJsoup_thenPrintText() throws IOException, URISyntaxException { + String html = new String(Files.readAllBytes( + (Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI())))); + System.out.println(Jsoup.parse(html).text()); + } + + @Test + void givenHtml2_whenRemoveTagsByHtmlCleaner_thenPrintText() throws IOException, URISyntaxException { + String html = new String(Files.readAllBytes( + (Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI())))); + CleanerProperties props = new CleanerProperties(); + props.setPruneTags("script"); + String result = new HtmlCleaner(props).clean(html).getText().toString(); + System.out.println(result); + } + + @Test + void givenHtml2_whenRemoveTagsByJericho_thenPrintText() throws IOException, URISyntaxException { + String html = new String(Files.readAllBytes( + (Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI())))); + Source htmlSource = new Source(html); + Segment segment = new Segment(htmlSource, 0, htmlSource.length()); + Renderer htmlRender = new Renderer(segment).setIncludeHyperlinkURLs(true); + System.out.println(htmlRender); + } +} diff --git a/xml/src/test/resources/xmlhtml/delhtmltags/example1.html b/xml/src/test/resources/xmlhtml/delhtmltags/example1.html new file mode 100644 index 0000000000..43f3a22ef4 --- /dev/null +++ b/xml/src/test/resources/xmlhtml/delhtmltags/example1.html @@ -0,0 +1,15 @@ + + + + This is the page title + + +

+ If the application X doesn't start, the possible causes could be:
+ 1. Maven is not installed.
+ 2. Not enough disk space.
+ 3. Not enough memory. +

+ + \ No newline at end of file diff --git a/xml/src/test/resources/xmlhtml/delhtmltags/example2.html b/xml/src/test/resources/xmlhtml/delhtmltags/example2.html new file mode 100644 index 0000000000..532eadc101 --- /dev/null +++ b/xml/src/test/resources/xmlhtml/delhtmltags/example2.html @@ -0,0 +1,22 @@ + + + + This is the page title + + + +

+ If the application X doesn't start, the possible causes could be:
+ 1. + Maven + is not installed.
+ 2. Not enough (<1G) disk space.
+ 3. Not enough (<64MB) memory.
+

+ + \ No newline at end of file