Remove Apache Commons IO dependency and split into smaller methods (#942)

* Fix the requested changes

* Split into smaller methods

* Split into smaller methods

* Remove apache dependency and split into smaller methods

* Add unit tests
This commit is contained in:
Luís Soares 2017-01-09 14:06:46 +00:00 committed by KevinGilmore
parent 729c8990d0
commit 6a60defc94
3 changed files with 71 additions and 32 deletions

View File

@ -15,15 +15,16 @@
<version>${jsoup.version}</version> <version>${jsoup.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>junit</groupId>
<artifactId>commons-io</artifactId> <artifactId>junit</artifactId>
<version>${commons.io.version}</version> <version>4.12</version>
<scope>test</scope>
</dependency> </dependency>
</dependencies> </dependencies>
<properties> <properties>
<maven.compiler.source>1.8</maven.compiler.source> <maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target> <maven.compiler.target>1.8</maven.compiler.target>
<commons.io.version>2.5</commons.io.version>
<jsoup.version>1.10.1</jsoup.version> <jsoup.version>1.10.1</jsoup.version>
</properties> </properties>
</project> </project>

View File

@ -1,47 +1,33 @@
package com.baeldung.jsoup; package com.baeldung.jsoup;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag; import org.jsoup.parser.Tag;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
public class JsoupExample { public class JsoupParser {
public static void main(String[] args) throws IOException { Document doc;
scrapeSpringBlog();
public void loadDocument(String blogUrl) throws IOException {
doc = Jsoup.connect(blogUrl).get();
} }
static void scrapeSpringBlog() throws IOException { void loadDocumentCustomized(String blogUrl) throws IOException {
String blogUrl = "https://spring.io/blog"; doc = Jsoup.connect(blogUrl)
Document doc = Jsoup.connect(blogUrl).get();
try {
Document doc404 = Jsoup.connect("https://spring.io/will-not-be-found").get();
} catch (HttpStatusException ex) {
System.out.println(ex.getMessage());
}
Document docCustomConn = Jsoup.connect(blogUrl).userAgent("Mozilla").get();
docCustomConn = Jsoup.connect(blogUrl).timeout(5000).get();
docCustomConn = Jsoup.connect(blogUrl).cookie("cookiename", "val234").get();
// docCustomConn = Jsoup.connect(blogUrl).data("datakey", "datavalue").post();
docCustomConn = Jsoup.connect(blogUrl).header("headersecurity", "xyz123").get();
docCustomConn = Jsoup.connect(blogUrl)
.userAgent("Mozilla") .userAgent("Mozilla")
.timeout(5000) .timeout(5000)
.cookie("cookiename", "val234") .cookie("cookiename", "val234")
.cookie("anothercookie", "ilovejsoup") .cookie("anothercookie", "ilovejsoup")
.referrer("http://google.com")
.header("headersecurity", "xyz123") .header("headersecurity", "xyz123")
.get(); .get();
}
void examplesSelectors() {
Elements links = doc.select("a"); Elements links = doc.select("a");
Elements sections = doc.select("section");
Elements logo = doc.select(".spring-logo--container"); Elements logo = doc.select(".spring-logo--container");
Elements pagination = doc.select("#pagination_control"); Elements pagination = doc.select("#pagination_control");
Elements divsDescendant = doc.select("header div"); Elements divsDescendant = doc.select("header div");
@ -50,6 +36,14 @@ public class JsoupExample {
Element pag = doc.getElementById("pagination_control"); Element pag = doc.getElementById("pagination_control");
Elements desktopOnly = doc.getElementsByClass("desktopOnly"); Elements desktopOnly = doc.getElementsByClass("desktopOnly");
Elements sections = doc.select("section");
Element firstSection = sections.first();
Elements sectionParagraphs = firstSection.select(".paragraph");
}
void examplesTraversing() {
Elements sections = doc.select("section");
Element firstSection = sections.first(); Element firstSection = sections.first();
Element lastSection = sections.last(); Element lastSection = sections.last();
Element secondSection = sections.get(2); Element secondSection = sections.get(2);
@ -59,9 +53,9 @@ public class JsoupExample {
Elements siblings = firstSection.siblingElements(); Elements siblings = firstSection.siblingElements();
sections.stream().forEach(el -> System.out.println("section: " + el)); sections.stream().forEach(el -> System.out.println("section: " + el));
}
Elements sectionParagraphs = firstSection.select(".paragraph"); void examplesExtracting() {
Element firstArticle = doc.select("article").first(); Element firstArticle = doc.select("article").first();
Element timeElement = firstArticle.select("time").first(); Element timeElement = firstArticle.select("time").first();
String dateTimeOfFirstArticle = timeElement.attr("datetime"); String dateTimeOfFirstArticle = timeElement.attr("datetime");
@ -69,7 +63,14 @@ public class JsoupExample {
String sectionDivText = sectionDiv.text(); String sectionDivText = sectionDiv.text();
String articleHtml = firstArticle.html(); String articleHtml = firstArticle.html();
String outerHtml = firstArticle.outerHtml(); String outerHtml = firstArticle.outerHtml();
}
void examplesModifying() {
Element firstArticle = doc.select("article").first();
Element timeElement = firstArticle.select("time").first();
Element sectionDiv = firstArticle.select("section div").first();
String dateTimeOfFirstArticle = timeElement.attr("datetime");
timeElement.attr("datetime", "2016-12-16 15:19:54.3"); timeElement.attr("datetime", "2016-12-16 15:19:54.3");
sectionDiv.text("foo bar"); sectionDiv.text("foo bar");
firstArticle.select("h2").html("<div><span></span></div>"); firstArticle.select("h2").html("<div><span></span></div>");
@ -82,8 +83,9 @@ public class JsoupExample {
doc.select("li.navbar-link").remove(); doc.select("li.navbar-link").remove();
firstArticle.select("img").remove(); firstArticle.select("img").remove();
}
File indexFile = new File("/tmp", "spring_blog_home.html"); String getTidyHtml() {
FileUtils.writeStringToFile(indexFile, doc.html(), doc.charset()); return doc.html();
} }
} }

View File

@ -0,0 +1,36 @@
package com.baeldung.jsoup;
import java.io.IOException;
import org.jsoup.HttpStatusException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
/**
 * Tests for {@link JsoupParser}.
 *
 * <p>Each test asks the parser to load a page from spring.io by URL, so these
 * tests presumably need network access to run.
 */
public class JsoupParserTest {

    JsoupParser jsoupParser;

    /** Creates a fresh parser before every test so no loaded document is shared between tests. */
    @Before
    public void setUp() {
        jsoupParser = new JsoupParser();
    }

    /**
     * Verifies that loading a non-existent page raises an {@link HttpStatusException}
     * carrying status code 404.
     *
     * @throws IOException if loading fails for a reason other than an HTTP error status
     */
    @Test
    public void test404() throws IOException {
        // Track whether the exception was raised: without this, the test would pass
        // vacuously if the request unexpectedly succeeded and the catch block never ran.
        boolean httpErrorRaised = false;
        try {
            jsoupParser.loadDocument("https://spring.io/will-not-be-found");
        } catch (HttpStatusException ex) {
            httpErrorRaised = true;
            assertEquals(404, ex.getStatusCode());
        }
        assertTrue("expected an HttpStatusException for a page that does not exist", httpErrorRaised);
    }

    /**
     * Verifies that, after the modifying examples run on the loaded blog page, the
     * serialized HTML contains the text {@code http://baeldung.com}.
     *
     * @throws IOException if the blog page cannot be loaded
     */
    @Test
    public void testChange() throws IOException {
        jsoupParser.loadDocument("http://spring.io/blog");
        // NOTE(review): presumably examplesModifying() inserts a link to baeldung.com;
        // confirm against JsoupParser, since this test only checks the resulting HTML.
        jsoupParser.examplesModifying();

        assertTrue(jsoupParser.getTidyHtml().contains("http://baeldung.com"));
    }
}