Initial commit of Jsoup example (#913)

* Initial commit of Jsoup example

* Initial commit of Jsoup example

* Fix the requested changes
This commit is contained in:
Luís Soares 2016-12-25 14:11:27 +00:00 committed by KevinGilmore
parent 428bfcc8f2
commit c271755048
3 changed files with 123 additions and 4 deletions

29
jsoup/pom.xml Normal file
View File

@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the "jsoup" example module.
  The module inherits from com.baeldung:parent-modules and is packaged as a plain jar.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Parent aggregator; groupId and version of this module are inherited from it. -->
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>jsoup</artifactId>
<packaging>jar</packaging>
<dependencies>
<!-- jsoup: used by JsoupExample to fetch, query and modify HTML documents. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- commons-io: supplies FileUtils, used by JsoupExample to write the resulting HTML to disk. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
</dependencies>
<properties>
<!-- Compile as Java 8; the example code uses a lambda expression. -->
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- Dependency versions referenced by the <dependency> entries above. -->
<commons.io.version>2.5</commons.io.version>
<jsoup.version>1.10.1</jsoup.version>
</properties>
</project>

View File

@ -0,0 +1,89 @@
package com.baeldung.jsoup;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
/**
 * Walk-through of the jsoup API against the Spring blog home page.
 *
 * <p>The example loads the page, shows several ways of configuring the HTTP connection,
 * selects and traverses elements, reads and modifies attributes/text/HTML, and finally
 * writes the modified document to a local file.
 *
 * <p>Most local variables are intentionally unused: each one exists only to demonstrate
 * a particular jsoup call.
 */
public class JsoupExample {

    /**
     * Runs the example.
     *
     * @param args ignored
     * @throws IOException if a page cannot be fetched or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        scrapeSpringBlog();
    }

    /**
     * Fetches {@code https://spring.io/blog}, demonstrates selection, traversal and
     * modification of its DOM, and writes the result to {@code /tmp/spring_blog_home.html}.
     *
     * <p>NOTE(review): the method assumes the page contains at least two {@code section}
     * elements and at least one {@code article} with a {@code time} and a
     * {@code section div} inside it; if the page layout differs, the element lookups
     * return {@code null} (or the index lookup fails) and the method aborts with a
     * runtime exception.
     *
     * @throws IOException if a request fails (other than the deliberately provoked
     *                     HTTP status error) or the output file cannot be written
     */
    static void scrapeSpringBlog() throws IOException {
        String blogUrl = "https://spring.io/blog";

        // --- Loading -------------------------------------------------------------
        Document doc = Jsoup.connect(blogUrl).get();

        // Requesting a page that is expected not to exist: the HTTP error status is
        // reported as an HttpStatusException, whose message is printed and then ignored.
        try {
            Jsoup.connect("https://spring.io/will-not-be-found").get();
        } catch (HttpStatusException ex) {
            System.out.println(ex.getMessage());
        }

        // --- Customising the connection -----------------------------------------
        // Each option is shown on its own first, then all of them combined.
        Document docCustomConn = Jsoup.connect(blogUrl).userAgent("Mozilla").get();
        docCustomConn = Jsoup.connect(blogUrl).timeout(5000).get();
        docCustomConn = Jsoup.connect(blogUrl).cookie("cookiename", "val234").get();
        // POST with form data (shown for reference only, not executed):
        // docCustomConn = Jsoup.connect(blogUrl).data("datakey", "datavalue").post();
        docCustomConn = Jsoup.connect(blogUrl).header("headersecurity", "xyz123").get();
        docCustomConn = Jsoup.connect(blogUrl)
            .userAgent("Mozilla")
            .timeout(5000)
            .cookie("cookiename", "val234")
            .cookie("anothercookie", "ilovejsoup")
            .header("headersecurity", "xyz123")
            .get();

        // --- Selecting -----------------------------------------------------------
        // CSS-style selectors: by tag, by class, by id, descendant and direct child.
        Elements links = doc.select("a");
        Elements sections = doc.select("section");
        Elements logo = doc.select(".spring-logo--container");
        Elements pagination = doc.select("#pagination_control");
        Elements divsDescendant = doc.select("header div");
        Elements divsDirect = doc.select("header > div");

        // DOM-style lookups.
        Element pag = doc.getElementById("pagination_control");
        Elements desktopOnly = doc.getElementsByClass("desktopOnly");

        // --- Traversing ----------------------------------------------------------
        Element firstSection = sections.first();
        Element lastSection = sections.last();
        // Elements is a zero-based list, so the second section lives at index 1.
        Element secondSection = sections.get(1);
        Elements allParents = firstSection.parents();
        Element parent = firstSection.parent();
        Elements children = firstSection.children();
        Elements siblings = firstSection.siblingElements();

        sections.forEach(el -> System.out.println("section: " + el));

        // Selectors can also be applied relative to an element.
        Elements sectionParagraphs = firstSection.select(".paragraph");

        // --- Extracting ----------------------------------------------------------
        Element firstArticle = doc.select("article").first();
        Element timeElement = firstArticle.select("time").first();
        String dateTimeOfFirstArticle = timeElement.attr("datetime");
        Element sectionDiv = firstArticle.select("section div").first();
        String sectionDivText = sectionDiv.text();
        String articleHtml = firstArticle.html();
        String outerHtml = firstArticle.outerHtml();

        // --- Modifying -----------------------------------------------------------
        timeElement.attr("datetime", "2016-12-16 15:19:54.3");
        sectionDiv.text("foo bar");
        firstArticle.select("h2").html("<div><span></span></div>");

        Element link = new Element(Tag.valueOf("a"), "")
            .text("Checkout this amazing website!")
            .attr("href", "http://baeldung.com")
            .attr("target", "_blank");
        firstArticle.appendChild(link);

        doc.select("li.navbar-link").remove();
        firstArticle.select("img").remove();

        // --- Persisting ----------------------------------------------------------
        // Write the modified document using the charset jsoup reports for it.
        File indexFile = new File("/tmp", "spring_blog_home.html");
        FileUtils.writeStringToFile(indexFile, doc.html(), doc.charset());
    }
}

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
@ -64,11 +64,12 @@
<module>jsf</module>
<module>json-path</module>
<module>json</module>
<module>jsoup</module>
<module>junit5</module>
<module>log-mdc</module>
<module>log4j</module>
<module>log4j2</module>
<module>log4j2</module>
<module>lombok</module>
<module>mapstruct</module>
@ -119,8 +120,8 @@
<module>spring-jpa</module>
<module>spring-katharsis</module>
<module>spring-mockito</module>
<module>spring-mvc-email</module>
<module>spring-mvc-forms</module>
<module>spring-mvc-email</module>
<module>spring-mvc-forms</module>
<module>spring-mvc-java</module>
<module>spring-mvc-no-xml</module>
<module>spring-mvc-tiles</module>