BAEL-2979 Guide to Crawler4j (#7071)

* BAEL-2727 Example Code
* BAEL-2979 Guide to Crawler4j
* BAEL-2979 adjust based on feedback

parent 8d6e70ffee
commit 0ece18c766
pom.xml
@@ -55,6 +55,11 @@
             <artifactId>spring-boot-starter</artifactId>
             <version>${spring-boot-starter.version}</version>
         </dependency>
+        <dependency>
+            <groupId>edu.uci.ics</groupId>
+            <artifactId>crawler4j</artifactId>
+            <version>${crawler4j.version}</version>
+        </dependency>
     </dependencies>
 
     <properties>
@@ -62,6 +67,7 @@
         <classgraph.version>4.8.28</classgraph.version>
         <jbpm.version>6.0.0.Final</jbpm.version>
         <picocli.version>3.9.6</picocli.version>
+        <crawler4j.version>4.4.0</crawler4j.version>
         <spring-boot-starter.version>2.1.4.RELEASE</spring-boot-starter.version>
     </properties>
 </project>
@ -0,0 +1,22 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
public class CrawlerStatistics {
|
||||||
|
private int processedPageCount = 0;
|
||||||
|
private int totalLinksCount = 0;
|
||||||
|
|
||||||
|
public void incrementProcessedPageCount() {
|
||||||
|
processedPageCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementTotalLinksCount(int linksCount) {
|
||||||
|
totalLinksCount += linksCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getProcessedPageCount() {
|
||||||
|
return processedPageCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getTotalLinksCount() {
|
||||||
|
return totalLinksCount;
|
||||||
|
}
|
||||||
|
}
|
|
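Note: the crawler factories further down hand one shared CrawlerStatistics instance to every crawler, and crawler4j runs those crawlers on multiple threads, so these plain int counters can lose increments under contention. A minimal thread-safe sketch, not part of this commit; the class name is hypothetical:

package com.baeldung.crawler4j;

import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical drop-in variant of CrawlerStatistics with atomic counters.
public class AtomicCrawlerStatistics {
    private final AtomicInteger processedPageCount = new AtomicInteger();
    private final AtomicInteger totalLinksCount = new AtomicInteger();

    public void incrementProcessedPageCount() {
        processedPageCount.incrementAndGet();  // atomic ++
    }

    public void incrementTotalLinksCount(int linksCount) {
        totalLinksCount.addAndGet(linksCount); // atomic +=
    }

    public int getProcessedPageCount() {
        return processedPageCount.get();
    }

    public int getTotalLinksCount() {
        return totalLinksCount.get();
    }
}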
@ -0,0 +1,48 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import edu.uci.ics.crawler4j.crawler.Page;
|
||||||
|
import edu.uci.ics.crawler4j.crawler.WebCrawler;
|
||||||
|
import edu.uci.ics.crawler4j.parser.HtmlParseData;
|
||||||
|
import edu.uci.ics.crawler4j.url.WebURL;
|
||||||
|
|
||||||
|
public class HtmlCrawler extends WebCrawler {
|
||||||
|
|
||||||
|
private final static Pattern EXCLUSIONS = Pattern.compile(".*(\\.(css|js|xml|gif|jpg|png|mp3|mp4|zip|gz|pdf))$");
|
||||||
|
|
||||||
|
private CrawlerStatistics stats;
|
||||||
|
|
||||||
|
public HtmlCrawler(CrawlerStatistics stats) {
|
||||||
|
this.stats = stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldVisit(Page referringPage, WebURL url) {
|
||||||
|
String urlString = url.getURL().toLowerCase();
|
||||||
|
return !EXCLUSIONS.matcher(urlString).matches()
|
||||||
|
&& urlString.startsWith("https://www.baeldung.com/");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(Page page) {
|
||||||
|
String url = page.getWebURL().getURL();
|
||||||
|
stats.incrementProcessedPageCount();
|
||||||
|
|
||||||
|
if (page.getParseData() instanceof HtmlParseData) {
|
||||||
|
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
|
||||||
|
String title = htmlParseData.getTitle();
|
||||||
|
String text = htmlParseData.getText();
|
||||||
|
String html = htmlParseData.getHtml();
|
||||||
|
Set<WebURL> links = htmlParseData.getOutgoingUrls();
|
||||||
|
stats.incrementTotalLinksCount(links.size());
|
||||||
|
|
||||||
|
System.out.printf("Page with title '%s' %n", title);
|
||||||
|
System.out.printf(" Text length: %d %n", text.length());
|
||||||
|
System.out.printf(" HTML length: %d %n", html.length());
|
||||||
|
System.out.printf(" %d outbound links %n", links.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
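As a quick illustration of the visit filter (a sketch, not part of the commit; it assumes crawler4j 4.4's WebURL.setURL, and passes null for referringPage because this override ignores it):

WebURL pdf = new WebURL();
pdf.setURL("https://www.baeldung.com/some-guide.pdf");

WebURL article = new WebURL();
article.setURL("https://www.baeldung.com/java-tutorial");

HtmlCrawler crawler = new HtmlCrawler(new CrawlerStatistics());
crawler.shouldVisit(null, pdf);     // false: matches the EXCLUSIONS pattern
crawler.shouldVisit(null, article); // true: on-site and not excluded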
@ -0,0 +1,36 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlController;
|
||||||
|
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
||||||
|
|
||||||
|
public class HtmlCrawlerController {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
File crawlStorage = new File("src/test/resources/crawler4j");
|
||||||
|
CrawlConfig config = new CrawlConfig();
|
||||||
|
config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
|
||||||
|
config.setMaxDepthOfCrawling(2);
|
||||||
|
|
||||||
|
int numCrawlers = 12;
|
||||||
|
|
||||||
|
PageFetcher pageFetcher = new PageFetcher(config);
|
||||||
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
||||||
|
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
|
||||||
|
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
|
||||||
|
|
||||||
|
controller.addSeed("https://www.baeldung.com/");
|
||||||
|
|
||||||
|
CrawlerStatistics stats = new CrawlerStatistics();
|
||||||
|
CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);
|
||||||
|
|
||||||
|
controller.start(factory, numCrawlers);
|
||||||
|
System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
|
||||||
|
System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
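Worth noting: CrawlController.start(factory, numCrawlers) blocks until the crawl completes, which is why the two statistics lines only print afterwards. MultipleCrawlerController below uses startNonBlocking instead to run two crawls concurrently.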
@ -0,0 +1,49 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import edu.uci.ics.crawler4j.crawler.Page;
|
||||||
|
import edu.uci.ics.crawler4j.crawler.WebCrawler;
|
||||||
|
import edu.uci.ics.crawler4j.parser.BinaryParseData;
|
||||||
|
import edu.uci.ics.crawler4j.url.WebURL;
|
||||||
|
|
||||||
|
public class ImageCrawler extends WebCrawler {
|
||||||
|
private final static Pattern EXCLUSIONS = Pattern.compile(".*(\\.(css|js|xml|gif|png|mp3|mp4|zip|gz|pdf))$");
|
||||||
|
|
||||||
|
private static final Pattern IMG_PATTERNS = Pattern.compile(".*(\\.(jpg|jpeg))$");
|
||||||
|
|
||||||
|
private File saveDir;
|
||||||
|
|
||||||
|
public ImageCrawler(File saveDir) {
|
||||||
|
this.saveDir = saveDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldVisit(Page referringPage, WebURL url) {
|
||||||
|
String urlString = url.getURL().toLowerCase();
|
||||||
|
if (EXCLUSIONS.matcher(urlString).matches()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IMG_PATTERNS.matcher(urlString).matches()
|
||||||
|
|| urlString.startsWith("https://www.baeldung.com/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(Page page) {
|
||||||
|
String url = page.getWebURL().getURL();
|
||||||
|
if (IMG_PATTERNS.matcher(url).matches()
|
||||||
|
&& page.getParseData() instanceof BinaryParseData) {
|
||||||
|
String extension = url.substring(url.lastIndexOf("."));
|
||||||
|
int contentLength = page.getContentData().length;
|
||||||
|
|
||||||
|
System.out.printf("Extension is '%s' with content length %d %n", extension, contentLength);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
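One gap in ImageCrawler as committed: saveDir is stored but never used, so matched images are only logged, never written to disk. A minimal sketch of persisting them inside the if block of visit() — the file-naming scheme here is a made-up assumption:

// Sketch only: could replace the printf in visit() with something like this.
File target = new File(saveDir, "img-" + url.hashCode() + extension);
try {
    // getContentData() returns the raw downloaded bytes.
    java.nio.file.Files.write(target.toPath(), page.getContentData());
} catch (java.io.IOException e) {
    System.err.println("Could not save " + url + ": " + e.getMessage());
}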
@ -0,0 +1,36 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlController;
|
||||||
|
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
||||||
|
|
||||||
|
public class ImageCrawlerController {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
File crawlStorage = new File("src/test/resources/crawler4j");
|
||||||
|
CrawlConfig config = new CrawlConfig();
|
||||||
|
config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
|
||||||
|
config.setIncludeBinaryContentInCrawling(true);
|
||||||
|
config.setMaxPagesToFetch(500);
|
||||||
|
|
||||||
|
File saveDir = new File("src/test/resources/crawler4j");
|
||||||
|
|
||||||
|
int numCrawlers = 12;
|
||||||
|
|
||||||
|
PageFetcher pageFetcher = new PageFetcher(config);
|
||||||
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
||||||
|
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
|
||||||
|
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
|
||||||
|
|
||||||
|
controller.addSeed("https://www.baeldung.com/");
|
||||||
|
|
||||||
|
CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
|
||||||
|
|
||||||
|
controller.start(factory, numCrawlers);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package com.baeldung.crawler4j;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
|
||||||
|
import edu.uci.ics.crawler4j.crawler.CrawlController;
|
||||||
|
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
|
||||||
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
||||||
|
|
||||||
|
public class MultipleCrawlerController {
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
File crawlStorageBase = new File("src/test/resources/crawler4j");
|
||||||
|
CrawlConfig htmlConfig = new CrawlConfig();
|
||||||
|
CrawlConfig imageConfig = new CrawlConfig();
|
||||||
|
|
||||||
|
htmlConfig.setCrawlStorageFolder(new File(crawlStorageBase, "html").getAbsolutePath());
|
||||||
|
imageConfig.setCrawlStorageFolder(new File(crawlStorageBase, "image").getAbsolutePath());
|
||||||
|
imageConfig.setIncludeBinaryContentInCrawling(true);
|
||||||
|
|
||||||
|
htmlConfig.setMaxPagesToFetch(500);
|
||||||
|
imageConfig.setMaxPagesToFetch(1000);
|
||||||
|
|
||||||
|
PageFetcher pageFetcherHtml = new PageFetcher(htmlConfig);
|
||||||
|
PageFetcher pageFetcherImage = new PageFetcher(imageConfig);
|
||||||
|
|
||||||
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
||||||
|
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcherHtml);
|
||||||
|
|
||||||
|
CrawlController htmlController = new CrawlController(htmlConfig, pageFetcherHtml, robotstxtServer);
|
||||||
|
CrawlController imageController = new CrawlController(imageConfig, pageFetcherImage, robotstxtServer);
|
||||||
|
|
||||||
|
htmlController.addSeed("https://www.baeldung.com/");
|
||||||
|
imageController.addSeed("https://www.baeldung.com/");
|
||||||
|
|
||||||
|
CrawlerStatistics stats = new CrawlerStatistics();
|
||||||
|
CrawlController.WebCrawlerFactory<HtmlCrawler> htmlFactory = () -> new HtmlCrawler(stats);
|
||||||
|
|
||||||
|
File saveDir = new File("src/test/resources/crawler4j");
|
||||||
|
CrawlController.WebCrawlerFactory<ImageCrawler> imageFactory = () -> new ImageCrawler(saveDir);
|
||||||
|
|
||||||
|
imageController.startNonBlocking(imageFactory, 7);
|
||||||
|
htmlController.startNonBlocking(htmlFactory, 10);
|
||||||
|
|
||||||
|
|
||||||
|
htmlController.waitUntilFinish();
|
||||||
|
System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
|
||||||
|
System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
|
||||||
|
|
||||||
|
imageController.waitUntilFinish();
|
||||||
|
System.out.printf("Image Crawler is finished.");
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
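Because both controllers are started with startNonBlocking, the HTML and image crawls run concurrently; each waitUntilFinish() call then blocks until that controller's crawlers are done. Sharing one RobotstxtServer across both controllers works here since robots.txt directives are looked up per host.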