Amy DeGregorio 0ece18c766 Bael 2979 Guide to Crawler4j (#7071)
* BAEL-2727 Example Code

* BAEL-2979 Guide to Crawler4j

* BAEL-2979 adjust based on feedback
2019-06-07 13:54:00 +02:00

37 lines
1.4 KiB
Java

package com.baeldung.crawler4j;
import java.io.File;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
public class HtmlCrawlerController {
public static void main(String[] args) throws Exception {
File crawlStorage = new File("src/test/resources/crawler4j");
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
config.setMaxDepthOfCrawling(2);
int numCrawlers = 12;
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("https://www.baeldung.com/");
CrawlerStatistics stats = new CrawlerStatistics();
CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);
controller.start(factory, numCrawlers);
System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
}
}