37 lines
1.3 KiB
Java
Raw Normal View History

package com.baeldung.crawler4j;
import java.io.File;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
/**
 * Command-line launcher that wires up a crawler4j {@link CrawlController}
 * and starts a crawl whose workers are {@link ImageCrawler} instances.
 */
public class ImageCrawlerController {

    /**
     * Builds the crawl configuration, seeds the controller with a single URL
     * and starts the crawl.
     *
     * @param args command-line arguments; not read by this method
     * @throws Exception if any step of the crawler4j setup or the crawl itself fails
     */
    public static void main(String[] args) throws Exception {
        // Directory handed to every ImageCrawler the factory creates.
        final File imageOutputDir = new File("src/test/resources/crawler4j");
        // Crawler count passed to CrawlController.start.
        final int workerCount = 12;

        final CrawlConfig crawlConfig = buildCrawlConfig();
        final PageFetcher fetcher = new PageFetcher(crawlConfig);
        final RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);

        final CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robotsServer);
        crawlController.addSeed("https://www.baeldung.com/");

        final CrawlController.WebCrawlerFactory<ImageCrawler> crawlerFactory =
                () -> new ImageCrawler(imageOutputDir);
        crawlController.start(crawlerFactory, workerCount);
    }

    /**
     * Creates the {@link CrawlConfig} used by {@link #main(String[])}: sets the
     * crawl storage folder, turns on binary content and limits the crawl to
     * 500 fetched pages.
     *
     * @return the populated configuration
     */
    private static CrawlConfig buildCrawlConfig() {
        final String storagePath = new File("src/test/resources/crawler4j").getAbsolutePath();

        final CrawlConfig crawlConfig = new CrawlConfig();
        crawlConfig.setCrawlStorageFolder(storagePath);
        // Binary content must be included, otherwise image responses would not be crawled.
        crawlConfig.setIncludeBinaryContentInCrawling(true);
        crawlConfig.setMaxPagesToFetch(500);
        return crawlConfig;
    }
}