37 lines
1.3 KiB
Java
37 lines
1.3 KiB
Java
|
|
package com.baeldung.crawler4j;
|
||
|
|
|
||
|
|
import java.io.File;
|
||
|
|
|
||
|
|
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
|
||
|
|
import edu.uci.ics.crawler4j.crawler.CrawlController;
|
||
|
|
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
|
||
|
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
|
||
|
|
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
||
|
|
|
||
|
|
public class ImageCrawlerController {
|
||
|
|
|
||
|
|
public static void main(String[] args) throws Exception {
|
||
|
|
File crawlStorage = new File("src/test/resources/crawler4j");
|
||
|
|
CrawlConfig config = new CrawlConfig();
|
||
|
|
config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
|
||
|
|
config.setIncludeBinaryContentInCrawling(true);
|
||
|
|
config.setMaxPagesToFetch(500);
|
||
|
|
|
||
|
|
File saveDir = new File("src/test/resources/crawler4j");
|
||
|
|
|
||
|
|
int numCrawlers = 12;
|
||
|
|
|
||
|
|
PageFetcher pageFetcher = new PageFetcher(config);
|
||
|
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
||
|
|
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
|
||
|
|
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
|
||
|
|
|
||
|
|
controller.addSeed("https://www.baeldung.com/");
|
||
|
|
|
||
|
|
CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
|
||
|
|
|
||
|
|
controller.start(factory, numCrawlers);
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|