实现爬虫自动同步数据库 加入spring batch做未来的改动准备

This commit is contained in:
yang.xie 2022-11-06 11:40:48 +08:00
parent 60fd6c7d14
commit 2f412bee98
23 changed files with 711 additions and 398 deletions

View File

@ -19,3 +19,17 @@ docker tag usvisartrackapi:0.0.2-snapshot admin/usvisartrackapi:0.0.2-snapshot
docker push pubuser/usvisartrackapi:0.0.2-snapshot
docker tag usvisartrackapi:0.0.2-snapshot 54.39.157.60:8092/library/usvisartrackapi:0.0.2-snapshot
docker compose 发布脚本
docker compose pull
docker compose up -d --remove-orphans
docker image prune -f
docker logs --tail=100 gitlab-runner -f
docker logs --tail=100 usvisatrackapi -f

View File

@ -7,7 +7,7 @@ param (
[string]$DockerServerName = "usvisartrackapi",
[string]$CodeServerPort = "8282",
[string]$PublishServerPort = "8383",
[string]$BuildVerison = "0.0.2-snapshot",
[string]$BuildVerison = "0.0.5-snapshot",
[string]$PushServer = "repo-docker.ossez.com",
[string]$PushPath = "/docker-hub/"
)

17
pom.xml
View File

@ -70,11 +70,6 @@
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>com.configcat</groupId>
<artifactId>configcat-java-client</artifactId>
<version>7.2.0</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
@ -97,14 +92,20 @@
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.8</version>
<version>5.8.9</version>
</dependency>
<dependency>
<groupId>com.flagsmith</groupId>
<artifactId>flagsmith-java-client</artifactId>
<version>5.0.6</version>
</dependency>
<!-- hibernate enhancement -->
<dependency>
<groupId>com.vladmihalcea</groupId>
<artifactId>hibernate-types-55</artifactId>
<version>2.19.2</version>
<version>2.20.0</version>
</dependency>
<!-- DATABASE Client -->
@ -156,7 +157,7 @@
<dependency>
<groupId>com.mailgun</groupId>
<artifactId>mailgun-java</artifactId>
<version>1.0.3</version>
<version>1.0.4</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>

View File

@ -0,0 +1,23 @@
package com.northtecom.visatrack.api.base.util;
import java.util.Objects;
/**
 * A callback that takes no arguments and produces no result.
 *
 * @Author: XieYang
 * @Date: 2022/10/31/11:57
 */
@FunctionalInterface
public interface Action {

    /**
     * Runs this action.
     */
    void accept();

    /**
     * Builds a composed action that runs this action first and {@code after} second.
     * If this action throws, {@code after} is not run.
     *
     * @param after the action to run once this one has returned; must not be null
     * @return the composed action
     * @throws NullPointerException if {@code after} is null
     */
    default Action andThen(Action after) {
        final Action next = Objects.requireNonNull(after);
        final Action first = this;
        return () -> {
            first.accept();
            next.accept();
        };
    }
}

View File

@ -5,14 +5,11 @@
*/
package com.northtecom.visatrack.api.base.util;
import com.configcat.ConfigCatClient;
import com.configcat.User;
import com.mailgun.api.v3.MailgunMessagesApi;
import com.mailgun.client.MailgunClient;
import com.mailgun.model.message.Message;
import com.mailgun.model.message.MessageResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
/**
@ -33,20 +30,25 @@ public class EmailUtils {
private static String emailSenderAddress = "info@usvisatrack.com";
private ConfigCatClient configCatClient;
// @Autowired
// private FlagsmithClient flagsmithClient;
private MailgunMessagesApi mailgunMessagesApi;
public EmailUtils() {
configCatClient = new ConfigCatClient("d5naCOKEsUeKSEB2aamvxg/JRdvJ42xcUKZGqnHq1vQgQ");
User userObject = User.newBuilder().build(EMAIL_KEY_IDENTIFIER);
// Unique identifier is required. Could be UserID, Email address or SessionID.
String emailKey = configCatClient.getValue(String.class, EMAIL_KEY_NAME, userObject, StringUtils.EMPTY);
emailSenderAddress = configCatClient.getValue(String.class, EMAIL_KEY_SENDER_NAME, userObject,
emailSenderAddress);
String emailKey = TryGetConfigByKey(EMAIL_KEY_NAME, "979fcedb0aa8bcdeab632bbf6baa74e0");
emailSenderAddress = TryGetConfigByKey(EMAIL_KEY_SENDER_NAME, "updates@usvisatrack.com");
mailgunMessagesApi = MailgunClient.config(emailKey).createApi(MailgunMessagesApi.class);
}
/**
 * Resolves a configuration value for {@code key}.
 *
 * <p>The remote lookup is commented out, so this method currently ignores {@code key}
 * and always returns {@code defaultValue}.
 *
 * @param key          configuration key (unused while the remote lookup is disabled)
 * @param defaultValue value returned to the caller
 * @return {@code defaultValue}
 */
private String TryGetConfigByKey(String key, String defaultValue) {
// Disabled Flagsmith lookup, kept for reference:
// try {
// Flags flags = flagsmithClient.getEnvironmentFlags();
// return flags.getFeatureValue(key).toString();
// } catch (FlagsmithClientError flagsmithClientError) {
// flagsmithClientError.printStackTrace();
// }
return defaultValue;
}
/**
* Send Test Email to check config and email sending API

View File

@ -1,6 +1,6 @@
package com.northtecom.visatrack.api.config;
import com.configcat.ConfigCatClient;
import com.flagsmith.FlagsmithClient;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@ -26,10 +26,15 @@ public class ApplicationBeanConfig {
private AuthenticationConfiguration authenticationConfiguration;
@Bean
public ConfigCatClient configCatClient() throws Exception {
return new ConfigCatClient("d5naCOKEsUeKSEB2aamvxg/JRdvJ42xcUKZGqnHq1vQgQ");
public FlagsmithClient flagsmithClient() throws Exception {
return FlagsmithClient
.newBuilder()
.setApiKey("bNRvdzMgcojGLCP6ts6fjB")
.withApiUrl("https://flag.ossez.com/api/v1/")
.build();
}
/**
* 用于配置 AuthenticationManager 实例
*/

View File

@ -12,6 +12,7 @@ import com.northtecom.visatrack.api.service.impl.CaseVisaReportService;
import com.northtecom.visatrack.api.service.impl.VisaCaseService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
@ -37,6 +38,7 @@ public class CaseVisaReportReportController {
private final CaseVisaReportService caseVisaReportService;
private final VisaCaseService visaCaseService;
@Autowired
public CaseVisaReportReportController(CaseVisaReportService caseVisaReportService,
VisaCaseService visaCaseService) {
this.caseVisaReportService = caseVisaReportService;

View File

@ -52,18 +52,49 @@ public class CrawlController {
this.visaCaseService = visaCaseService;
}
/**
* Crawl checkee report data
*
* @return boolean
*/
@PostMapping("/checkee_report/crawl")
@Operation(summary = "爬取签证申请报表数据", description = "爬取签证申请报表数据")
public boolean crawlCheckeeReport() {
this.visaReportCheckeeService.crawlCheckeeReport();
@PostMapping("/checkee_visa/syncAllVisaDataFromCheckee")
@Operation(summary = "同步所有数据", description = "同步所有数据")
public Boolean syncAllVisaDataFromCheckee() {
LocalDate endDate = LocalDate.now();
LocalDate startDate = LocalDate.of(2018, 10, 1);
this.visaReportCheckeeService.syncDataAndReport(endDate, startDate);
return true;
}
/**
 * Asks the checkee report service to sync data and reports for the window that starts
 * three months before today and ends today. Exceptions from the service are not caught here.
 *
 * @return {@code true} once the service call has returned
 */
@PostMapping("/checkee_visa/syncLast3monthVisaDataFromCheckee")
@Operation(summary = "同步最近3个月的数据", description = "同步最近3个月的数据")
public Boolean syncLast3monthVisaDataFromCheckee() {
    final LocalDate today = LocalDate.now();
    final LocalDate threeMonthsAgo = today.minusMonths(3);
    // The service takes the end date first, then the start date.
    this.visaReportCheckeeService.syncDataAndReport(today, threeMonthsAgo);
    return Boolean.TRUE;
}
/**
 * Asks the checkee report service to sync data and reports for the window that starts
 * three years before today and ends today. Exceptions from the service are not caught here.
 *
 * @return {@code true} once the service call has returned
 */
@PostMapping("/checkee_visa/syncLast3YearsVisaDataFromCheckee")
@Operation(summary = "同步最近3年的数据", description = "同步最近3年的数据")
public Boolean syncLast3YearsVisaDataFromCheckee() {
    final LocalDate today = LocalDate.now();
    final LocalDate threeYearsAgo = today.minusYears(3);
    // The service takes the end date first, then the start date.
    this.visaReportCheckeeService.syncDataAndReport(today, threeYearsAgo);
    return Boolean.TRUE;
}
/**
 * Delegates to the checkee report service to rebuild all reports.
 * Exceptions from the service are not caught here.
 *
 * @return {@code true} once the service call has returned
 */
@PostMapping("/checkee_visa/rebuildAllReport")
@Operation(summary = "重建所有报表", description = "重建所有报表")
public Boolean rebuildAllReport() {
    visaReportCheckeeService.rebuildAllReport();
    return Boolean.TRUE;
}
/**
 * Delegates to the visa case service to import newly crawled visa data.
 * Exceptions from the service are not caught here.
 *
 * @return {@code true} once the service call has returned
 * @throws ParseException declared on this endpoint's signature
 */
@PostMapping("/checkee_visa/importNewCrawlData")
@Operation(summary = "导入新的签证数据", description = "导入新的签证数据")
public boolean importNewCrawlData() throws ParseException {
    visaCaseService.importNewCrawlData();
    return true;
}
/**
* Import checkee crawl data to database
*

View File

@ -14,21 +14,16 @@ import java.util.Date;
* Created with IntelliJ IDEA.
*
* @Author: XieYang
* @Date: 2022/10/04/20:00
* @Date: 2022/10/31/8:48
* @Description:
*/
@Entity
@Data
@Table(name = "visa_checkee_crawl_html",uniqueConstraints = {
@UniqueConstraint(columnNames={"crawl_key"})
@Table(name = "crawl_cache", uniqueConstraints = {
@UniqueConstraint(columnNames = {"crawl_key"})
})
@org.hibernate.annotations.Table(appliesTo = "visa_checkee_crawl_html", comment = "Visa case checkee crawl html")
public class VisaCheckeeCrawlHtml extends BaseEntity<Long> {
public static final String CRAWL_KEY_REPORT_LIST = "report_list";
public static final String CRAWL_KEY_REPORT_DETAIL_LIST = "report_detail_list";
public static final String CRAWL_KEY_VISA_DETAIL = "visa_detail";
public static final String CRAWL_KEY_VISA_UPDATE = "visa_update";
@org.hibernate.annotations.Table(appliesTo = "crawl_cache", comment = "Crawl cache")
public class CrawlCache extends BaseEntity<Long> {
/**
* Crawl key
*/
@ -42,8 +37,8 @@ public class VisaCheckeeCrawlHtml extends BaseEntity<Long> {
/**
* Crawl html content
*/
@Column(name = "content", columnDefinition = "LONGTEXT comment 'crawl html content'")
private String content;
@Column(name = "oss_path", columnDefinition = "varchar(500) comment 'Oss path'")
private String OssPath;
/**
* Cache days
*/

View File

@ -80,7 +80,7 @@ public class VisaCase extends BaseEntity<Long> {
/**
* Note
*/
@Column(name = "note", columnDefinition = "varchar(2000) COMMENT 'note'")
@Column(name = "note", columnDefinition = "Text COMMENT 'note'")
private String note;
/**
* Visa interview date

View File

@ -43,24 +43,24 @@ public class VisaCheckeeCrawlData extends BaseEntity<Long> {
private String status;
@Column(name = "complete_date", columnDefinition = "Date COMMENT 'complete date'")
private LocalDate completeDate;
@Column(name = "note", columnDefinition = "varchar(2000) COMMENT 'note'")
@Column(name = "note", columnDefinition = "TEXT COMMENT 'note'")
private String note;
@Column(name = "last_name", columnDefinition = "varchar(50) COMMENT 'last name'")
@Column(name = "last_name", columnDefinition = "varchar(100) COMMENT 'last name'")
private String lastName;
@Column(name = "first_name", columnDefinition = "varchar(50) COMMENT 'first name'")
@Column(name = "first_name", columnDefinition = "varchar(100) COMMENT 'first name'")
private String firstName;
@Column(name = "university", columnDefinition = "varchar(100) COMMENT 'university'")
private String university;
@Column(name = "degree", columnDefinition = "varchar(50) COMMENT 'degree'")
@Column(name = "degree", columnDefinition = "varchar(200) COMMENT 'degree'")
private String degree;
@Column(name = "employer", columnDefinition = "varchar(50) COMMENT 'employer'")
@Column(name = "employer", columnDefinition = "varchar(100) COMMENT 'employer'")
private String employer;
@Column(name = "job_title", columnDefinition = "varchar(50) COMMENT 'job title'")
@Column(name = "job_title", columnDefinition = "varchar(100) COMMENT 'job title'")
private String jobTitle;
@Column(name = "years_in_usa", columnDefinition = "varchar(50) COMMENT 'years in usa'")
@Column(name = "years_in_usa", columnDefinition = "varchar(100) COMMENT 'years in usa'")
private String yearsInUsa;
@Column(name = "country", columnDefinition = "varchar(50) COMMENT 'country'")
@Column(name = "country", columnDefinition = "varchar(100) COMMENT 'country'")
private String country;
@Column(name = "part_email", columnDefinition = "varchar(50) COMMENT 'part email'")
@Column(name = "part_email", columnDefinition = "varchar(100) COMMENT 'part email'")
private String partEmail;
}

View File

@ -0,0 +1,20 @@
package com.northtecom.visatrack.api.data.repository;
import com.northtecom.visatrack.api.data.entity.CrawlCache;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import java.util.Optional;
/**
 * Spring Data JPA repository for {@link CrawlCache} entities.
 *
 * @Author: XieYang
 * @Date: 2022/10/04/20:05
 */
public interface CrawlCacheRepository extends JpaRepository<CrawlCache, Long> {
/**
 * Looks up the cache entry whose {@code crawlKey} equals the given key; matches are ordered by
 * {@code crawlDate} descending.
 *
 * NOTE(review): the query has no row limit while the return type is a single {@link Optional};
 * this presumably relies on crawl_key being unique so that at most one row matches - confirm.
 *
 * @param crawlKey the crawl key to match
 * @return the matching cache entry, or empty when none exists
 */
@Query("select vh from CrawlCache vh where vh.crawlKey = :crawlKey order by vh.crawlDate desc")
Optional<CrawlCache> findLatestCrawlCache(@Param("crawlKey") String crawlKey);
}

View File

@ -30,4 +30,9 @@ public interface VisaCheckeeCrawlDataRepository extends JpaRepository<VisaChecke
@Query(value = "Select * from usvisatrack.visa_checkee_crawl_data where (first_name is null or first_name = '')" +
" or (part_email is null or part_email = '')", nativeQuery = true)
List<VisaCheckeeCrawlData> QueryNotDetail();
/**
 * Returns the crawled rows whose {@code case_num} is not yet referenced by any
 * {@code visa_case.ref_crawl_case_number}.
 *
 * <p>Written with {@code NOT EXISTS} rather than {@code NOT IN}: with {@code NOT IN}, a single
 * NULL {@code ref_crawl_case_number} in {@code visa_case} makes the predicate evaluate to
 * unknown for every row, so the query would return nothing. Rows with a NULL {@code case_num}
 * are excluded because they cannot be matched against a visa case.
 *
 * @return crawl rows that have no corresponding visa case
 */
@Query(value = "Select d.* from usvisatrack.visa_checkee_crawl_data d where d.case_num is not null " +
        "and not exists (select 1 from usvisatrack.visa_case v where v.ref_crawl_case_number = d.case_num)",
        nativeQuery = true)
List<VisaCheckeeCrawlData> QueryNewCrawlData();
}

View File

@ -1,20 +0,0 @@
package com.northtecom.visatrack.api.data.repository;
import com.northtecom.visatrack.api.data.entity.VisaCheckeeCrawlHtml;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import java.util.Optional;
/**
 * Spring Data JPA repository for {@link VisaCheckeeCrawlHtml} entities.
 *
 * @Author: XieYang
 * @Date: 2022/10/04/20:05
 */
public interface VisaCheckeeCrawlHtmlRepository extends JpaRepository<VisaCheckeeCrawlHtml, Long> {
/**
 * Looks up the crawled-HTML row whose {@code crawlKey} equals the given key; matches are ordered
 * by {@code crawlDate} descending.
 *
 * NOTE(review): the query has no row limit while the return type is a single {@link Optional};
 * this presumably relies on crawl_key being unique - confirm.
 *
 * @param crawlKey the crawl key to match
 * @return the matching row, or empty when none exists
 */
@Query("select vh from VisaCheckeeCrawlHtml vh where vh.crawlKey = :crawlKey order by vh.crawlDate desc")
Optional<VisaCheckeeCrawlHtml> findLatestCrawlHtml(@Param("crawlKey") String crawlKey);
}

View File

@ -0,0 +1,97 @@
package com.northtecom.visatrack.api.schedule;
import com.northtecom.visatrack.api.service.impl.VisaCaseService;
import com.northtecom.visatrack.api.service.impl.VisaReportCheckeeService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.time.LocalDate;
import java.time.LocalDateTime;
/**
 * Scheduled jobs that sync visa data from checkee and refresh the reports.
 *
 * <p>Every job catches and logs any exception, so a failing run never propagates out of the
 * scheduled method.
 *
 * @Author: XieYang
 * @Date: 2022/10/15/9:01
 */
@Component
@Slf4j
public class VisaCaseSyncTask {

    /** Start date used by the full sync job. */
    private static final LocalDate FULL_SYNC_START_DATE = LocalDate.of(2018, 10, 1);

    // Not used by the jobs below; kept because it is part of the constructor signature.
    private final VisaCaseService visaCaseService;

    private final VisaReportCheckeeService visaReportCheckeeService;

    public VisaCaseSyncTask(VisaCaseService visaCaseService, VisaReportCheckeeService visaReportCheckeeService) {
        this.visaCaseService = visaCaseService;
        this.visaReportCheckeeService = visaReportCheckeeService;
    }

    /**
     * Syncs the visa data of the last 3 months and updates the reports.
     * Runs at the top of every hour.
     */
    @Scheduled(cron = "0 0 0/1 * * ?")
    public void syncLast3monthVisaDataFromCheckee() {
        // The messages previously said "half a year" although the window is 3 months.
        log.info("同步最近3个月的签证数据,并更新报表 触发于:{}", LocalDateTime.now());
        try {
            LocalDate endDate = LocalDate.now();
            LocalDate startDate = endDate.minusMonths(3);
            this.visaReportCheckeeService.syncDataAndReport(endDate, startDate);
        } catch (Exception e) {
            log.error("同步最近3个月的签证数据,并更新报表 异常:{}", e.getMessage(), e);
        }
    }

    /**
     * Syncs the visa data of the last 3 years and updates the reports.
     * Runs every day at 03:00.
     */
    @Scheduled(cron = "0 0 3 * * ?")
    public void syncLast3YearsVisaDataFromCheckee() {
        log.info("同步最近3年的签证数据并更新报表 触发于:{}", LocalDateTime.now());
        try {
            LocalDate endDate = LocalDate.now();
            LocalDate startDate = endDate.minusYears(3);
            this.visaReportCheckeeService.syncDataAndReport(endDate, startDate);
        } catch (Exception e) {
            log.error("同步最近3年的签证数据 异常:{}", e.getMessage(), e);
        }
    }

    /**
     * Syncs all visa data from 2018-10-01 up to today and updates the reports.
     * Runs at 01:00 on the first day of every month.
     */
    @Scheduled(cron = "0 0 1 1 * ?")
    public void syncAllVisaDataFromCheckee() {
        log.info("同步最近所有的签证数据,并更新报表 触发于:{}", LocalDateTime.now());
        try {
            LocalDate endDate = LocalDate.now();
            this.visaReportCheckeeService.syncDataAndReport(endDate, FULL_SYNC_START_DATE);
        } catch (Exception e) {
            log.error("同步最近所有的签证数据,并更新报表 异常:{}", e.getMessage(), e);
        }
    }

    /**
     * Rebuilds the report data.
     * Runs at minute 20 and minute 40 of every hour.
     */
    @Scheduled(cron = "0 20,40 * * * ?")
    public void rebuildReport() {
        log.info("重建报表数据 触发于:{}", LocalDateTime.now());
        try {
            this.visaReportCheckeeService.rebuildAllReport();
        } catch (Exception e) {
            log.error("重建报表数据 异常:{}", e.getMessage(), e);
        }
    }
}

View File

@ -1,51 +0,0 @@
package com.northtecom.visatrack.api.schedule;
import com.northtecom.visatrack.api.service.impl.VisaCaseService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
/**
 * Report-related tasks. Both {@code @Scheduled} annotations in this class are commented out,
 * so the methods only run when called explicitly.
 *
 * @Author: XieYang
 * @Date: 2022/10/15/9:01
 */
@Component
@Slf4j
public class VisaReportTask {

    private final VisaCaseService visaCaseService;

    public VisaReportTask(VisaCaseService visaCaseService) {
        this.visaCaseService = visaCaseService;
    }

    /**
     * Scheduling test helper: logs the trigger time, then sleeps for five seconds.
     */
    // @Scheduled(cron = "0/3 * * * * *")
    public void test_3() {
        log.info("test_3 触发于:{}", LocalDateTime.now());
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the calling thread can still observe the interruption.
            Thread.currentThread().interrupt();
            log.warn("test_3 interrupted while sleeping", e);
        }
    }

    /**
     * Calculates the visa statistics for the last 12 months (intended to refresh every 2 hours).
     * Any exception is caught and logged.
     */
    // @Scheduled(initialDelay = 20000, fixedDelay = 1000 * 60 * 60 * 2)
    public void caculateLast12MonthsMainReport() {
        log.info("计算最近12个月的签证统计 触发于:{}", LocalDateTime.now());
        try {
            visaCaseService.calculateMainReport(12);
        } catch (Exception e) {
            log.error("计算最近12个月的签证统计 异常:{}", e.getMessage(), e);
        }
    }
}

View File

@ -23,6 +23,7 @@ import java.util.stream.Collectors;
@Service
public class AwsSysFileService {
public static final String DEFAULT_BUCKET_NAME = "usvsiatrackcrawl";
private final AmazonS3 s3Client;

View File

@ -1,13 +1,12 @@
package com.northtecom.visatrack.api.service.impl;
import cn.hutool.core.date.DateUtil;
import com.northtecom.visatrack.api.config.CrawlConfig;
import com.northtecom.visatrack.api.data.entity.CrawlCache;
import com.northtecom.visatrack.api.data.entity.VisaCheckeeCrawlData;
import com.northtecom.visatrack.api.data.entity.VisaCheckeeCrawlHtml;
import com.northtecom.visatrack.api.data.repository.VisaCheckeeCrawlHtmlRepository;
import com.northtecom.visatrack.api.data.repository.CrawlCacheRepository;
import com.northtecom.visatrack.api.service.dto.CrawlHtml;
import com.northtecom.visatrack.api.service.dto.VisaCheckeeData;
import com.northtecom.visatrack.api.service.dto.VisaReportCheckeeData;
import com.northtecom.visatrack.api.service.rules.CrawlToOssRule;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
@ -17,13 +16,11 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Optional;
@ -37,20 +34,23 @@ import java.util.Optional;
@Service
@Slf4j
public class CrawlService {
public static final String URL_CHECKEE_REPORT = "https://www.checkee.info/index.php";
public static final String URL_CHECKEE_REPORT_DETAIL = "https://www.checkee.info/main.php?dispdate=";
public static final String URL_CHECKEE_VISA_DETAIL = "https://checkee.info/personal_detail.php?casenum=";
public static final String URL_CHECKEE_VISA_UPDATE = "https://checkee.info/update.php?casenum=";
private final VisaCheckeeCrawlHtmlRepository visaCheckeeCrawlHtmlRepository;
private final CrawlCacheRepository crawlCacheRepository;
private final CrawlConfig crawlConfig;
private final AwsSysFileService awsSysFileService;
private Integer crawlInterval = 2000;
@Autowired
public CrawlService(VisaCheckeeCrawlHtmlRepository visaCheckeeCrawlHtmlRepository, CrawlConfig crawlConfig) {
this.visaCheckeeCrawlHtmlRepository = visaCheckeeCrawlHtmlRepository;
public CrawlService(
CrawlCacheRepository crawlCacheRepository, CrawlConfig crawlConfig,
AwsSysFileService awsSysFileService) {
this.crawlCacheRepository = crawlCacheRepository;
this.crawlConfig = crawlConfig;
this.awsSysFileService = awsSysFileService;
}
/**
@ -60,7 +60,7 @@ public class CrawlService {
* @return {@link Document}
* @throws IOException ioexception
*/
public Document getDocument(String url) throws IOException, InterruptedException {
public Document getDocument(String url) throws InterruptedException {
Document document = null;
log.info("Crawl url: {},Crawl config {}", url, this.crawlConfig);
int i = 0;
@ -90,153 +90,106 @@ public class CrawlService {
return document;
}
public CrawlHtml crawlHtmlByCrawlUrl(String crawlUrl) throws IOException,
InterruptedException {
CrawlHtml crawlHtml = new CrawlHtml();
Date crawlDate = new Date();
crawlHtml.setUrl(crawlUrl);
Integer crawlCacheDay = caculateCrawlCacheDay(crawlUrl);
log.info("Start to read crawl url : {} from db", crawlUrl, crawlDate);
Optional<VisaCheckeeCrawlHtml> latestCrawlHtml =
visaCheckeeCrawlHtmlRepository.findLatestCrawlHtml(crawlUrl);
if (latestCrawlHtml.isPresent() && DateUtil.offsetDay(latestCrawlHtml.get().getCrawlDate(), crawlCacheDay).after(crawlDate)) {
log.info("crawl url cache in database,use db value");
crawlHtml.setCrawlTime(latestCrawlHtml.get().getUpdateTime());
crawlHtml.setHtml(latestCrawlHtml.get().getContent());
return crawlHtml;
}
Long crawlId = null;
if (latestCrawlHtml.isPresent()) {
crawlId = latestCrawlHtml.get().getId();
}
log.info("Start to crawl url {} at {}", crawlUrl, crawlDate);
String content = "";
public CrawlHtml crawlHtml(String crawlUrl) {
try {
Document doc = getDocument(crawlUrl);
if (!doc.select("title").text().equals("One moment, please...")) {
content = doc.html();
CrawlToOssRule crawlToOssRule = CrawlToOssRule.of(AwsSysFileService.DEFAULT_BUCKET_NAME, crawlUrl);
CrawlHtml crawlHtml = new CrawlHtml();
crawlHtml.setUrl(crawlUrl);
Optional<CrawlCache> latestCrawlHtml =
crawlCacheRepository.findLatestCrawlCache(crawlUrl);
// 从数据库中读取到缓存html并且没有过期
if (latestCrawlHtml.isPresent() && !crawlToOssRule.checkIsExpired(latestCrawlHtml.get().getCrawlDate())) {
log.info("crawl url cache in database,use db value,cache id is {}", latestCrawlHtml.get().getId());
crawlHtml.setCrawlTime(latestCrawlHtml.get().getUpdateTime());
String html = ReadFromOss(latestCrawlHtml.get().getOssPath(), crawlToOssRule);
crawlHtml.setHtml(html);
return crawlHtml;
}
Long crawlId = null;
if (latestCrawlHtml.isPresent()) {
crawlId = latestCrawlHtml.get().getId();
}
log.info("Start to crawl url {} at {}", crawlUrl, crawlToOssRule.getUrl());
String content = "";
try {
Document doc = getDocument(crawlUrl);
if (!doc.select("title").text().equals("One moment, please...")) {
content = doc.html();
}
} catch (Exception e) {
log.error(e.getMessage());
throw e;
}
if (StringUtils.isNotBlank(content)) {
SaveCacheHtml(content, crawlId, crawlToOssRule);
}
Integer sleepTime = this.crawlInterval;
log.info("Crawl url {} success, sleep {} ms", crawlUrl, sleepTime);
Thread.sleep(sleepTime);
crawlHtml.setCrawlTime(crawlToOssRule.getCrawlDate());
crawlHtml.setHtml(content);
return crawlHtml;
} catch (Exception e) {
log.error(e.getMessage());
throw e;
}
if (StringUtils.isNotBlank(content)) {
SaveHtml(content, crawlUrl, crawlDate, crawlId, crawlCacheDay);
}
Integer sleepTime = this.crawlInterval;
log.info("Crawl url {} success, sleep {} ms", crawlUrl, sleepTime);
Thread.sleep(sleepTime);
crawlHtml.setCrawlTime(crawlDate);
crawlHtml.setHtml(content);
return crawlHtml;
}
/**
* 计算页面缓存天数
* 如果是主页面报表 缓存一天
* 如果是月份列表 1-2月内的缓存一天 3-6 月份的缓存3天 6月-2年以上的缓存7天2年以上-6年的缓存30天6年以上的缓存3年
* 个人信息明细页面缓存 1年
*
* @param crawlUrl 爬行url
* @return {@link Integer}
*/
private Integer caculateCrawlCacheDay(String crawlUrl) {
if (crawlUrl.contains(URL_CHECKEE_REPORT)) {
return 1;
} else if (crawlUrl.contains(URL_CHECKEE_VISA_DETAIL) || crawlUrl.contains(URL_CHECKEE_VISA_UPDATE)) {
return 365;
} else if (crawlUrl.contains(URL_CHECKEE_REPORT_DETAIL)) {
String month = crawlUrl.substring(crawlUrl.lastIndexOf("=") + 1);
LocalDate localDate = LocalDate.parse(month + "-01");
LocalDate now = LocalDate.now();
int monthDiff =
now.getYear() * 12 + now.getMonthValue() - (localDate.getYear() * 12 + localDate.getMonthValue());
if (monthDiff <= 2) {
return 1;
} else if (monthDiff <= 6) {
return 7;
} else if (monthDiff <= 24) {
return 30;
} else if (monthDiff <= 72) {
return 365;
} else {
return 3 * 365;
}
} else {
return 3;
}
}
@Transactional
private void SaveHtml(String html, String crawlUrl, Date crawlDate, Long crawlId, Integer crawlCacheDay) {
VisaCheckeeCrawlHtml visaCheckeeCrawlHtml = new VisaCheckeeCrawlHtml();
if (crawlId != null) {
visaCheckeeCrawlHtml =
visaCheckeeCrawlHtmlRepository.findById(crawlId).orElse(new VisaCheckeeCrawlHtml());
}
visaCheckeeCrawlHtml.setCrawlKey(crawlUrl);
visaCheckeeCrawlHtml.setCrawlDate(crawlDate);
visaCheckeeCrawlHtml.setContent(html);
visaCheckeeCrawlHtml.setCacheDays(crawlCacheDay);
log.info("Before save crawl Key {} length {}", visaCheckeeCrawlHtml.getCrawlKey(),
visaCheckeeCrawlHtml.getCrawlKey().length());
visaCheckeeCrawlHtmlRepository.saveAndFlush(visaCheckeeCrawlHtml);
}
public List<VisaReportCheckeeData> parseCheckeeReport(String crawlContent) {
Document doc = Jsoup.parse(crawlContent);
Elements tables = doc.select("table");
Element table = tables.get(2);
Elements trs = table.select("tr");
List<VisaReportCheckeeData> visaReportCheckees = new ArrayList<>();
for (int i = 1; i < trs.size(); i++) {
VisaReportCheckeeData visaReportCheckee = parseTrToCheckeeReport(trs.get(i));
visaReportCheckees.add(visaReportCheckee);
}
return visaReportCheckees;
}
private VisaReportCheckeeData parseTrToCheckeeReport(Element tr) {
VisaReportCheckeeData visaReportCheckee = new VisaReportCheckeeData();
Elements tds = tr.select("td");
visaReportCheckee.setMonth(tds.get(1).text());
visaReportCheckee.setPendingCaseCount(Integer.parseInt(tds.get(2).text()));
visaReportCheckee.setClearCaseCount(Integer.parseInt(tds.get(3).text()));
visaReportCheckee.setRejectCaseCount(Integer.parseInt(tds.get(4).text()));
visaReportCheckee.setTotalCaseCount(Integer.parseInt(tds.get(5).text()));
visaReportCheckee.setAveWaitingDaysForCompleteCases(parseAveWaitingDaysForCompleteCasesTd(tds.get(6)));
return visaReportCheckee;
}
private Integer parseAveWaitingDaysForCompleteCasesTd(Element td) {
if (td.text().equals("-") || td.text().equals("")) {
log.error("Error when crawlHtml: {}", e.getMessage());
return null;
} else {
return Integer.parseInt(td.text());
}
}
/**
 * Reads cached HTML back through {@code awsSysFileService.readHtmlFile}, using the file name,
 * bucket and folder supplied by the rule.
 *
 * <p>Best effort: any failure is logged (now with its stack trace) and an empty string is returned.
 *
 * @param ossPath        stored path of the cached object; not used by this method, the
 *                       location is taken from {@code crawlToOssRule}
 * @param crawlToOssRule supplies the file name, bucket and folder to read from
 * @return the value returned by the read call, or {@code ""} when the read throws
 */
private String ReadFromOss(String ossPath, CrawlToOssRule crawlToOssRule) {
    try {
        return awsSysFileService.readHtmlFile(crawlToOssRule.getFileName(), crawlToOssRule.getBucket(),
                crawlToOssRule.getFolder());
    } catch (Exception e) {
        // Pass the exception itself so the cause is not reduced to its message.
        log.error("Read from oss error: {}", e.getMessage(), e);
        return "";
    }
}
/**
 * Uploads the crawled HTML and then writes the matching {@link CrawlCache} row.
 *
 * <p>When {@code crawlId} is given, the existing row with that id is updated if it can be found;
 * in every other case a new row is saved. All row values come from {@code crawlToOssRule}.
 *
 * @param html           crawled page content
 * @param crawlId        id of a previously stored cache row, or {@code null} when there is none
 * @param crawlToOssRule supplies the URL, crawl date, OSS path and cache days for the row
 * @throws IOException propagated from {@code saveHtmlToOss}
 */
private void SaveCacheHtml(String html, Long crawlId, CrawlToOssRule crawlToOssRule) throws IOException {
    // Upload first: when the upload throws, no cache row is written.
    saveHtmlToOss(html, crawlToOssRule);

    final CrawlCache cacheRow;
    if (crawlId == null) {
        cacheRow = new CrawlCache();
    } else {
        cacheRow = crawlCacheRepository.findById(crawlId).orElse(new CrawlCache());
    }

    cacheRow.setCrawlKey(crawlToOssRule.getUrl());
    cacheRow.setCrawlDate(crawlToOssRule.getCrawlDate());
    cacheRow.setOssPath(crawlToOssRule.getOssPath());
    cacheRow.setCacheDays(crawlToOssRule.getCacheDays());
    crawlCacheRepository.save(cacheRow);
}
/**
 * Uploads the crawled HTML through {@code awsSysFileService.uploadHtmlFile}, passing the file
 * name, bucket and folder taken from the rule, the HTML itself, and the rule's URL.
 *
 * @param html           crawled page content to upload
 * @param crawlToOssRule supplies the file name, bucket, folder and URL arguments
 * @throws IOException presumably raised by the upload call - confirm against AwsSysFileService
 */
private void saveHtmlToOss(String html, CrawlToOssRule crawlToOssRule) throws IOException {
this.awsSysFileService.uploadHtmlFile(
crawlToOssRule.getFileName(),
crawlToOssRule.getBucket(),
crawlToOssRule.getFolder(),
html,
crawlToOssRule.getUrl());
}
public List<VisaCheckeeData> parseCheckeeVisa(String crawlCheckeeVisaList) {
Document doc = Jsoup.parse(crawlCheckeeVisaList);
Elements tables = doc.select("table");

View File

@ -1,6 +1,8 @@
package com.northtecom.visatrack.api.service.impl;
import com.northtecom.visatrack.api.base.data.BaseEntity;
import com.northtecom.visatrack.api.base.exception.BaseException;
import com.northtecom.visatrack.api.base.util.Action;
import com.northtecom.visatrack.api.base.web.Status;
import com.northtecom.visatrack.api.controller.vo.VisaCaseSearch;
import com.northtecom.visatrack.api.controller.vo.VisaSubmitRequest;
@ -8,16 +10,20 @@ import com.northtecom.visatrack.api.controller.vo.VisaTrackUserDetail;
import com.northtecom.visatrack.api.data.entity.CaseVisaReport;
import com.northtecom.visatrack.api.data.entity.User;
import com.northtecom.visatrack.api.data.entity.VisaCase;
import com.northtecom.visatrack.api.data.entity.VisaCheckeeCrawlData;
import com.northtecom.visatrack.api.data.repository.CaseVisaReportRepository;
import com.northtecom.visatrack.api.data.repository.UserRepository;
import com.northtecom.visatrack.api.data.repository.VisaCaseRepository;
import com.northtecom.visatrack.api.data.repository.VisaCheckeeCrawlDataRepository;
import com.northtecom.visatrack.api.data.spec.DateRange;
import com.northtecom.visatrack.api.data.spec.VisaCaseSpecification;
import com.northtecom.visatrack.api.service.dto.CaseAvgWaitDayReport;
import com.northtecom.visatrack.api.service.dto.CaseStatusSummaryReport;
import com.northtecom.visatrack.api.service.dto.CrawlHtml;
import com.northtecom.visatrack.api.service.dto.VisaReportCheckeeData;
import com.northtecom.visatrack.api.service.enums.VisaEntry;
import com.northtecom.visatrack.api.service.enums.VisaStatus;
import com.northtecom.visatrack.api.service.rules.CrawlToOssRule;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.NotNull;
import org.springframework.beans.factory.annotation.Autowired;
@ -29,7 +35,6 @@ import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.security.core.Authentication;
import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
import java.time.Instant;
@ -41,6 +46,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
@ -56,14 +62,19 @@ public class VisaCaseService {
private final VisaCaseRepository visaCaseRepository;
private final CaseVisaReportRepository caseVisaReportRepository;
private final UserRepository userRepository;
private final CrawlService crawlService;
private final VisaCheckeeCrawlDataRepository visaCheckeeCrawlDataRepository;
@Autowired
public VisaCaseService(VisaCaseRepository visaCaseRepository, VisaReportCheckeeService visaReportCheckeeService,
CaseVisaReportRepository caseVisaReportRepository, UserRepository userRepository) {
public VisaCaseService(VisaCaseRepository visaCaseRepository,
CaseVisaReportRepository caseVisaReportRepository, UserRepository userRepository,
CrawlService crawlService, VisaCheckeeCrawlDataRepository visaCheckeeCrawlDataRepository) {
this.visaCaseRepository = visaCaseRepository;
this.caseVisaReportRepository = caseVisaReportRepository;
this.userRepository = userRepository;
this.crawlService = crawlService;
this.visaCheckeeCrawlDataRepository = visaCheckeeCrawlDataRepository;
}
@ -213,7 +224,6 @@ public class VisaCaseService {
}
@Transactional
private void saveCaseVisaReport(VisaReportCheckeeData visaReportCheckeeData) {
Optional<CaseVisaReport> caseVisaReport =
caseVisaReportRepository.findByMonth(visaReportCheckeeData.getMonth());
@ -406,4 +416,171 @@ public class VisaCaseService {
return visaCaseRepository.queryFavouredVisaCaseByUserId(userDetail.getId());
}
/**
 * Imports crawled checkee rows that have no visa case yet.
 *
 * <p>Steps: load the new rows from the repository, fetch and store their detail data, pass
 * each row to {@code saveCheckeeVisaDataToVisa}, and batch-save every non-null result.
 */
public void importNewCrawlData() {
    List<VisaCheckeeCrawlData> newCrawlData =
            this.visaCheckeeCrawlDataRepository.QueryNewCrawlData();

    getCrawlDataDetail(newCrawlData);

    final int total = newCrawlData.size();
    List<VisaCase> visaCaseList = new ArrayList<>();
    for (int i = 0; i < total; i++) {
        VisaCase savedVisaCase = saveCheckeeVisaDataToVisa(newCrawlData.get(i));
        if (savedVisaCase != null) {
            visaCaseList.add(savedVisaCase);
        }
        // Progress is "current/total"; the second value used to be the saved entity itself.
        log.info("Crawl checkee visa data: {}/{}", i + 1, total);
    }

    log.info("Start to batch save crawl checkee visa data total: {}", visaCaseList.size());
    BatchSaveVisaCase(visaCaseList);
}
/**
 * Enriches every given crawl record with the content of its checkee detail page and update page,
 * then persists all records in batches of 50.
 * <p>
 * Each page is crawled and parsed independently; a failure on one page is logged and processing
 * continues with the next page / record.
 *
 * @param notDetailData crawl records to enrich and save
 */
private void getCrawlDataDetail(List<VisaCheckeeCrawlData> notDetailData) {
    log.info("Start to save crawl checkee not detail visa data total: {}", notDetailData.size());

    int position = 0;
    for (VisaCheckeeCrawlData crawlRecord : notDetailData) {
        position++;
        log.info("Start to save crawl checkee not detail visa data: {}/{}", position, notDetailData.size());

        // Personal detail page.
        try {
            String detailUrl = CrawlToOssRule.URL_CHECKEE_VISA_DETAIL + crawlRecord.getCaseNum();
            CrawlHtml detailPage = this.crawlService.crawlHtml(detailUrl);
            this.crawlService.parseCheckeeVisaDetailDataAndFill(detailPage.getHtml(), crawlRecord);
        } catch (Exception detailFailure) {
            log.error("Crawl checkee visa detail error: caseNum : {} error: {}", crawlRecord.getCaseNum(),
                    detailFailure.getMessage());
        }

        // Update page.
        try {
            String updateUrl = CrawlToOssRule.URL_CHECKEE_VISA_UPDATE + crawlRecord.getCaseNum();
            CrawlHtml updatePage = this.crawlService.crawlHtml(updateUrl);
            this.crawlService.parseCheckeeVisaUpdateDataAndFill(updatePage.getHtml(), crawlRecord);
        } catch (Exception updateFailure) {
            log.error("Crawl checkee visa update email error: caseNum : {} error: {}",
                    crawlRecord.getCaseNum(), updateFailure.getMessage());
        }
    }

    log.info("End to save crawl checkee not detail visa data total: {}", notDetailData.size());

    batchSaveDataItems(
            notDetailData,
            50,
            visaCheckeeCrawlDataRepository::saveAllAndFlush,
            visaCheckeeCrawlDataRepository::saveAndFlush,
            visaCheckeeCrawlDataRepository::flush);
}
/**
 * Saves the given items in chunks of at most {@code batchSize} elements.
 * <p>
 * Items are collected into a reusable buffer; whenever the buffer reaches {@code batchSize} it is handed
 * to {@code batchSaveDataItem}, which empties it again. A final partial chunk is saved after the loop.
 *
 * @param saveItems    items to persist
 * @param batchSize    number of items per chunk
 * @param saveFunction saves a whole chunk at once
 * @param saveSingle   saves one item; passed through to {@code batchSaveDataItem}
 * @param flushAction  passed through to {@code batchSaveDataItem}
 */
private <T extends BaseEntity<Long>> void batchSaveDataItems(List<T> saveItems, Integer batchSize,
        Function<List<T>, List<T>> saveFunction,
        Function<T, T> saveSingle, Action flushAction) {
    List<T> pending = new ArrayList<>();

    for (T item : saveItems) {
        pending.add(item);
        if (pending.size() < batchSize) {
            continue;
        }
        batchSaveDataItem(pending, saveFunction, saveSingle, flushAction);
    }

    // Whatever did not fill a complete chunk.
    if (!pending.isEmpty()) {
        batchSaveDataItem(pending, saveFunction, saveSingle, flushAction);
    }
}
/**
 * Saves one chunk of items and always empties the chunk afterwards.
 * <p>
 * The chunk is first saved as a whole. If that fails, the failure is logged, {@code flushAction} is
 * invoked and every item is saved individually so that only the items that still fail are lost;
 * each individual failure is logged together with the item id.
 *
 * @param batchItems  chunk to save; cleared before this method returns
 * @param saveAll     saves the whole chunk at once
 * @param saveSingle  saves a single item (fallback path)
 * @param flushAction invoked once before the per-item fallback starts
 */
private <T extends BaseEntity<Long>> void batchSaveDataItem(List<T> batchItems, Function<List<T>, List<T>> saveAll,
        Function<T, T> saveSingle, Action flushAction) {
    try {
        saveAll.apply(batchItems);
    } catch (Exception batchFailure) {
        log.error("Batch save items error: {}", batchFailure.getMessage());
        flushAction.accept();

        batchItems.forEach(item -> {
            try {
                saveSingle.apply(item);
            } catch (Exception itemFailure) {
                log.error("Save item error: {},Id is : {}", itemFailure.getMessage(),
                        item.getId());
            }
        });
    } finally {
        batchItems.clear();
    }
}
/**
 * Saves the given visa cases in chunks of at most 50 elements.
 * <p>
 * Cases are collected into a reusable buffer that {@code batchSave} empties after each call;
 * a trailing partial chunk is saved after the loop.
 *
 * @param visaCaseList visa cases to persist
 */
private void BatchSaveVisaCase(List<VisaCase> visaCaseList) {
    List<VisaCase> pending = new ArrayList<>();

    for (VisaCase visaCase : visaCaseList) {
        pending.add(visaCase);
        if (pending.size() < 50) {
            continue;
        }
        batchSave(pending);
    }

    if (!pending.isEmpty()) {
        batchSave(pending);
    }
}
/**
 * Saves one chunk of visa cases and always empties the chunk afterwards.
 * <p>
 * The chunk is first saved with a single {@code saveAll}. If that fails, the failure is logged and each
 * case is saved on its own; individual failures are logged with the referenced crawl case number.
 *
 * @param batchItems chunk to save; cleared before this method returns
 */
private void batchSave(List<VisaCase> batchItems) {
    try {
        visaCaseRepository.saveAll(batchItems);
    } catch (Exception batchFailure) {
        log.error("Batch save visa case error: {}", batchFailure.getMessage());

        batchItems.forEach(item -> {
            try {
                visaCaseRepository.save(item);
            } catch (Exception itemFailure) {
                log.error("Save visa case error: {},Case # : {}", itemFailure.getMessage(),
                        item.getRefCrawlCaseNumber());
            }
        });
    } finally {
        batchItems.clear();
    }
}
/**
 * Maps a crawled checkee record onto a {@link VisaCase}.
 * <p>
 * When the record has a case number and a visa case with that reference already exists, the existing
 * case is updated; otherwise a new {@link VisaCase} is populated. The case is only populated here,
 * it is not persisted by this method.
 * <p>
 * A visa entry that cannot be mapped to {@link VisaEntry} is logged and replaced by
 * {@link VisaEntry#New}. The status conversion is not guarded: an exception from
 * {@code VisaStatus.valueOf} propagates to the caller.
 *
 * @param visaCheckeeCrawlData crawled record to convert
 * @return the populated (new or existing) visa case
 */
public VisaCase saveCheckeeVisaDataToVisa(VisaCheckeeCrawlData visaCheckeeCrawlData) {
    VisaCase saveVisaCase = new VisaCase();

    if (StringUtils.hasText(visaCheckeeCrawlData.getCaseNum())) {
        // Reuse the existing case for this crawl case number, if any.
        saveVisaCase = visaCaseRepository.findByRefCaseNum(visaCheckeeCrawlData.getCaseNum())
                .orElse(saveVisaCase);
    }

    saveVisaCase.setUserName(visaCheckeeCrawlData.getUserId());
    saveVisaCase.setUserEmail(visaCheckeeCrawlData.getPartEmail());
    saveVisaCase.setVisaCategory(visaCheckeeCrawlData.getVisaType());
    saveVisaCase.setVisaStatus(VisaStatus.valueOf(visaCheckeeCrawlData.getStatus()));
    saveVisaCase.setEmbassyConsulate(visaCheckeeCrawlData.getConsulate());
    try {
        saveVisaCase.setVisaEntry(VisaEntry.valueOf(visaCheckeeCrawlData.getVisaEntry()));
    } catch (Exception e) {
        // Report the value that actually failed to parse (the visa entry, not the check date).
        log.error("Parse visa entry error: {}, error: {}", visaCheckeeCrawlData.getVisaEntry(), e.getMessage());
        saveVisaCase.setVisaEntry(VisaEntry.New);
    }
    saveVisaCase.setMajor(visaCheckeeCrawlData.getMajor());
    saveVisaCase.setDateVisaInterview(visaCheckeeCrawlData.getCheckDate());
    saveVisaCase.setDateVisaCheckCompleted(visaCheckeeCrawlData.getCompleteDate());
    saveVisaCase.setNote(visaCheckeeCrawlData.getNote());
    saveVisaCase.setRefCrawlCaseNumber(visaCheckeeCrawlData.getCaseNum());
    saveVisaCase.setCrawled(true);
    saveVisaCase.setCrawledTime(visaCheckeeCrawlData.getCrawlTime());

    return saveVisaCase;
}
}

View File

@ -12,11 +12,8 @@ import com.northtecom.visatrack.api.data.repository.VisaCaseRepository;
import com.northtecom.visatrack.api.data.repository.VisaCheckeeCrawlDataRepository;
import com.northtecom.visatrack.api.data.spec.DateRange;
import com.northtecom.visatrack.api.service.dto.*;
import com.northtecom.visatrack.api.service.enums.VisaEntry;
import com.northtecom.visatrack.api.service.enums.VisaStatus;
import lombok.SneakyThrows;
import com.northtecom.visatrack.api.service.rules.CrawlToOssRule;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ObjectUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
@ -46,16 +43,19 @@ public class VisaReportCheckeeService {
private final VisaCheckeeCrawlDataRepository visaCheckeeCrawlDataRepository;
private final VisaCaseRepository visaCaseRepository;
private final VisaCaseService visaCaseService;
@Autowired
public VisaReportCheckeeService(CrawlService crawlService,
CaseVisaReportRepository caseVisaReportRepository,
VisaCheckeeCrawlDataRepository visaCheckeeCrawlDataRepository,
VisaCaseRepository visaCaseRepository) {
VisaCaseRepository visaCaseRepository, VisaCaseService visaCaseService) {
this.crawlService = crawlService;
this.caseVisaReportRepository = caseVisaReportRepository;
this.visaCheckeeCrawlDataRepository = visaCheckeeCrawlDataRepository;
this.visaCaseRepository = visaCaseRepository;
this.visaCaseService = visaCaseService;
}
//Iterable List
@ -65,17 +65,6 @@ public class VisaReportCheckeeService {
return list;
}
/**
 * Crawls the checkee report page, parses it and stores the parsed report rows.
 * Does nothing when the crawl yields no content.
 */
@SneakyThrows
public void crawlCheckeeReport() {
    CrawlHtml reportPage = this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_REPORT);
    if (ObjectUtils.isEmpty(reportPage)) {
        return;
    }

    List<VisaReportCheckeeData> parsedRows = this.crawlService.parseCheckeeReport(reportPage.getHtml());
    log.info("Crawl checkee report size: {}", parsedRows.size());
    saveCheckeeReport(parsedRows);
}
private void saveCheckeeReport(List<VisaReportCheckeeData> reportCheckeeData) {
if (reportCheckeeData.size() == 0) {
@ -142,7 +131,7 @@ public class VisaReportCheckeeService {
CrawlHtml crawlCheckeeVisaList =
this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_REPORT_DETAIL + visaCrawlRequest.getDateKey());
this.crawlService.crawlHtml(CrawlToOssRule.URL_CHECKEE_REPORT_DETAIL + visaCrawlRequest.getDateKey());
List<VisaCheckeeData> visaCheckeeCrawlCrawlDataList =
this.crawlService.parseCheckeeVisa(crawlCheckeeVisaList.getHtml());
@ -172,7 +161,7 @@ public class VisaReportCheckeeService {
if (!hasPersonalDetailData && visaCrawlRequest.getCrawlDetail()) {
try {
CrawlHtml crawlCheckeeVisaDetail =
this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_VISA_DETAIL + visaCheckeeData.getCaseNum());
this.crawlService.crawlHtml(CrawlToOssRule.URL_CHECKEE_VISA_DETAIL + visaCheckeeData.getCaseNum());
this.crawlService.parseCheckeeVisaDetailDataAndFill(crawlCheckeeVisaDetail.getHtml(),
visaCheckeeData);
} catch (Exception e) {
@ -182,7 +171,7 @@ public class VisaReportCheckeeService {
try {
CrawlHtml crawlCheckeeVisaUpdate =
this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_VISA_UPDATE + visaCheckeeData.getCaseNum());
this.crawlService.crawlHtml(CrawlToOssRule.URL_CHECKEE_VISA_UPDATE + visaCheckeeData.getCaseNum());
this.crawlService.parseCheckeeVisaUpdateDataAndFill(crawlCheckeeVisaUpdate.getHtml(),
visaCheckeeData);
} catch (Exception e) {
@ -203,6 +192,7 @@ public class VisaReportCheckeeService {
}
log.info("Start to batch save crawl checkee visa data total: {}", allSavedVisaCaseList.size());
visaCheckeeCrawlDataRepository.saveAll(allSavedVisaCaseList);
}
@ -281,65 +271,11 @@ public class VisaReportCheckeeService {
return saveVisaCheckeeCrawlData;
}
/**
 * Stores one parsed checkee visa row as crawl data.
 * <p>
 * The target record is looked up by case number, first in the already loaded list and then in the
 * repository; if neither has it, a new record is created. Records crawled less than one hour before
 * {@code crawlTime} are left untouched. Save failures are logged and not rethrown.
 *
 * @param visaCheckeeData                parsed row to store
 * @param visaReportCheckeeCrawlDataList crawl records that are already loaded
 * @param crawlTime                      time of the current crawl
 * @param month                          month key the row belongs to
 */
private void saveCheckeeVisa(VisaCheckeeData visaCheckeeData,
        List<VisaCheckeeCrawlData> visaReportCheckeeCrawlDataList, Date crawlTime,
        String month) {

    Optional<VisaCheckeeCrawlData> alreadyLoaded = visaReportCheckeeCrawlDataList.stream()
            .filter(candidate -> candidate.getCaseNum().equals(visaCheckeeData.getCaseNum()))
            .findFirst();

    VisaCheckeeCrawlData target;
    if (alreadyLoaded.isPresent()) {
        target = alreadyLoaded.get();
    } else {
        target = visaCheckeeCrawlDataRepository.findByCaseNum(visaCheckeeData.getCaseNum())
                .orElseGet(VisaCheckeeCrawlData::new);
    }

    // Skip the update when the record was crawled less than one hour ago.
    boolean crawledWithinLastHour = target.getCrawlTime() != null
            && DateUtil.offsetHour(target.getCrawlTime(), 1).after(crawlTime);
    if (crawledWithinLastHour) {
        return;
    }

    // Crawl bookkeeping.
    target.setCrawlTime(crawlTime);
    target.setMonth(month);

    // Case data from the list page.
    target.setCaseNum(visaCheckeeData.getCaseNum());
    target.setUserId(visaCheckeeData.getUserId());
    target.setVisaType(visaCheckeeData.getVisaType());
    target.setVisaEntry(visaCheckeeData.getVisaEntry());
    target.setConsulate(visaCheckeeData.getConsulate());
    target.setMajor(visaCheckeeData.getMajor());
    target.setStatus(visaCheckeeData.getStatus());
    target.setCheckDate(visaCheckeeData.getCheckDate());
    target.setCompleteDate(visaCheckeeData.getCompleteDate());
    target.setNote(visaCheckeeData.getNote());

    // Personal data.
    target.setPartEmail(visaCheckeeData.getPartEmail());
    target.setFirstName(visaCheckeeData.getFirstName());
    target.setLastName(visaCheckeeData.getLastName());
    target.setUniversity(visaCheckeeData.getUniversity());
    target.setDegree(visaCheckeeData.getDegree());
    target.setJobTitle(visaCheckeeData.getJobTitle());
    target.setEmployer(visaCheckeeData.getEmployer());
    target.setYearsInUsa(visaCheckeeData.getYearsInUsa());
    target.setCountry(visaCheckeeData.getCountry());

    try {
        visaCheckeeCrawlDataRepository.saveAndFlush(target);
    } catch (Exception saveFailure) {
        log.error("Save checkee visa error: caseNum : {} error: {}", visaCheckeeData.getCaseNum(),
                saveFailure.getMessage());
    }
}
public void importCheckeeVisaCrawlData(LocalDate startDate, LocalDate endDate) {
log.info("Start to import checkee visa crawl data from {} to {}", startDate, endDate);
List<VisaCheckeeCrawlData> visaCheckeeCrawlDataList =
visaCheckeeCrawlDataRepository.findAllNotImportData(startDate, endDate);
@ -349,7 +285,7 @@ public class VisaReportCheckeeService {
log.info("Start to import checkee visa data: {}/{}", i + 1, visaCheckeeCrawlDataList.size());
try {
batchItems.add(saveCheckeeVisaDataToVisa(visaCheckeeCrawlDataList.get(i)));
batchItems.add(visaCaseService.saveCheckeeVisaDataToVisa(visaCheckeeCrawlDataList.get(i)));
} catch (Exception e) {
log.error("Save checkee visa data to visa error: {},case# {}", e.getMessage(),
visaCheckeeCrawlDataList.get(i).getCaseNum());
@ -385,41 +321,15 @@ public class VisaReportCheckeeService {
}
/**
 * Maps a crawled checkee record onto a {@link VisaCase}.
 * <p>
 * When the record has a case number and a visa case with that reference already exists, the existing
 * case is updated; otherwise a new {@link VisaCase} is populated. The case is only populated here,
 * it is not persisted by this method.
 * <p>
 * A visa entry that cannot be mapped to {@link VisaEntry} is logged and replaced by
 * {@link VisaEntry#New}. The status conversion is not guarded: an exception from
 * {@code VisaStatus.valueOf} propagates to the caller.
 *
 * @param visaCheckeeCrawlData crawled record to convert
 * @return the populated (new or existing) visa case
 */
private VisaCase saveCheckeeVisaDataToVisa(VisaCheckeeCrawlData visaCheckeeCrawlData) {
    VisaCase saveVisaCase = new VisaCase();

    if (StringUtils.hasText(visaCheckeeCrawlData.getCaseNum())) {
        // Reuse the existing case for this crawl case number, if any.
        saveVisaCase = visaCaseRepository.findByRefCaseNum(visaCheckeeCrawlData.getCaseNum())
                .orElse(saveVisaCase);
    }

    saveVisaCase.setUserName(visaCheckeeCrawlData.getUserId());
    saveVisaCase.setUserEmail(visaCheckeeCrawlData.getPartEmail());
    saveVisaCase.setVisaCategory(visaCheckeeCrawlData.getVisaType());
    saveVisaCase.setVisaStatus(VisaStatus.valueOf(visaCheckeeCrawlData.getStatus()));
    saveVisaCase.setEmbassyConsulate(visaCheckeeCrawlData.getConsulate());
    try {
        saveVisaCase.setVisaEntry(VisaEntry.valueOf(visaCheckeeCrawlData.getVisaEntry()));
    } catch (Exception e) {
        // Report the value that actually failed to parse (the visa entry, not the check date).
        log.error("Parse visa entry error: {}, error: {}", visaCheckeeCrawlData.getVisaEntry(), e.getMessage());
        saveVisaCase.setVisaEntry(VisaEntry.New);
    }
    saveVisaCase.setMajor(visaCheckeeCrawlData.getMajor());
    saveVisaCase.setDateVisaInterview(visaCheckeeCrawlData.getCheckDate());
    saveVisaCase.setDateVisaCheckCompleted(visaCheckeeCrawlData.getCompleteDate());
    saveVisaCase.setNote(visaCheckeeCrawlData.getNote());
    saveVisaCase.setRefCrawlCaseNumber(visaCheckeeCrawlData.getCaseNum());
    saveVisaCase.setCrawled(true);
    saveVisaCase.setCrawledTime(visaCheckeeCrawlData.getCrawlTime());

    return saveVisaCase;
}
/**
 * Loads the crawl records returned by {@code QueryNotDetail()} and runs the detail crawl for them.
 */
public void crawlNotDetailData() {
    getCrawlDataDetail(this.visaCheckeeCrawlDataRepository.QueryNotDetail());
}
private void getCrawlDataDetail(List<VisaCheckeeCrawlData> notDetailData) {
log.info("Start to save crawl checkee not detail visa data total: {}", notDetailData.size());
for (int i = 0; i < notDetailData.size(); i++) {
@ -429,7 +339,7 @@ public class VisaReportCheckeeService {
try {
CrawlHtml crawlCheckeeVisaDetail =
this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_VISA_DETAIL + visaCheckeeCrawlData.getCaseNum());
this.crawlService.crawlHtml(CrawlToOssRule.URL_CHECKEE_VISA_DETAIL + visaCheckeeCrawlData.getCaseNum());
this.crawlService.parseCheckeeVisaDetailDataAndFill(crawlCheckeeVisaDetail.getHtml(),
visaCheckeeCrawlData);
} catch (Exception e) {
@ -439,7 +349,7 @@ public class VisaReportCheckeeService {
try {
CrawlHtml crawlCheckeeVisaUpdate =
this.crawlService.crawlHtmlByCrawlUrl(CrawlService.URL_CHECKEE_VISA_UPDATE + visaCheckeeCrawlData.getCaseNum());
this.crawlService.crawlHtml(CrawlToOssRule.URL_CHECKEE_VISA_UPDATE + visaCheckeeCrawlData.getCaseNum());
this.crawlService.parseCheckeeVisaUpdateDataAndFill(crawlCheckeeVisaUpdate.getHtml(),
visaCheckeeCrawlData);
} catch (Exception e) {
@ -524,4 +434,48 @@ public class VisaReportCheckeeService {
return caseStatusSummaryReportList;
}
/**
 * Rebuilds all report data.
 * <p>
 * Regenerates the report for the fixed window that starts on 2018-10-01 and ends today.
 */
public void rebuildAllReport() {
    LocalDate from = LocalDate.of(2018, 10, 1);
    LocalDate to = LocalDate.now();

    log.info("Start to rebuild all report data from {} to {}", from, to);
    this.generateReport(from, to);
}
/**
 * Crawls the checkee visa data, including details, for every month key of the given date range.
 * <p>
 * Months are processed one after another; a failure for one month is logged and the remaining
 * months are still processed.
 *
 * @param startDate start of the range to sync
 * @param endDate   end of the range to sync
 */
public void syncVisaDataFromCheckee(LocalDate startDate, LocalDate endDate) {
    log.info("Start to sync visa data from checkee from {} to {}", startDate, endDate);

    List<String> monthKeys = DateRange.of(startDate, endDate).getMonthKeys();

    log.info("Start to sync visa data from checkee");
    monthKeys.forEach(monthKey -> {
        log.info("sync [{}] data", monthKey);
        try {
            VisaCrawlRequest request = new VisaCrawlRequest();
            request.setDateKey(monthKey);
            request.setCrawlDetail(true);
            this.crawlCheckeeVisa(request);
        } catch (Exception monthFailure) {
            log.error("sync [{}] data error", monthKey, monthFailure);
        }
    });
    log.info("End to sync visa data from checkee");
}
/**
 * Runs the full pipeline: sync the crawl data from checkee, import it into visa cases and rebuild
 * all reports.
 * <p>
 * NOTE(review): the parameters are declared as {@code (endDate, startDate)}, the reverse of the
 * {@code (startDate, endDate)} order used by the methods called here — callers must take care not to
 * swap the arguments.
 *
 * @param endDate   end of the range; the import step is called with {@code endDate + 1 day}
 * @param startDate start of the range
 */
public void syncDataAndReport(LocalDate endDate, LocalDate startDate) {
    syncVisaDataFromCheckee(startDate, endDate);

    LocalDate importEndDate = endDate.plusDays(1);
    importCheckeeVisaCrawlData(startDate, importEndDate);

    rebuildAllReport();
}
}

View File

@ -0,0 +1,104 @@
package com.northtecom.visatrack.api.service.rules;
import cn.hutool.core.date.DateUtil;
import lombok.Data;
import java.time.LocalDate;
import java.util.Date;
/**
 * Describes, for one checkee URL, the OSS path of its crawled copy ({@code bucket/folder/fileName})
 * and the number of days ({@code cacheDays}) after which a previous crawl counts as expired.
 * <p>
 * Only the four checkee URL families declared as constants below are supported; any other URL is
 * rejected by the constructor.
 *
 * @Author: XieYang
 * @Date: 2022/10/31/9:29
 */
@Data
public class CrawlToOssRule {
    public static final String URL_CHECKEE_REPORT = "https://www.checkee.info/index.php";
    public static final String URL_CHECKEE_REPORT_DETAIL = "https://www.checkee.info/main.php?dispdate=";
    public static final String URL_CHECKEE_VISA_DETAIL = "https://checkee.info/personal_detail.php?casenum=";
    public static final String URL_CHECKEE_VISA_UPDATE = "https://checkee.info/update.php?casenum=";

    private String folder;
    private String bucket;
    private String fileName;
    private String url;
    private String ossPath;
    private Integer cacheDays;
    private Date crawlDate;

    /**
     * Derives folder, file name, cache duration and OSS path from the given URL.
     *
     * @param bucket bucket name used as the first segment of the OSS path
     * @param url    URL to crawl; must start with one of the {@code URL_CHECKEE_*} constants
     * @throws IllegalArgumentException if the URL does not belong to a supported checkee URL family
     */
    public CrawlToOssRule(String bucket, String url) {
        this.crawlDate = new Date();
        this.bucket = bucket;
        this.url = url;

        if (this.url.startsWith(URL_CHECKEE_REPORT)) {
            this.folder = "index";
            this.fileName = "index.html";
        } else if (this.url.startsWith(URL_CHECKEE_REPORT_DETAIL)) {
            this.folder = "report/month";
            this.fileName = String.format("report_detail_%s.html",
                    this.url.substring(URL_CHECKEE_REPORT_DETAIL.length()));
        } else if (this.url.startsWith(URL_CHECKEE_VISA_DETAIL)) {
            this.folder = "detail";
            this.fileName = String.format("visa_detail_%s.html", this.url.substring(URL_CHECKEE_VISA_DETAIL.length()));
        } else if (this.url.startsWith(URL_CHECKEE_VISA_UPDATE)) {
            this.folder = "update";
            this.fileName = String.format("visa_update_%s.html", this.url.substring(URL_CHECKEE_VISA_UPDATE.length()));
        } else {
            // Without a folder and file name no OSS path can be built; fail with a clear message
            // instead of a NullPointerException further down in buildOssPath.
            throw new IllegalArgumentException("Unsupported crawl url: " + url);
        }

        this.cacheDays = calculateCrawlCacheDay(this.url);
        this.ossPath = buildOssPath(this.bucket, this.folder, this.fileName);
    }

    /**
     * Static factory equivalent to {@link #CrawlToOssRule(String, String)}.
     *
     * @param defaultBucketName bucket name used as the first segment of the OSS path
     * @param crawlUrl          URL to crawl
     * @return the rule for the given URL
     */
    public static CrawlToOssRule of(String defaultBucketName, String crawlUrl) {
        return new CrawlToOssRule(defaultBucketName, crawlUrl);
    }

    /**
     * Joins bucket, folder and file name into {@code bucket/folder/fileName}.
     * Slashes and backslashes are removed from the bucket and the file name; the folder keeps its inner
     * slashes (backslashes are converted) but loses one leading and one trailing slash.
     */
    private static String buildOssPath(String bucket, String folder, String fileName) {
        bucket = bucket.replace("/", "").replace("\\", "");

        folder = folder.replace("\\", "/");
        if (folder.startsWith("/")) {
            folder = folder.substring(1);
        }
        if (folder.endsWith("/")) {
            folder = folder.substring(0, folder.length() - 1);
        }

        fileName = fileName.replace("/", "").replace("\\", "");
        return String.format("%s/%s/%s", bucket, folder, fileName);
    }

    /**
     * Returns the number of days a crawled copy of the given URL stays valid.
     * <ul>
     *   <li>report index: 1 day</li>
     *   <li>visa detail / update pages: 365 days</li>
     *   <li>monthly report detail: depends on how many months the requested month lies in the past
     *       (up to 2 months: 1 day, up to 6: 7 days, up to 24: 30 days, up to 72: 365 days,
     *       older: 3 * 365 days)</li>
     *   <li>anything else: 3 days</li>
     * </ul>
     * The month of a report detail URL is the text after the last {@code =} and is parsed as
     * {@code <month>-01} with {@link LocalDate#parse(CharSequence)}; a value that does not parse
     * raises the parser's exception.
     */
    private static Integer calculateCrawlCacheDay(String crawlUrl) {
        if (crawlUrl.contains(URL_CHECKEE_REPORT)) {
            return 1;
        } else if (crawlUrl.contains(URL_CHECKEE_VISA_DETAIL) || crawlUrl.contains(URL_CHECKEE_VISA_UPDATE)) {
            return 365;
        } else if (crawlUrl.contains(URL_CHECKEE_REPORT_DETAIL)) {
            String month = crawlUrl.substring(crawlUrl.lastIndexOf("=") + 1);
            LocalDate localDate = LocalDate.parse(month + "-01");
            LocalDate now = LocalDate.now();

            // Whole calendar months between the requested month and the current month.
            int monthDiff =
                    now.getYear() * 12 + now.getMonthValue() - (localDate.getYear() * 12 + localDate.getMonthValue());

            if (monthDiff <= 2) {
                return 1;
            } else if (monthDiff <= 6) {
                return 7;
            } else if (monthDiff <= 24) {
                return 30;
            } else if (monthDiff <= 72) {
                return 365;
            } else {
                return 3 * 365;
            }
        } else {
            return 3;
        }
    }

    /**
     * Tells whether a copy crawled at {@code crawledDate} is expired relative to this rule's crawl date.
     *
     * @param crawledDate time of the previous crawl
     * @return {@code true} when {@code crawledDate + cacheDays} lies before this rule's {@code crawlDate}
     */
    public boolean checkIsExpired(Date crawledDate) {
        return DateUtil.offsetDay(crawledDate, this.cacheDays).before(crawlDate);
    }
}

View File

@ -1,6 +1,6 @@
server:
ssl:
key-store: classpath:keystore.p12
key-store-password: 123456
key-store-type: PKCS12
key-alias: tomcat
#server:
# ssl:
# key-store: classpath:keystore.p12
# key-store-password: 123456
# key-store-type: PKCS12
# key-alias: tomcat

View File

@ -31,7 +31,7 @@ spring:
application:
name: usvisatrack
title: Us Visa Track API
version: 1.1.7
version: 1.1.8
jackson:
mapper:
accept-case-insensitive-properties: true
@ -57,7 +57,7 @@ spring:
order_updates: true
mvc:
async:
request-timeout: 30000
request-timeout: 60000
thymeleaf:
prefix: classpath:/templates/