-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Feat/#172 크롤링 및 MQ publish 로직에 Spring Batch 적용
- Loading branch information
Showing
10 changed files
with
290 additions
and
128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
119 changes: 119 additions & 0 deletions
119
backend/core/src/main/java/com/rollthedice/backend/batch/BatchJobConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package com.rollthedice.backend.batch; | ||
|
||
import com.rollthedice.backend.batch.newsContentStep.PreSummarizedNewsDto; | ||
import com.rollthedice.backend.batch.newsContentStep.UncrawledNewsContentReader; | ||
import com.rollthedice.backend.batch.newsUrlStep.InitNewsDto; | ||
import com.rollthedice.backend.batch.newsUrlStep.NewsUrlReader; | ||
import com.rollthedice.backend.domain.news.contentqueue.ContentProducer; | ||
import com.rollthedice.backend.domain.news.dto.ContentMessageDto; | ||
import com.rollthedice.backend.domain.news.repository.NewsRepository; | ||
import lombok.RequiredArgsConstructor; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.springframework.batch.core.Job; | ||
import org.springframework.batch.core.Step; | ||
import org.springframework.batch.core.configuration.annotation.JobScope; | ||
import org.springframework.batch.core.configuration.annotation.StepScope; | ||
import org.springframework.batch.core.job.builder.JobBuilder; | ||
import org.springframework.batch.core.launch.support.RunIdIncrementer; | ||
import org.springframework.batch.core.repository.JobRepository; | ||
import org.springframework.batch.core.step.builder.StepBuilder; | ||
import org.springframework.batch.item.ItemProcessor; | ||
import org.springframework.batch.item.ItemReader; | ||
import org.springframework.batch.item.database.JdbcBatchItemWriter; | ||
import org.springframework.batch.item.database.builder.JdbcBatchItemWriterBuilder; | ||
import org.springframework.beans.factory.annotation.Value; | ||
import org.springframework.context.annotation.Bean; | ||
import org.springframework.context.annotation.Configuration; | ||
import org.springframework.transaction.PlatformTransactionManager; | ||
|
||
import javax.sql.DataSource; | ||
|
||
@Slf4j | ||
@Configuration | ||
@RequiredArgsConstructor | ||
public class BatchJobConfig { | ||
|
||
@Value("${batch.chunk-size}") | ||
private int chunkSize; | ||
|
||
private final DataSource dataSource; | ||
private final NewsRepository newsRepository; | ||
private final ContentProducer contentProducer; | ||
|
||
@Bean | ||
public Job scrapJob(JobRepository jobRepository, | ||
Step crawlingNewsUrlStep, Step crawlingNewsContentStep) { | ||
return new JobBuilder("scrapJob", jobRepository) | ||
.incrementer(new RunIdIncrementer()) | ||
.start(crawlingNewsUrlStep) | ||
.next(crawlingNewsContentStep) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@JobScope | ||
public Step crawlingNewsUrlStep(JobRepository jobRepository, | ||
PlatformTransactionManager transactionManager) { | ||
return new StepBuilder("crawlingNewsUrlStep", jobRepository) | ||
.allowStartIfComplete(true) | ||
.<InitNewsDto, InitNewsDto>chunk(30, transactionManager) | ||
.reader(newsUrlReader()) | ||
.writer(newsUrlWriter()) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemReader<InitNewsDto> newsUrlReader() { | ||
return new NewsUrlReader(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public JdbcBatchItemWriter<InitNewsDto> newsUrlWriter() { | ||
return new JdbcBatchItemWriterBuilder<InitNewsDto>() | ||
.dataSource(dataSource) | ||
.sql("insert into news(url, category) values (:url, :newsCategory)") | ||
.beanMapped() | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@JobScope | ||
public Step crawlingNewsContentStep(JobRepository jobRepository, | ||
PlatformTransactionManager transactionManager) { | ||
return new StepBuilder("crawlingNewsContentStep", jobRepository) | ||
.allowStartIfComplete(true) | ||
.<PreSummarizedNewsDto, PreSummarizedNewsDto>chunk(chunkSize, transactionManager) | ||
.reader(uncrawledNewsContentReader()) | ||
.processor(summarizeContentProcessor()) | ||
.writer(newsContentWriter()) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemReader<PreSummarizedNewsDto> uncrawledNewsContentReader() { | ||
return new UncrawledNewsContentReader(newsRepository); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemProcessor<PreSummarizedNewsDto, PreSummarizedNewsDto> summarizeContentProcessor() { | ||
return dto -> { | ||
contentProducer.sendMessage(new ContentMessageDto(dto.getId(), dto.getContent())); | ||
return dto; | ||
}; | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public JdbcBatchItemWriter<PreSummarizedNewsDto> newsContentWriter() { | ||
return new JdbcBatchItemWriterBuilder<PreSummarizedNewsDto>() | ||
.dataSource(dataSource) | ||
.sql("update news set title = :title, content = :content, post_date = :postDate" + | ||
" where id = :id") | ||
.beanMapped() | ||
.build(); | ||
} | ||
} |
14 changes: 14 additions & 0 deletions
14
...ore/src/main/java/com/rollthedice/backend/batch/newsContentStep/PreSummarizedNewsDto.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package com.rollthedice.backend.batch.newsContentStep; | ||
|
||
import lombok.*; | ||
|
||
@Getter | ||
@NoArgsConstructor | ||
@AllArgsConstructor | ||
@Builder | ||
public class PreSummarizedNewsDto { | ||
private Long id; | ||
private String title; | ||
private String content; | ||
private String postDate; | ||
} |
80 changes: 80 additions & 0 deletions
80
...c/main/java/com/rollthedice/backend/batch/newsContentStep/UncrawledNewsContentReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package com.rollthedice.backend.batch.newsContentStep; | ||
|
||
import com.rollthedice.backend.domain.news.entity.News; | ||
import com.rollthedice.backend.domain.news.repository.NewsRepository; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
import org.springframework.batch.item.ItemReader; | ||
|
||
import java.io.IOException; | ||
import java.util.Iterator; | ||
|
||
public class UncrawledNewsContentReader implements ItemReader<PreSummarizedNewsDto> { | ||
private final NewsRepository newsRepository; | ||
private Iterator<News> uncrawledNewsContents; | ||
|
||
public UncrawledNewsContentReader(NewsRepository newsRepository) { | ||
this.newsRepository = newsRepository; | ||
uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator(); | ||
} | ||
|
||
@Override | ||
public PreSummarizedNewsDto read() throws IOException { | ||
if (!hasNextUncrawledNews()) { | ||
return null; | ||
} | ||
News news = uncrawledNewsContents.next(); | ||
Document doc = Jsoup.connect(news.getUrl()).get(); | ||
return getNewsContent(news, doc); | ||
} | ||
|
||
private boolean hasNextUncrawledNews() { | ||
if (!uncrawledNewsContents.hasNext()) { | ||
uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator(); | ||
} | ||
return uncrawledNewsContents.hasNext(); | ||
} | ||
|
||
private PreSummarizedNewsDto getNewsContent(News news, Document doc) { | ||
return PreSummarizedNewsDto.builder() | ||
.id(news.getId()) | ||
.title(scrapTitle(doc)) | ||
.content(scrapContent(doc)) | ||
.postDate(scrapPostDate(doc)) | ||
.build(); | ||
} | ||
|
||
private String scrapTitle(final Document doc) { | ||
Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2"); | ||
if (titleElement == null) { | ||
titleElement = doc.selectFirst("#content > div.end_ct > div > h2"); | ||
} | ||
if (titleElement != null) { | ||
return titleElement.text(); | ||
} | ||
return null; | ||
} | ||
|
||
private String scrapContent(final Document doc) { | ||
Elements contentElements = doc.select("article#dic_area"); | ||
if (contentElements.isEmpty()) { | ||
contentElements = doc.select("#articeBody"); | ||
} | ||
return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", ""); | ||
} | ||
|
||
private String scrapPostDate(final Document doc) { | ||
Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span"); | ||
if (dateElement != null) { | ||
return dateElement.attr("data-date-time"); | ||
} else { | ||
Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em"); | ||
if (altDateElement != null) { | ||
return altDateElement.text(); | ||
} | ||
} | ||
return null; | ||
} | ||
} |
15 changes: 15 additions & 0 deletions
15
backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/InitNewsDto.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.rollthedice.backend.batch.newsUrlStep; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
import lombok.NoArgsConstructor; | ||
import lombok.Setter; | ||
|
||
@Getter | ||
@NoArgsConstructor | ||
@AllArgsConstructor | ||
@Setter | ||
public class InitNewsDto { | ||
private String newsCategory; | ||
private String url; | ||
} |
53 changes: 53 additions & 0 deletions
53
backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/NewsUrlReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package com.rollthedice.backend.batch.newsUrlStep; | ||
|
||
import com.rollthedice.backend.domain.news.entity.NewsCategory; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
import org.springframework.batch.item.ItemReader; | ||
import org.springframework.beans.factory.annotation.Value; | ||
|
||
import java.io.IOException; | ||
import java.util.*; | ||
import java.util.stream.Collectors; | ||
|
||
public class NewsUrlReader implements ItemReader<InitNewsDto> { | ||
@Value("${crawling.quantity}") | ||
private int crawlingQuantity; | ||
|
||
private final Iterator<NewsCategory> categories; | ||
private final Queue<InitNewsDto> initNews = new LinkedList<>(); | ||
|
||
public NewsUrlReader() { | ||
categories = Arrays.stream(NewsCategory.values()).collect(Collectors.toList()).iterator(); | ||
} | ||
|
||
@Override | ||
public InitNewsDto read() throws IOException { | ||
while (initNews.isEmpty() && categories.hasNext()) { | ||
NewsCategory category = categories.next(); | ||
initNews.addAll(scrapCategoryNews(category)); | ||
} | ||
return initNews.poll(); | ||
} | ||
|
||
private List<InitNewsDto> scrapCategoryNews(NewsCategory category) throws IOException { | ||
Document doc = Jsoup.connect(category.getCategoryUrl()).get(); | ||
Elements newsList = doc.select(".sa_list").select("li"); | ||
if (newsList.size() < crawlingQuantity) { | ||
return scrapNewsUrl(newsList.size(), newsList, category); | ||
} | ||
return scrapNewsUrl(crawlingQuantity, newsList, category); | ||
} | ||
|
||
private List<InitNewsDto> scrapNewsUrl(int quantity, Elements newsList, NewsCategory category) { | ||
List<InitNewsDto> urls = new ArrayList<>(); | ||
for (int i = 0; i < quantity; i++) { | ||
Element news = newsList.get(i); | ||
String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href"); | ||
urls.add(new InitNewsDto(category.getName(), url)); | ||
} | ||
return urls; | ||
} | ||
} |
Oops, something went wrong.