Skip to content

Commit

Permalink
Merge pull request #178 from tukcomCD2024/feat/#172-backend-scrap-batch
Browse files Browse the repository at this point in the history
Feat/#172 크롤링 및 MQ publish 로직에 Spring Batch 적용
  • Loading branch information
yeonjy authored Jul 30, 2024
2 parents e60c1d6 + 195ca12 commit 709b218
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 128 deletions.
2 changes: 2 additions & 0 deletions backend/core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-data-redis:2.3.1.RELEASE'
implementation 'org.springframework.boot:spring-boot-starter-webflux'
implementation 'com.fasterxml.jackson.core:jackson-core:2.17.0'
implementation 'org.springframework.boot:spring-boot-starter-batch'

implementation "com.querydsl:querydsl-jpa:${queryDslVersion}:jakarta"
annotationProcessor "com.querydsl:querydsl-apt:${queryDslVersion}:jakarta"
Expand All @@ -62,6 +63,7 @@ dependencies {
testImplementation 'org.springframework.security:spring-security-test'
testImplementation 'org.springframework.restdocs:spring-restdocs-mockmvc'
testImplementation 'io.rest-assured:rest-assured:5.1.1'
testImplementation 'org.springframework.batch:spring-batch-test'
}

tasks.named('bootBuildImage') {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package com.rollthedice.backend.batch;

import com.rollthedice.backend.batch.newsContentStep.PreSummarizedNewsDto;
import com.rollthedice.backend.batch.newsContentStep.UncrawledNewsContentReader;
import com.rollthedice.backend.batch.newsUrlStep.InitNewsDto;
import com.rollthedice.backend.batch.newsUrlStep.NewsUrlReader;
import com.rollthedice.backend.domain.news.contentqueue.ContentProducer;
import com.rollthedice.backend.domain.news.dto.ContentMessageDto;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.JobScope;
import org.springframework.batch.core.configuration.annotation.StepScope;
import org.springframework.batch.core.job.builder.JobBuilder;
import org.springframework.batch.core.launch.support.RunIdIncrementer;
import org.springframework.batch.core.repository.JobRepository;
import org.springframework.batch.core.step.builder.StepBuilder;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.database.JdbcBatchItemWriter;
import org.springframework.batch.item.database.builder.JdbcBatchItemWriterBuilder;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.transaction.PlatformTransactionManager;

import javax.sql.DataSource;

@Slf4j
@Configuration
@RequiredArgsConstructor
public class BatchJobConfig {

    /** Chunk size for the crawling steps, configured via {@code batch.chunk-size}. */
    @Value("${batch.chunk-size}")
    private int chunkSize;

    private final DataSource dataSource;
    private final NewsRepository newsRepository;
    private final ContentProducer contentProducer;

    /**
     * Two-step scraping job: step 1 collects news URLs per category,
     * step 2 crawls each article's content and publishes it to the
     * summarization queue before persisting.
     */
    @Bean
    public Job scrapJob(JobRepository jobRepository,
                        Step crawlingNewsUrlStep, Step crawlingNewsContentStep) {
        return new JobBuilder("scrapJob", jobRepository)
                .incrementer(new RunIdIncrementer())
                .start(crawlingNewsUrlStep)
                .next(crawlingNewsContentStep)
                .build();
    }

    /**
     * Step 1: read article URLs per category and insert bare news rows
     * (url + category, content left NULL for step 2).
     */
    @Bean
    @JobScope
    public Step crawlingNewsUrlStep(JobRepository jobRepository,
                                    PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsUrlStep", jobRepository)
                .allowStartIfComplete(true)
                // Use the configured chunk size consistently with
                // crawlingNewsContentStep (was hard-coded to 30).
                .<InitNewsDto, InitNewsDto>chunk(chunkSize, transactionManager)
                .reader(newsUrlReader())
                .writer(newsUrlWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<InitNewsDto> newsUrlReader() {
        return new NewsUrlReader();
    }

    /** Inserts the collected URLs; content stays NULL until step 2 fills it. */
    @Bean
    @StepScope
    public JdbcBatchItemWriter<InitNewsDto> newsUrlWriter() {
        return new JdbcBatchItemWriterBuilder<InitNewsDto>()
                .dataSource(dataSource)
                .sql("insert into news(url, category) values (:url, :newsCategory)")
                .beanMapped()
                .build();
    }

    /**
     * Step 2: read rows whose content is still NULL, crawl the article,
     * publish it for summarization, then update the row.
     */
    @Bean
    @JobScope
    public Step crawlingNewsContentStep(JobRepository jobRepository,
                                        PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsContentStep", jobRepository)
                .allowStartIfComplete(true)
                .<PreSummarizedNewsDto, PreSummarizedNewsDto>chunk(chunkSize, transactionManager)
                .reader(uncrawledNewsContentReader())
                .processor(summarizeContentProcessor())
                .writer(newsContentWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<PreSummarizedNewsDto> uncrawledNewsContentReader() {
        return new UncrawledNewsContentReader(newsRepository);
    }

    /** Side-effecting processor: publishes the raw content to MQ, passes the item through unchanged. */
    @Bean
    @StepScope
    public ItemProcessor<PreSummarizedNewsDto, PreSummarizedNewsDto> summarizeContentProcessor() {
        return dto -> {
            contentProducer.sendMessage(new ContentMessageDto(dto.getId(), dto.getContent()));
            return dto;
        };
    }

    /** Persists the crawled title/content/postDate back onto the existing row. */
    @Bean
    @StepScope
    public JdbcBatchItemWriter<PreSummarizedNewsDto> newsContentWriter() {
        return new JdbcBatchItemWriterBuilder<PreSummarizedNewsDto>()
                .dataSource(dataSource)
                .sql("update news set title = :title, content = :content, post_date = :postDate" +
                        " where id = :id")
                .beanMapped()
                .build();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.rollthedice.backend.batch.newsContentStep;

import lombok.*;

/**
 * Item flowing through the content-crawling step: the crawled title, body
 * text and post date for one news row, keyed by the row's id so the step
 * writer can update it in place.
 */
@Getter
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PreSummarizedNewsDto {
    // Primary key of the news row being updated (bound to :id in the writer SQL).
    private Long id;
    private String title;
    private String content;
    // Post date kept as scraped text — NOTE(review): not parsed into java.time;
    // confirm the post_date column type matches.
    private String postDate;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package com.rollthedice.backend.batch.newsContentStep;

import com.rollthedice.backend.domain.news.entity.News;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;

import java.io.IOException;
import java.util.Iterator;

/**
 * Step-scoped reader that streams news rows whose content has not been
 * crawled yet, fetches each article page with Jsoup, and maps it to a
 * {@link PreSummarizedNewsDto} for the content step.
 */
public class UncrawledNewsContentReader implements ItemReader<PreSummarizedNewsDto> {
    private final NewsRepository newsRepository;
    // Iterator over the current snapshot of rows with NULL content;
    // refreshed on exhaustion (see hasNextUncrawledNews).
    private Iterator<News> uncrawledNewsContents;

    public UncrawledNewsContentReader(NewsRepository newsRepository) {
        this.newsRepository = newsRepository;
        // Initial snapshot taken at construction (i.e. at step start for a step-scoped bean).
        uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator();
    }

    /**
     * Returns the next uncrawled article as a DTO, or {@code null} when no
     * uncrawled rows remain (Spring Batch's end-of-input signal).
     *
     * @throws IOException if the article page cannot be fetched;
     *         NOTE(review): this propagates and fails the whole step on a
     *         single bad URL — consider a skip policy. Jsoup.connect() is
     *         also used without an explicit timeout — confirm defaults.
     */
    @Override
    public PreSummarizedNewsDto read() throws IOException {
        if (!hasNextUncrawledNews()) {
            return null;
        }
        News news = uncrawledNewsContents.next();
        Document doc = Jsoup.connect(news.getUrl()).get();
        return getNewsContent(news, doc);
    }

    // When the snapshot is exhausted, re-query for rows that are still NULL
    // (e.g. rows inserted after the initial snapshot).
    // NOTE(review): if any row's content remains NULL after its chunk commits,
    // this re-query would return it again and the step would never end —
    // verify the writer always sets content for every item read.
    private boolean hasNextUncrawledNews() {
        if (!uncrawledNewsContents.hasNext()) {
            uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator();
        }
        return uncrawledNewsContents.hasNext();
    }

    // Assemble the DTO from the fetched document; scraped fields may be null
    // when neither selector matches.
    private PreSummarizedNewsDto getNewsContent(News news, Document doc) {
        return PreSummarizedNewsDto.builder()
                .id(news.getId())
                .title(scrapTitle(doc))
                .content(scrapContent(doc))
                .postDate(scrapPostDate(doc))
                .build();
    }

    // Title: try the primary layout selector, then the alternate layout;
    // returns null if neither matches.
    private String scrapTitle(final Document doc) {
        Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2");
        if (titleElement == null) {
            titleElement = doc.selectFirst("#content > div.end_ct > div > h2");
        }
        if (titleElement != null) {
            return titleElement.text();
        }
        return null;
    }

    // Body: primary article container, falling back to the alternate id;
    // strips all tags and newlines from the outer HTML. Returns "" (not null)
    // when neither selector matches.
    private String scrapContent(final Document doc) {
        Elements contentElements = doc.select("article#dic_area");
        if (contentElements.isEmpty()) {
            contentElements = doc.select("#articeBody");
        }
        return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", "");
    }

    // Post date: primary layout exposes it as a data attribute, the alternate
    // layout as element text; null if neither matches.
    private String scrapPostDate(final Document doc) {
        Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span");
        if (dateElement != null) {
            return dateElement.attr("data-date-time");
        } else {
            Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em");
            if (altDateElement != null) {
                return altDateElement.text();
            }
        }
        return null;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.rollthedice.backend.batch.newsUrlStep;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

/**
 * Item produced by the URL-collection step: one article URL paired with its
 * category name, inserted as a bare news row by the step's JDBC writer.
 */
@Getter
@NoArgsConstructor
@AllArgsConstructor
@Setter
public class InitNewsDto {
    // Category name — bound to the :newsCategory parameter of the insert SQL.
    private String newsCategory;
    // Article URL — bound to the :url parameter of the insert SQL.
    private String url;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package com.rollthedice.backend.batch.newsUrlStep;

import com.rollthedice.backend.domain.news.entity.NewsCategory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;
import org.springframework.beans.factory.annotation.Value;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Reader that walks every {@link NewsCategory} once, scrapes up to
 * {@code crawling.quantity} article URLs from each category page, and emits
 * them one at a time as {@link InitNewsDto} items.
 */
public class NewsUrlReader implements ItemReader<InitNewsDto> {
    /** Max number of article URLs to collect per category ({@code crawling.quantity}). */
    @Value("${crawling.quantity}")
    private int crawlingQuantity;

    private final Iterator<NewsCategory> categories;
    // Buffer of scraped items; read() drains it one item per call.
    private final Queue<InitNewsDto> initNews = new LinkedList<>();

    public NewsUrlReader() {
        // Each category is visited exactly once per step execution.
        // (Simplified from stream().collect(toList()) — same iteration order.)
        categories = Arrays.asList(NewsCategory.values()).iterator();
    }

    /**
     * Returns the next scraped URL item, refilling the buffer from the next
     * category when empty; {@code null} once all categories are exhausted
     * (Spring Batch's end-of-input signal).
     *
     * @throws IOException if a category page cannot be fetched
     */
    @Override
    public InitNewsDto read() throws IOException {
        while (initNews.isEmpty() && categories.hasNext()) {
            NewsCategory category = categories.next();
            initNews.addAll(scrapCategoryNews(category));
        }
        return initNews.poll();
    }

    // Fetch one category page and collect at most crawlingQuantity URLs,
    // bounded by how many entries the page actually lists.
    private List<InitNewsDto> scrapCategoryNews(NewsCategory category) throws IOException {
        Document doc = Jsoup.connect(category.getCategoryUrl()).get();
        Elements newsList = doc.select(".sa_list").select("li");
        return scrapNewsUrl(Math.min(newsList.size(), crawlingQuantity), newsList, category);
    }

    // Extract the article link from the first `quantity` list entries.
    private List<InitNewsDto> scrapNewsUrl(int quantity, Elements newsList, NewsCategory category) {
        List<InitNewsDto> urls = new ArrayList<>(quantity);
        for (int i = 0; i < quantity; i++) {
            Element news = newsList.get(i);
            String url = Objects.requireNonNull(news.selectFirst(".sa_text_title"),
                    "missing .sa_text_title link in category list").attr("href");
            urls.add(new InitNewsDto(category.getName(), url));
        }
        return urls;
    }
}
Loading

0 comments on commit 709b218

Please sign in to comment.