Commit

feat: Reconfigure retry policy and add inert retry mechanism for failed crawls.

Signed-off-by: Fang Yuan <wojiushifangyuanlove@gmail.com>
767829413 committed Jan 7, 2025
1 parent 50496d8 commit 6f838b3
Showing 6 changed files with 26 additions and 19 deletions.
6 changes: 3 additions & 3 deletions internal/crawler/crawler.go
@@ -30,7 +30,7 @@ func NewNovelCrawler() Crawler {
func (nc *novelCrawler) Search(key string) ([]*model.SearchResult, error) {
conf := config.GetConf()
// Parse
- res, err := parse.NewSearchResultParser(conf.Base.SourceID).Parse(key)
+ res, err := parse.NewSearchResultParser(conf.Base.SourceID).Parse(key, conf.Retry.MaxAttempts)
if err != nil {
return nil, err
}
@@ -40,7 +40,7 @@ func (nc *novelCrawler) Search(key string) ([]*model.SearchResult, error) {
func (nc *novelCrawler) Crawl(res *model.SearchResult, start, end int) (*model.CrawlResult, error) {
conf := config.GetConf()
// Fetch and parse the novel details page
- book, err := parse.NewBookParser(conf.Base.SourceID).Parse(res.Url)
+ book, err := parse.NewBookParser(conf.Base.SourceID).Parse(res.Url, conf.Retry.MaxAttempts)
if err != nil {
return nil, err
}
@@ -55,7 +55,7 @@ func (nc *novelCrawler) Crawl(res *model.SearchResult, start, end int) (*model.C
}
// Get the novel's table of contents
catalogsParser := parse.NewCatalogsParser(conf.Base.SourceID)
- catalogs, err := catalogsParser.Parse(res.Url, start, end)
+ catalogs, err := catalogsParser.Parse(res.Url, start, end, conf.Retry.MaxAttempts)
if err != nil {
return nil, err
}
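
The three hunks above thread conf.Retry.MaxAttempts from the shared configuration into every parser call. For orientation, a minimal sketch of the config shape those calls assume follows; only Base.SourceID and Retry.MaxAttempts appear in this commit, so the surrounding type and function names are illustrative guesses rather than the repository's actual definitions.

package config

// Hypothetical sketch of the configuration consumed by the crawler.
type Config struct {
	Base  Base
	Retry Retry
}

type Base struct {
	SourceID int // selects which book source's parsing rules to use
}

type Retry struct {
	// MaxAttempts is forwarded to each parser as the per-URL retry budget.
	// A value of 0 falls back to retryDefault in internal/parse/common.go.
	MaxAttempts int
}

var conf Config

// GetConf returns the process-wide configuration instance.
func GetConf() *Config {
	return &conf
}
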
4 changes: 2 additions & 2 deletions internal/parse/book.go
@@ -18,9 +18,9 @@ func NewBookParser(sourceID int) *BookParser {
}
}

- func (b *BookParser) Parse(bookUrl string) (*model.Book, error) {
+ func (b *BookParser) Parse(bookUrl string, retry int) (*model.Book, error) {
book := &model.Book{}
- collector := getCollector(nil)
+ collector := getCollector(nil, retry)
// Scrape the book title
collector.OnHTML(b.rule.Book.BookName, func(e *colly.HTMLElement) {
bookName := e.Attr("content")
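
Only the signature and the collector construction of BookParser.Parse change here; the same threading is applied to CatalogsParser.Parse in the next file. As a rough sketch, the rest of the method likely proceeds along these lines once the collector carries the retry budget; the Visit/Wait calls, the BookName field on model.Book, and the final return are assumptions about the surrounding code, not part of this diff.

func (b *BookParser) Parse(bookUrl string, retry int) (*model.Book, error) {
	book := &model.Book{}
	// The collector now inherits the caller-supplied retry budget.
	collector := getCollector(nil, retry)

	// Scrape the book title from the element selected by the source rule,
	// reading it from the element's "content" attribute.
	collector.OnHTML(b.rule.Book.BookName, func(e *colly.HTMLElement) {
		book.BookName = e.Attr("content") // assumed field name
	})

	// Kick off the asynchronous fetch and wait for all callbacks to finish.
	if err := collector.Visit(bookUrl); err != nil {
		return nil, err
	}
	collector.Wait()
	return book, nil
}
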
4 changes: 2 additions & 2 deletions internal/parse/catalogs.go
@@ -20,8 +20,8 @@ func NewCatalogsParser(sourceID int) *CatalogsParser {
}
}

- func (b *CatalogsParser) Parse(bookUrl string, start, end int) ([]*model.Chapter, error) {
- collector := getCollector(nil)
+ func (b *CatalogsParser) Parse(bookUrl string, start, end, retry int) ([]*model.Chapter, error) {
+ collector := getCollector(nil, retry)

var chapters = make(map[string]*model.Chapter)

6 changes: 3 additions & 3 deletions internal/parse/chapter.go
@@ -40,7 +40,7 @@ func (b *ChapterParser) Parse(
var errTemp = "==> Retrying to download failed chapter content: [%s], Attempt: %d/%d, Reason: %s\n"
// Prevent duplicate fetching
if !downOk {
- chapter.Content, err = b.crawl(chapter.URL)
+ chapter.Content, err = b.crawl(chapter.URL, attempt)
if err != nil {
// Attempt retry
fmt.Printf(
@@ -76,12 +76,12 @@ func (b *ChapterParser) Parse(
return nil
}

- func (b *ChapterParser) crawl(url string) (string, error) {
+ func (b *ChapterParser) crawl(url string, retry int) (string, error) {
nextUrl := url
sb := bytes.NewBufferString("")

for {
- collector := getCollector(nil)
+ collector := getCollector(nil, retry)
collector.OnHTML(b.rule.Chapter.Content, func(e *colly.HTMLElement) {
html, err := e.DOM.Html()
if err == nil {
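
ChapterParser.crawl loops because a chapter's content can be split across several pages; each iteration builds a fresh collector with the same retry budget and appends the matched HTML to sb. A rough sketch of that loop shape follows, reusing the file's existing bytes and colly imports; the a.next selector and the loop-exit condition are illustrative assumptions, since only the collector construction and the content callback appear in the diff above.

func (b *ChapterParser) crawl(url string, retry int) (string, error) {
	nextUrl := url
	sb := bytes.NewBufferString("")

	for {
		collector := getCollector(nil, retry)

		// Append the chapter body matched by the source rule.
		collector.OnHTML(b.rule.Chapter.Content, func(e *colly.HTMLElement) {
			if html, err := e.DOM.Html(); err == nil {
				sb.WriteString(html)
			}
		})

		// Hypothetical: capture a "next page" link for chapters that span
		// multiple pages.
		next := ""
		collector.OnHTML("a.next", func(e *colly.HTMLElement) {
			next = e.Request.AbsoluteURL(e.Attr("href"))
		})

		if err := collector.Visit(nextUrl); err != nil {
			return "", err
		}
		collector.Wait()

		if next == "" || next == nextUrl {
			break
		}
		nextUrl = next
	}
	return sb.String(), nil
}
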
13 changes: 8 additions & 5 deletions internal/parse/common.go
@@ -11,13 +11,13 @@ import (
)

const timeoutMillis = 25000
- const retryMax = 10
+ const retryDefault = 3

var urlLock sync.Mutex

var saveErrorUrl = make(map[string]int)

- func getCollector(cookies map[string]string) *colly.Collector {
+ func getCollector(cookies map[string]string, retry int) *colly.Collector {
c := colly.NewCollector(
colly.Async(true),
// Attach a debugger to the collector
@@ -31,6 +31,10 @@ func getCollector(cookies map[string]string) *colly.Collector {
//Delay: 5 * time.Second,
})

+ if retry == 0 {
+ retry = retryDefault
+ }

// Set up error retries
c.OnError(func(r *colly.Response, err error) {
// Add an automatic retry mechanism
@@ -39,13 +43,12 @@
if _, ok := saveErrorUrl[link]; !ok {
saveErrorUrl[link]++
r.Request.Retry()
- } else if saveErrorUrl[link] < retryMax {
+ } else if saveErrorUrl[link] < retry {
saveErrorUrl[link]++
r.Request.Retry()
} else {
- fmt.Println(saveErrorUrl[link], link)
+ fmt.Printf("\nRetry %d Request URL: %s, Error: %v", saveErrorUrl[link], link, err)
}
fmt.Printf("Retry %d Request URL: %s, Error: %v", saveErrorUrl[link], link, err)
urlLock.Unlock()
})

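
This file is the core of the change: getCollector now takes the retry budget per call instead of the fixed retryMax = 10, falls back to retryDefault = 3 when a caller passes 0, and only logs once a URL has exhausted its budget. Because saveErrorUrl is package-global, the failure count for a URL persists across collectors. Below is a self-contained sketch of the same behaviour, assuming github.com/gocolly/colly/v2 and collapsing the first-error and subsequent-error branches into a single check, which is behaviourally equivalent to the two branches above; it mirrors the file but is an illustration, not the file itself.

package main

import (
	"fmt"
	"sync"

	"github.com/gocolly/colly/v2"
)

const retryDefault = 3

var (
	urlLock      sync.Mutex
	saveErrorUrl = make(map[string]int)
)

func newRetryingCollector(retry int) *colly.Collector {
	if retry == 0 {
		retry = retryDefault
	}

	c := colly.NewCollector(colly.Async(true))
	c.OnError(func(r *colly.Response, err error) {
		urlLock.Lock()
		defer urlLock.Unlock()

		link := r.Request.URL.String()
		// Each failure bumps the per-URL counter; the request is re-issued
		// until the counter reaches the configured budget.
		if saveErrorUrl[link] < retry {
			saveErrorUrl[link]++
			r.Request.Retry()
			return
		}
		// Budget exhausted: report once and give up on this URL.
		fmt.Printf("\nRetry %d Request URL: %s, Error: %v", saveErrorUrl[link], link, err)
	})
	return c
}

func main() {
	c := newRetryingCollector(0) // 0 falls back to retryDefault (3)
	_ = c.Visit("https://example.com/")
	c.Wait()
}
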
12 changes: 8 additions & 4 deletions internal/parse/search_result.go
@@ -21,19 +21,21 @@ func NewSearchResultParser(sourceID int) *SearchResultParser {
}
}

- func (p *SearchResultParser) Parse(keyword string) ([]*model.SearchResult, error) {
+ func (p *SearchResultParser) Parse(keyword string, retry int) ([]*model.SearchResult, error) {
search := p.rule.Search
isPaging := search.Pagination

collector := getCollector(
p.rule.Search.Cookies,
+ retry,
)

firstPageResults, err := p.getSearchResults(
collector,
p.rule.Search.URL,
utils.BuildMethod(p.rule.Search.Method),
keyword,
+ retry,
)
if err != nil {
return nil, err
@@ -62,18 +64,18 @@ func (p *SearchResultParser) Parse(keyword string) ([]*model.SearchResult, error

for url := range urls {
wg.Add(1)
- go func(url string) {
+ go func(url string, retry int) {
defer wg.Done()
semaphore <- struct{}{}
defer func() { <-semaphore }()

- results, err := p.getSearchResults(nil, url, http.MethodGet, "")
+ results, err := p.getSearchResults(nil, url, http.MethodGet, "", retry)
if err != nil {
errorChan <- err
return
}
resultChan <- results
- }(url)
+ }(url, retry)
}

go func() {
@@ -99,10 +101,12 @@ func (p *SearchResultParser) getSearchResults(
collector *colly.Collector,
url, method string,
keyword string,
+ retry int,
) ([]*model.SearchResult, error) {
if collector == nil {
collector = getCollector(
p.rule.Search.Cookies,
+ retry,
)
}
var results []*model.SearchResult
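
When the source paginates search results, Parse fans out one goroutine per extra page, bounded by a buffered-channel semaphore, and now hands each worker the retry budget explicitly as a parameter rather than capturing it. A self-contained sketch of that pattern follows, with fetch, maxConcurrent, and the string result type standing in for getSearchResults, the real concurrency bound, and []*model.SearchResult.

package main

import (
	"fmt"
	"sync"
)

// fetch stands in for p.getSearchResults(nil, url, http.MethodGet, "", retry).
func fetch(url string, retry int) ([]string, error) {
	_ = retry // the real implementation forwards this to getCollector
	return []string{url}, nil
}

func fanOut(urls []string, retry int) ([]string, error) {
	const maxConcurrent = 5 // assumed bound
	semaphore := make(chan struct{}, maxConcurrent)
	resultChan := make(chan []string, len(urls))
	errorChan := make(chan error, len(urls))

	var wg sync.WaitGroup
	for _, url := range urls {
		wg.Add(1)
		// Passing url and retry as arguments keeps each worker independent
		// of the loop variables.
		go func(url string, retry int) {
			defer wg.Done()
			semaphore <- struct{}{}        // acquire a slot
			defer func() { <-semaphore }() // release it

			results, err := fetch(url, retry)
			if err != nil {
				errorChan <- err
				return
			}
			resultChan <- results
		}(url, retry)
	}

	wg.Wait()
	close(resultChan)
	close(errorChan)

	// A closed, empty error channel yields nil, so this surfaces the first
	// worker error if there was one.
	if err := <-errorChan; err != nil {
		return nil, err
	}
	var all []string
	for rs := range resultChan {
		all = append(all, rs...)
	}
	return all, nil
}

func main() {
	out, err := fanOut([]string{"/search?page=2", "/search?page=3"}, 3)
	fmt.Println(out, err)
}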
