From 24e3098c328d3aa47ada49569cde27bde3db7e65 Mon Sep 17 00:00:00 2001 From: Fang Yuan Date: Thu, 9 Jan 2025 21:11:42 +0800 Subject: [PATCH] fix: Fix Book Source 3 not available Signed-off-by: Fang Yuan --- frontend/src/components/ViewConfig.tsx | 2 +- frontend/src/locales/en.json | 3 +- frontend/src/locales/zh.json | 5 +- internal/config/default_config.yaml | 2 +- internal/crawler/crawler.go | 2 +- internal/model/rule.go | 10 ++- internal/parse/catalogs.go | 9 ++- internal/parse/chapter.go | 3 + internal/parse/common.go | 4 +- internal/source/rule/rule1.json | 8 ++- internal/source/rule/rule2.json | 83 ++++++++++++----------- internal/source/rule/rule3.json | 3 +- internal/tools/concurrency/concurrency.go | 6 +- pkg/utils/utils.go | 39 +++++++++++ 14 files changed, 124 insertions(+), 55 deletions(-) diff --git a/frontend/src/components/ViewConfig.tsx b/frontend/src/components/ViewConfig.tsx index b07dbac..c217a33 100644 --- a/frontend/src/components/ViewConfig.tsx +++ b/frontend/src/components/ViewConfig.tsx @@ -86,7 +86,7 @@ const ViewConfig: React.FC = () => { options={[ { value: 1, label: '1' }, { value: 2, label: '2' }, - { value: 3, label: '3' }, + { value: 3, label: '3', tooltip: t('viewConfig.slowSource') }, ]} style={{ width: '100%' }} /> diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json index 154e6ac..101e1a1 100644 --- a/frontend/src/locales/en.json +++ b/frontend/src/locales/en.json @@ -37,7 +37,8 @@ "chatbotSettings": "Chatbot Settings", "chatbotSupportInfo": "Note: The current configuration only supports Ollama, switching models may result in a download of the", "model": "model", - "modelTooltip": "Ollama supports a list of models available on https://ollama.com/library" + "modelTooltip": "Ollama supports a list of models available on https://ollama.com/library", + "slowSource": "There is a limitation on the current source of books, the download will be very slow" }, "checkUpdate": { "title": "Check Update", diff --git a/frontend/src/locales/zh.json b/frontend/src/locales/zh.json index 77fe104..562c38d 100644 --- a/frontend/src/locales/zh.json +++ b/frontend/src/locales/zh.json @@ -23,7 +23,7 @@ "crawlSettings": "爬取设置", "retrySettings": "重试设置", "enableBookSource": "启用书源", - "bookSourceTooltip": "只能选一个, 当前可选值:1、2、3,建议使用默认书源 3", + "bookSourceTooltip": "只能选一个, 当前可选值:1、2、3,建议不使用书源 3,有限速和防爬", "downloadPath": "下载路径", "downloadPathTooltip": "绝对相对均可 (Windows 路径分隔符不要用 \\ , 用 / 或 \\)", "fileExtension": "文件扩展名", @@ -37,7 +37,8 @@ "chatbotSettings": "聊天机器人设置", "chatbotSupportInfo": "注意:当前配置仅支持 Ollama,切换模型可能会导致下载", "model": "模型", - "modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表" + "modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表", + "slowSource": "当前书源有限制,下载会很慢" }, "checkUpdate": { "title": "检查更新", diff --git a/internal/config/default_config.yaml b/internal/config/default_config.yaml index 0e740fc..cca44dc 100644 --- a/internal/config/default_config.yaml +++ b/internal/config/default_config.yaml @@ -1,6 +1,6 @@ base: # 启用书源 (只能选一个, 当前可选值:1、2、3) - source-id: 3 + source-id: 1 # 下载路径, 绝对相对均可 (Windows 路径分隔符不要用 \ , 用 / 或 \) download-path: "downloads" # 文件扩展名, 支持 txt, epub, 推荐 epub diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 20dcc94..2f4e49b 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -67,7 +67,7 @@ func (nc *novelCrawler) Crawl(res *model.SearchResult, start, end int) (*model.C // Parse and download content // Limit concurrent processing var wg sync.WaitGroup - threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads) + threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads, conf.Base.SourceID) semaphore := make(chan struct{}, threads) var nowCatalogsCount = int32(0) // Total completed tasks = number of chapters fetched + 1 (merging task) diff --git a/internal/model/rule.go b/internal/model/rule.go index 7252af5..661683f 100644 --- a/internal/model/rule.go +++ b/internal/model/rule.go @@ -10,6 +10,7 @@ type Rule struct { Search search `json:"search"` Book book `json:"book"` Chapter chapter `json:"chapter"` + Catalog catalog `json:"catalog"` } // Search represents the search rules @@ -39,10 +40,17 @@ type book struct { LatestChapter string `json:"latestChapter"` LatestUpdate string `json:"latestUpdate"` IsEnd string `json:"isEnd"` - Catalog string `json:"catalog"` CatalogOffset int `json:"catalogOffset"` } +type catalog struct { + URL string `json:"url"` + Result string `json:"result"` + Pagination bool `json:"pagination"` + NextPage string `json:"nextPage"` + Offset int `json:"offset"` +} + // Chapter represents the chapter rules type chapter struct { URL string `json:"url"` diff --git a/internal/parse/catalogs.go b/internal/parse/catalogs.go index 3cf04d6..5e2f565 100644 --- a/internal/parse/catalogs.go +++ b/internal/parse/catalogs.go @@ -1,11 +1,13 @@ package parse import ( + "fmt" "sort" "fy-novel/internal/model" "fy-novel/internal/source" "fy-novel/pkg/utils" + "github.com/gocolly/colly/v2" // "github.com/gocolly/colly/v2/debug" ) @@ -25,7 +27,12 @@ func (b *CatalogsParser) Parse(bookUrl string, start, end, retry int) ([]*model. var chapters = make(map[string]*model.Chapter) - collector.OnHTML(b.rule.Book.Catalog, func(e *colly.HTMLElement) { + if len(b.rule.Catalog.URL) > 0 { + id := utils.GetGroup1(b.rule.Book.URL, bookUrl) + bookUrl = fmt.Sprintf(b.rule.Catalog.URL, id) + } + + collector.OnHTML(b.rule.Catalog.Result, func(e *colly.HTMLElement) { chapter := &model.Chapter{ Title: e.Text, URL: utils.NormalizeURL(e.Attr("href"), b.rule.URL), diff --git a/internal/parse/chapter.go b/internal/parse/chapter.go index 96af3b6..3b327b8 100644 --- a/internal/parse/chapter.go +++ b/internal/parse/chapter.go @@ -33,6 +33,9 @@ func (b *ChapterParser) Parse( downOk := false attemptStart := 1 attempt := conf.Retry.MaxAttempts + if conf.Base.SourceID == 3 { + attempt = 0 + } // Fetch content utils.SpinWaitMaxRetryAttempts( func() bool { diff --git a/internal/parse/common.go b/internal/parse/common.go index 75f1ec6..89e9ed8 100644 --- a/internal/parse/common.go +++ b/internal/parse/common.go @@ -11,7 +11,8 @@ import ( ) const timeoutMillis = 25000 -const retryDefault = 3 +const retryDefault = 10 +const sleepSecond = 1 * time.Second var urlLock sync.Mutex @@ -44,6 +45,7 @@ func getCollector(cookies map[string]string, retry int) *colly.Collector { saveErrorUrl[link]++ r.Request.Retry() } else if saveErrorUrl[link] < retry { + time.Sleep(sleepSecond * time.Duration(retry)) saveErrorUrl[link]++ r.Request.Retry() } else { diff --git a/internal/source/rule/rule1.json b/internal/source/rule/rule1.json index 3f93072..77d3101 100644 --- a/internal/source/rule/rule1.json +++ b/internal/source/rule/rule1.json @@ -28,9 +28,11 @@ "coverUrl": "#fmimg > img", "latestChapter": "", "latestUpdate": "", - "isEnd": "", - "catalog": "#list > dl > dd > a", - "catalogOffset": 12 + "isEnd": "" + }, + "catalog": { + "result": "#list > dl > dd > a", + "offset": 12 }, "chapter": { "url": "http://www.mcmssc.la/%s/%s.html", diff --git a/internal/source/rule/rule2.json b/internal/source/rule/rule2.json index 0959d86..94d2ffc 100644 --- a/internal/source/rule/rule2.json +++ b/internal/source/rule/rule2.json @@ -1,42 +1,45 @@ { - "id": "2", - "url": "http://www.99xs.info/", - "name": "鸟书网", - "comment": "书很全,有15w+;无反爬", - "type": "html", - "search": { - "url": "http://www.99xs.info/read/search/", - "method": "post", - "body": {"kw": "searchkey"}, - "cookies": {}, - "pagination": false, - "result": "div.wrap > div > div > div", - "bookName": "div.bookinfo > h4 > a", - "author": "div.bookinfo > div.author", - "latestChapter": "div.bookinfo > div.update > a" + "id": "2", + "url": "http://www.99xs.info/", + "name": "鸟书网", + "comment": "书很全,有15w+;无反爬", + "type": "html", + "search": { + "url": "http://www.99xs.info/read/search/", + "method": "post", + "body": { + "kw": "searchkey" }, - "book": { - "url": "http://www.99xs.info/tag/%s/", - "bookName": "meta[property=\"og:novel:book_name\"]", - "author": "meta[property=\"og:novel:author\"]", - "intro": "meta[property=\"og:description\"]", - "category": "meta[property=\"og:novel:category\"]", - "coverUrl": "div.cover > img", - "latestChapter": "", - "latestUpdate": "", - "isEnd": "", - "catalog": "div.listmain > dl > dd > a", - "catalogOffset": 12 - }, - "chapter": { - "url": "http://www.99xs.info/tag/%s/%s.html", - "pagination": false, - "chapterNo": 0, - "title": "", - "content": "#content", - "paragraphTagClosed": false, - "paragraphTag": "

", - "filterTxt": "请记住本书首发域名:.+。鸟书网手机版阅读网址:.+|7017k", - "filterTag": "" - } - } \ No newline at end of file + "cookies": {}, + "pagination": false, + "result": "div.wrap > div > div > div", + "bookName": "div.bookinfo > h4 > a", + "author": "div.bookinfo > div.author", + "latestChapter": "div.bookinfo > div.update > a" + }, + "book": { + "url": "http://www.99xs.info/tag/%s/", + "bookName": "meta[property=\"og:novel:book_name\"]", + "author": "meta[property=\"og:novel:author\"]", + "intro": "meta[property=\"og:description\"]", + "category": "meta[property=\"og:novel:category\"]", + "coverUrl": "div.cover > img", + "latestChapter": "", + "latestUpdate": "", + "isEnd": "" + }, + "catalog": { + "result": "div.listmain > dl > dd > a", + "offset": 12 + }, + "chapter": { + "url": "http://www.99xs.info/tag/%s/%s.html", + "pagination": false, + "title": "", + "content": "#content", + "paragraphTagClosed": false, + "paragraphTag": "

", + "filterTxt": "请记住本书首发域名:.+。鸟书网手机版阅读网址:.+|7017k", + "filterTag": "" + } +} \ No newline at end of file diff --git a/internal/source/rule/rule3.json b/internal/source/rule/rule3.json index 2598a83..c49e796 100644 --- a/internal/source/rule/rule3.json +++ b/internal/source/rule/rule3.json @@ -2,9 +2,8 @@ "id": "3", "url": "https://69shux.co/", "name": "69书吧", - "comment": "无五秒盾、爬取章节有限流、需要梯子,美国节点🆗、数量5w+", + "comment": "需要梯子、无五秒盾、章节有限流、数量5w+", "type": "html", - "useProxy": true, "language": "zh-Hant", "search": { "url": "https://69shux.co/search", diff --git a/internal/tools/concurrency/concurrency.go b/internal/tools/concurrency/concurrency.go index 783c5c3..ee781e5 100644 --- a/internal/tools/concurrency/concurrency.go +++ b/internal/tools/concurrency/concurrency.go @@ -12,7 +12,7 @@ const ( var once sync.Once var defaultConcurrency int -func GetConcurrencyNum(target int) int { +func GetConcurrencyNum(target, sourceID int) int { once.Do(func() { defaultConcurrency = runtime.NumCPU() }) @@ -22,5 +22,9 @@ func GetConcurrencyNum(target int) int { if target >= maxThreads { target = maxThreads } + // 书源3有防爬机制,不能并发太高 + if sourceID == 3 { + return 1 + } return target } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 54cb67b..9dfc06e 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -5,6 +5,7 @@ import ( "fmt" "hash/fnv" "os" + "regexp" "time" "unicode" ) @@ -133,3 +134,41 @@ func StringToUniqueHash(s string) uint64 { h.Write([]byte(s)) return h.Sum64() } + +// GetGroup0 获得匹配的字符串,获得正则中分组0的内容 +// +// regex: 匹配的正则 +// content: 被匹配的内容 +// 返回值: 匹配后得到的字符串,未匹配返回空字符串 +func GetGroup0(regex string, content string) string { + return Get(regex, content, 0) +} + +// GetGroup1 获得匹配的字符串,获得正则中分组1的内容 +// +// regex: 匹配的正则 +// content: 被匹配的内容 +// 返回值: 匹配后得到的字符串,未匹配返回空字符串 +func GetGroup1(regex string, content string) string { + return Get(regex, content, 1) +} + +// Get 获得匹配的字符串 +// +// regex: 匹配的正则 +// content: 被匹配的内容 +// groupIndex: 匹配正则的分组序号 +// 返回值: 匹配后得到的字符串,未匹配返回空字符串 +func Get(regex string, content string, groupIndex int) string { + re, err := regexp.Compile(regex) + if err != nil { + return "" + } + + matches := re.FindStringSubmatch(content) + if len(matches) > groupIndex { + return matches[groupIndex] + } + + return "" +}